In [None]:
import timeit
import pandas as pd
import numpy as np
from scipy.stats import entropy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.pipeline import Pipeline
import os
from sklearn.base import BaseEstimator, TransformerMixin
from wordcloud import WordCloud

import string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

from nltk.stem import SnowballStemmer, WordNetLemmatizer

# Clean and tokenize text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords') # download stopwords corpus
nltk.download('punkt') # download punkt tokenizer

# For linear regression
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [None]:
# Reaction-count columns the entropy is computed over. "like" is not a raw
# input column: the pipeline at the bottom of this file builds it by summing
# "likes" and "loves" (see the ColumnAdder step).
reaction_columns = ["like", "wow", "cares", "sad", "angry", "haha"]
# Name of the column passed to EntropyCalculator as the total reaction count.
total_reactions = "reactions_count"

class EntropyCalculator(BaseEstimator, TransformerMixin):
    """Add an ``entropy`` column with the Shannon entropy of each row's reactions.

    For every row the reaction counts are smoothed with a tiny constant,
    divided by the row's total reaction count and passed to
    ``scipy.stats.entropy`` (natural logarithm; SciPy re-normalizes the
    values so they sum to one).

    Parameters
    ----------
    reaction_columns : list of str
        Columns holding the per-reaction counts.
    total_reactions : str
        Column holding the total reaction count used as the denominator.
    """

    # Smoothing term added to every count so all-zero rows do not divide by zero.
    _EPS = 1e-8

    def __init__(self, reaction_columns, total_reactions):
        self.reaction_columns = reaction_columns
        self.total_reactions = total_reactions

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with an added float ``entropy`` column."""
        X_transformed = X.copy()
        columns = list(self.reaction_columns)

        # Zero-row frames get an empty float column instead of going through SciPy.
        if len(X_transformed) == 0:
            X_transformed["entropy"] = np.array([], dtype=float)
            return X_transformed

        # Work on whole arrays at once rather than calling a Python function per row.
        counts = X_transformed[columns].to_numpy(dtype=float)
        totals = X_transformed[self.total_reactions].to_numpy(dtype=float)

        # Same formula as the per-row version: (count + eps) / (total + k * eps).
        probabilities = (counts + self._EPS) / (
            totals[:, np.newaxis] + len(columns) * self._EPS
        )

        # axis=1 -> one entropy value per row.
        X_transformed["entropy"] = entropy(probabilities, axis=1)
        return X_transformed


# Custom transformer for descriptive statistics
class DescriptiveStatistics(BaseEstimator, TransformerMixin):
    """Pass-through pipeline step that prints ``X.describe()``."""

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Print the summary statistics of ``X`` and hand ``X`` back untouched."""
        summary = X.describe()
        print(summary)
        return X

# Custom transformer for plotting descriptive statistics
class DescriptiveStatsPlotter(BaseEstimator, TransformerMixin):
    """Pass-through step that plots every column of ``X``.

    Draws, in this order: a histogram (with KDE and a vertical mean line) for
    each numeric column, a boxplot for each numeric column, and a countplot
    for each column with categorical dtype. ``X`` is returned unchanged.

    Parameters
    ----------
    figsize : tuple of (float, float), default (10, 8)
        Size of every figure that is created.
    """

    def __init__(self, figsize=(10, 8)):
        self.figsize = figsize

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Show the plots for ``X`` and return ``X`` unchanged."""
        # describe() selects the numeric columns by default; the dtype check
        # covers its fallback to non-numeric columns when no numeric ones exist.
        numeric_cols = [
            col for col in X.describe().columns
            if pd.api.types.is_numeric_dtype(X[col])
        ]
        # Categorical columns never appear in the default describe() output,
        # so they have to be looked up on the frame itself.
        categorical_cols = [
            col for col in X.columns
            if isinstance(X[col].dtype, pd.CategoricalDtype)
        ]

        for col in numeric_cols:
            self._plot_histogram(X, col)
        for col in numeric_cols:
            self._plot_boxplot(X, col)
        for col in categorical_cols:
            self._plot_countplot(X, col)

        return X

    def _plot_histogram(self, X, col):
        """Histogram of one numeric column with its mean marked."""
        label = str(col).capitalize()
        fig, ax = plt.subplots(figsize=self.figsize)
        sns.histplot(data=X, x=col, kde=True, color="skyblue", alpha=0.8, ax=ax)
        # The mean is a position on the value (x) axis, hence a vertical line.
        ax.axvline(X[col].mean(), color='red', linestyle='--', label='Average')
        ax.legend()
        ax.set_xlabel(label)
        ax.set_ylabel("Frequency")
        ax.set_title(f"Histogram of {label}")
        plt.show()

    def _plot_boxplot(self, X, col):
        """Boxplot of one numeric column."""
        label = str(col).capitalize()
        fig, ax = plt.subplots(figsize=self.figsize)
        sns.boxplot(data=X, x=col, color="skyblue", width=0.5, fliersize=3, ax=ax)
        ax.set_xlabel(label)
        ax.set_ylabel("Value")
        ax.set_title(f"Boxplot of {label}")
        plt.show()

    def _plot_countplot(self, X, col):
        """Countplot of one categorical column."""
        label = str(col).capitalize()
        fig, ax = plt.subplots(figsize=self.figsize)
        sns.countplot(data=X, x=col, color="skyblue", ax=ax)
        ax.set_xlabel(label)
        ax.set_ylabel("Count")
        ax.set_title(f"Countplot of {label}")
        plt.show()
    
# Transformer for length analysis
class LengthAnalysisTransformer(BaseEstimator, TransformerMixin):
    """Add simple length features derived from the ``content`` column.

    The columns ``word_count``, ``char_count``, ``sentence_count``,
    ``avg_word_length`` and ``avg_sentence_length`` are written onto ``X``
    itself, which is then returned.
    """

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Attach the length columns to ``X`` in place and return it."""
        # Split every post on single spaces once; both word-level features reuse it.
        words_per_post = X["content"].apply(lambda raw: str(raw).split(" "))

        X['word_count'] = words_per_post.apply(len)
        # Characters are counted per word, so the separating spaces are excluded.
        X['char_count'] = words_per_post.apply(
            lambda words: sum(map(len, words))
        )
        # Number of "."-separated pieces, i.e. number of periods plus one.
        X['sentence_count'] = X["content"].apply(
            lambda raw: str(raw).count(".") + 1
        )

        X['avg_word_length'] = X['char_count'] / X['word_count']
        X['avg_sentence_length'] = X['word_count'] / X['sentence_count']
        return X

class PreprocessTransformer(BaseEstimator, TransformerMixin):
    """Keep only rows with low enough entropy and enough reactions.

    A row is kept when its entropy is strictly below ``max_entropy`` and its
    reaction count is strictly above ``min_reactions``. Rows whose value is
    missing in either column fail the comparison and are dropped.

    Parameters
    ----------
    max_entropy : float, default 1.3
        Exclusive upper bound on the entropy score.
    min_reactions : int, default 30
        Exclusive lower bound on the total reaction count.
    entropy_column : str, default 'entropy'
        Column holding the entropy score.
    reactions_column : str, default 'reactions_count'
        Column holding the total reaction count.
    """

    def __init__(self, max_entropy=1.3, min_reactions=30,
                 entropy_column='entropy', reactions_column='reactions_count'):
        self.max_entropy = max_entropy
        self.min_reactions = min_reactions
        self.entropy_column = entropy_column
        self.reactions_column = reactions_column

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a new DataFrame containing only the rows that pass both filters."""
        keep = (
            (X[self.entropy_column] < self.max_entropy)
            & (X[self.reactions_column] > self.min_reactions)
        )
        # Return an independent copy so later steps can add columns or drop
        # rows without triggering pandas' chained-assignment warnings.
        return X.loc[keep].copy()

class ColumnAdder(BaseEstimator, TransformerMixin):
    """Replace two columns by their element-wise sum.

    Parameters
    ----------
    col1, col2 : str
        Names of the columns to add together; both are removed from the output.
    new_col_name : str
        Name of the column that receives ``X[col1] + X[col2]``.
    """

    def __init__(self, col1, col2, new_col_name):
        self.col1 = col1
        self.col2 = col2
        self.new_col_name = new_col_name

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a new DataFrame with the summed column; ``X`` is left untouched."""
        combined = X[self.col1] + X[self.col2]

        # Drop the sources first and assign afterwards: the result then survives
        # even when new_col_name equals col1 or col2, and the caller's frame is
        # not modified (drop returns a new DataFrame).
        result = X.drop(columns=[self.col1, self.col2])
        result[self.new_col_name] = combined

        return result

class SourceAppender(BaseEstimator, TransformerMixin):
    """Append each row's ``name`` value to its ``content`` text."""

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` whose ``content`` has the source name appended."""
        X = X.copy()
        # A row-wise apply on a zero-row frame yields a DataFrame rather than a
        # Series, which cannot be assigned to a single column.
        if len(X) == 0:
            return X
        X['content'] = X.apply(self.append_source_name, axis=1)
        return X

    def append_source_name(self, df):
        """Build the new ``content`` value for a single row.

        Parameters
        ----------
        df : pandas.Series
            One row of the frame (the parameter name is kept for backward
            compatibility); must provide ``content`` and ``name``.

        Returns
        -------
        str or float
            ``"<content> <name>"``; the content alone when ``name`` is
            missing; ``np.nan`` when ``content`` is missing.
        """
        content = df['content']
        if pd.isnull(content):
            return np.nan

        source = df['name']
        # A missing name would otherwise raise TypeError on string concatenation.
        if pd.isnull(source):
            return str(content)

        return f"{content} {source}"

    
 # Fetch the NLTK stopwords corpus; the Dutch stop word list used by the transformers below is read from it.
nltk.download('stopwords')

# Custom transformer for generating word clouds
class WordCloudGenerator(BaseEstimator, TransformerMixin):
    """Pass-through step that shows word clouds for two entropy ranges.

    The ``content`` of all posts whose ``entropy`` lies inside a range is
    concatenated, Dutch stop words are removed, and one word cloud per range
    is drawn side by side. ``X`` is returned unchanged.

    Parameters
    ----------
    low_range : tuple of (float, float), default (0.1, 0.7)
        Entropy bounds of the left (blue) word cloud.
    high_range : tuple of (float, float), default (0.7, 1.2)
        Entropy bounds of the right (red) word cloud.

    Notes
    -----
    Both bounds are inclusive (``Series.between`` default), so a post sitting
    exactly on a shared boundary contributes to both clouds.
    """

    def __init__(self, low_range=(0.1, 0.7), high_range=(0.7, 1.2)):
        self.low_range = low_range
        self.high_range = high_range

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Draw both word clouds and return ``X`` unchanged."""
        stopwords_nl = set(stopwords.words('dutch'))
        panels = ((self.low_range, 'Blues'), (self.high_range, 'Reds'))

        plt.figure(figsize=(10, 5))
        for position, (bounds, colormap) in enumerate(panels, start=1):
            lower, upper = bounds
            text = self._collect_text(X, lower, upper, stopwords_nl)

            plt.subplot(1, 2, position)
            self._draw_cloud(text, colormap)
            # The title is derived from the bounds actually used for filtering,
            # so the label can never drift away from the data shown.
            plt.title(f'Entropy score between {lower} - {upper}')
            plt.axis('off')

        plt.tight_layout()
        plt.show()

        # Return the input data unmodified
        return X

    @staticmethod
    def _collect_text(X, lower, upper, stop_words):
        """Join the content of all posts in the entropy range, minus stop words."""
        selected = X.loc[X['entropy'].between(lower, upper), 'content']
        # Missing content is skipped; astype(str) keeps the .str accessor
        # usable when the selection is empty or not of string dtype.
        joined = selected.dropna().astype(str).str.cat(sep=' ')
        return ' '.join(
            word for word in joined.split() if word.lower() not in stop_words
        )

    @staticmethod
    def _draw_cloud(text, colormap):
        """Render ``text`` as a word cloud on the current axes, or a placeholder."""
        cloud = None
        if text:
            try:
                cloud = WordCloud(
                    background_color='white', colormap=colormap,
                    width=800, height=400,
                ).generate(text)
            except ValueError:
                # WordCloud raises ValueError when no plottable word is left;
                # this is a diagnostic plot, so show a placeholder instead.
                cloud = None

        if cloud is None:
            plt.text(0.5, 0.5, 'No text in this range', ha='center', va='center')
        else:
            plt.imshow(cloud, interpolation='bilinear')

# Custom transformer for analyzing entropy
class EntropyAnalyzer(BaseEstimator, TransformerMixin):
    """Pass-through step that prints the five highest- and lowest-entropy posts."""

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Print entropy and content of the extreme posts; return ``X`` as is."""
        # Five posts from each end of the entropy ordering.
        top_five = X.sort_values('entropy', ascending=False).head(5)
        bottom_five = X.sort_values('entropy', ascending=True).head(5)

        sections = (
            ("Texts of news posts with highest entropy:", top_five),
            ("Texts of news posts with lowest entropy:", bottom_five),
        )
        for heading, posts in sections:
            print(heading)
            for _, post in posts.iterrows():
                print(post['entropy'], post['content'])
                print()  # blank line between posts

        # The data itself is not modified.
        return X
    
def combine_csv_files(folder_path):
    """Read every ``*.csv`` file in a folder and stack them into one DataFrame.

    Files are read in alphabetical order so the row order of the result is
    reproducible (``os.listdir`` makes no ordering guarantee). The index of
    the result is renumbered from 0.

    Parameters
    ----------
    folder_path : str
        Directory containing the CSV files; sub-directories are not searched.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of all CSV files.

    Raises
    ------
    ValueError
        If the folder contains no file whose name ends in ``.csv``.
    """
    csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))
    if not csv_files:
        # pd.concat would also raise ValueError here, but without naming the folder.
        raise ValueError(f"No CSV files found in {folder_path!r}")

    frames = [pd.read_csv(os.path.join(folder_path, name)) for name in csv_files]
    return pd.concat(frames, ignore_index=True)

class CSVCombiner(BaseEstimator, TransformerMixin):
    """Pipeline entry step that loads all CSV files from ``folder_path``.

    Parameters
    ----------
    folder_path : str
        Folder handed to ``combine_csv_files``.
    """

    def __init__(self, folder_path):
        self.folder_path = folder_path

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return the combined CSV data; the incoming ``X`` is not used."""
        return combine_csv_files(self.folder_path)
    
class CSVWriter(BaseEstimator, TransformerMixin):
    """Pass-through step that saves the frame to ``file_path`` as CSV.

    Parameters
    ----------
    file_path : str
        Destination of the CSV file.
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Write ``X`` without its index column, then return it unchanged."""
        destination = self.file_path
        X.to_csv(destination, index=False)
        return X
    
# Custom transformer for grouping and plotting entropy scores by news source
class EntropyScorePlotter(BaseEstimator, TransformerMixin):
    """Pass-through step that shows one entropy histogram per ``name`` group."""

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Plot the entropy distribution of every news source; return ``X``."""
        for source, posts in X.groupby('name'):
            self._show_histogram(source, posts['entropy'])
        return X

    @staticmethod
    def _show_histogram(source, scores):
        """Draw and show a 10-bin histogram of ``scores`` titled with ``source``."""
        plt.figure(figsize=(8, 6))
        plt.hist(scores, bins=10, edgecolor='black', alpha=0.75)
        plt.title(f'{source}')
        plt.xlabel('Entropy Score')
        plt.ylabel('Frequency')
        plt.xticks(rotation=45)
        plt.show()
    
# Transformer for reaction analysis    
class ReactionAnalysisTransformer(BaseEstimator, TransformerMixin):
    """Pass-through step that prints reaction totals and their shares.

    Printed, in order: the summed reactions per ``name`` (with a ``total``
    column), the overall total per reaction type, the column sums of the
    per-source table, and each reaction type's percentage of all reactions.
    ``X`` is returned unchanged.
    """

    # (column name, plural label used in the "Total ...:" lines)
    _REACTIONS = (
        ('like', 'Likes'),
        ('wow', 'Wows'),
        ('cares', 'Cares'),
        ('sad', 'Sads'),
        ('angry', 'Angrys'),
        ('haha', 'Hahas'),
    )

    def fit(self, X, y=None):
        """No-op; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Print the reaction report for ``X`` and return ``X`` unchanged."""
        columns = [column for column, _ in self._REACTIONS]

        # Sum every reaction type per news source and add a per-source total.
        reaction_distribution = X.groupby('name')[columns].sum()
        reaction_distribution['total'] = reaction_distribution.sum(axis=1)

        print("Reaction Distribution per News Source:")
        print(reaction_distribution)
        print()

        # Overall count per reaction type.
        for column, plural in self._REACTIONS:
            print(f'Total {plural}:', reaction_distribution[column].sum())
        print()

        # Column sums across all sources (includes the 'total' column).
        total_reactions = reaction_distribution.sum()
        print("Total Reactions:")
        print(total_reactions)
        print()

        # Share of each reaction type in all reactions; the grand total is
        # computed once instead of once per printed line.
        grand_total = reaction_distribution['total'].sum()
        for column in columns:
            print(f'{column.capitalize()} Percentage:',
                  (total_reactions[column] / grand_total) * 100)

        # Return the input data unmodified
        return X

class TextCleaner(BaseEstimator, TransformerMixin):
    """Create a cleaned ``prep_content`` column from ``content``.

    Steps, applied in order to every non-missing ``content`` value:
    optional removal of punctuation and digits, lowercasing, tokenization,
    Dutch stop word removal, optional stemming, optional lemmatization, and
    joining the tokens back into one space-separated string.

    Parameters
    ----------
    stem : bool, default True
        Apply the Dutch Snowball stemmer to every token.
    lemma : bool, default True
        Apply ``WordNetLemmatizer`` to every token.
        NOTE(review): WordNet is an English resource and the 'wordnet' corpus
        is not among the nltk.download calls visible in this file - confirm
        it is available and that lemmatizing Dutch stems is intended.
    remove_special : bool, default True
        Strip ``string.punctuation`` and ``string.digits`` before tokenizing.
    """

    # Built once: deletes every ASCII punctuation character and digit.
    _STRIP_TABLE = str.maketrans("", "", string.punctuation + string.digits)

    def __init__(self, stem=True, lemma=True, remove_special=True):
        self.stem = stem
        self.lemma = lemma
        self.remove_special = remove_special
        self.stop_words = set(stopwords.words('dutch'))
        self.stemmer = SnowballStemmer('dutch')
        self.lemmatizer = WordNetLemmatizer()

    def fit(self, X, y=None):
        """No-op; the transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` without missing-content rows, plus ``prep_content``."""
        # Work on a fresh frame: the input may be a filtered slice, and the
        # caller's data should not lose rows as a side effect.
        X = X.dropna(subset=['content']).copy()

        texts = X['content'].astype(str)

        # Remove special characters and numbers
        if self.remove_special:
            texts = texts.apply(self._remove_special_chars_numbers)

        # Each later step consumes the previous step's output; re-reading the
        # raw 'content' column here would throw the earlier cleaning away.
        X['prep_content'] = texts.apply(self._clean_text)

        return X

    def _clean_text(self, text):
        """Lowercase, tokenize, filter and normalize one text; return a string."""
        # Lowercase first so capitalized stop words are matched as well.
        tokens = word_tokenize(text.lower())
        tokens = [token for token in tokens if token not in self.stop_words]

        if self.stem:
            tokens = [self.stemmer.stem(token) for token in tokens]

        if self.lemma:
            tokens = [self.lemmatizer.lemmatize(token) for token in tokens]

        return " ".join(tokens)

    def _remove_special_chars_numbers(self, text):
        """Return ``text`` with all ASCII punctuation and digits removed."""
        return text.translate(self._STRIP_TABLE)

    
# Build the end-to-end pipeline: load the CSVs, derive the entropy and length
# features, print/plot exploratory reports, filter rows, clean the text and
# save the result. The reporting steps return the data unchanged.
pipeline = Pipeline([
    # Loads every CSV in the folder and ignores the frame passed to
    # fit_transform. Comment this step out to feed in your own DataFrame, and
    # replace the placeholder path before running.
    ('csv_combiner', CSVCombiner(folder_path='INPUT FOLDER PATH')),
    # Merge 'likes' and 'loves' into the single 'like' column listed in reaction_columns.
    ('Append_Columns', ColumnAdder(col1='likes', col2='loves', new_col_name='like')),
    ('entropy_calculator', EntropyCalculator(reaction_columns, total_reactions)),
    ('word_cloud_generator', WordCloudGenerator()),
    ('length_analysis', LengthAnalysisTransformer()),
    ('descriptive_stats', DescriptiveStatistics()),
    ('descriptive_stats_plotter', DescriptiveStatsPlotter()),
    ('entropy_analyzer', EntropyAnalyzer()),
    # Runs after the word cloud, length analysis and printed examples, so those
    # steps see the original text without the appended source name.
    ('Append Source name',SourceAppender() ),
    ('entropy_score_plotter', EntropyScorePlotter()),  # one entropy histogram per news source
    ('reaction_analysis', ReactionAnalysisTransformer()),  # printed reaction totals and shares
    ('Preprocess', PreprocessTransformer()),
    ('TextCleaner', TextCleaner(stem=True, lemma=True, remove_special=True)),
    # Comment this step out to skip writing a CSV; replace the placeholder path
    # before running.
    ('csv_writer', CSVWriter(file_path='INPUT FILE PATH')),

])

# Run the pipeline.
# NOTE(review): `df` is not defined in the cells shown here. While the
# csv_combiner step is active its value is ignored, but the name must still
# exist - confirm it is defined earlier in the notebook.
df_transformed = pipeline.fit_transform(df)