In [57]:
import timeit
import pandas as pd
import numpy as np
from scipy.stats import entropy
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import os

# Clean and tokenize text
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string

nltk.download('stopwords') # download stopwords corpus
nltk.download('punkt') # download punkt tokenizer

# For linear regression
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

from sklearn.feature_extraction.text import TfidfVectorizer


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Hannah/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/Hannah/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## TRANSFORMERS

In [None]:
# NOTE(review): absolute, user-specific path — this cell only works on a machine where this directory exists.
%cd /Users/Hannah/Documents/VU/Msc/Thesis/Coding/Pipeline

In [None]:
# Load 'Indicator-Desc_DataNew.csv' (relative path, resolved against the current working directory).
# The cells below read its 'content' column.
df = pd.read_csv('Indicator-Desc_DataNew.csv')

In [None]:
from transformers import AutoTokenizer, AutoModel, TFAutoModel

In [None]:
# Load the tokenizer and the TensorFlow model for the "GroNLP/bert-base-dutch-cased" checkpoint.
# NOTE(review): the name `model` is rebound to a LinearRegression in the "Linear Regression" section,
# so the BERT model is no longer reachable under this name after that cell has run.
tokenizer = AutoTokenizer.from_pretrained("GroNLP/bert-base-dutch-cased")
model = TFAutoModel.from_pretrained("GroNLP/bert-base-dutch-cased")  # Tensorflow


In [None]:
# Read the embedding width and the maximum sequence length from the model config
# instead of hard-coding them (768 and 512 for BERT-base checkpoints).
hidden_size = model.config.hidden_size
max_length = model.config.max_position_embeddings

# Pre-allocate one embedding row per document
embeddings = np.zeros((len(df), hidden_size))

# Loop through the dataframe column and generate embeddings
for i, text in enumerate(df['content']):
    # Encode the text. Truncate to the model's maximum length: longer inputs have
    # no position embedding and make the forward pass fail.
    input_ids = tokenizer.encode(
        str(text),
        add_special_tokens=True,
        truncation=True,
        max_length=max_length,
        return_tensors="tf",
    )
    # Generate the embeddings using the model
    output = model(input_ids)
    # Max-pool over the token axis (axis=1) and drop the single-document batch axis,
    # leaving one vector of length hidden_size for this document
    embeddings[i] = np.max(output.last_hidden_state.numpy(), axis=1)[0]

# Add the embeddings to the dataframe, one column per embedding dimension
for j in range(hidden_size):
    df[f'embedding_{j}'] = embeddings[:, j]


In [None]:
# Additionally store each document's full embedding vector in a single object column.
# NOTE(review): the regression cell selects its features with df.iloc[:, -769:-1], which relies on
# this 'embeddings' column being the last column of the saved CSV — keep the column order intact.
df['embeddings'] = list(embeddings)

In [None]:
# Persist the dataframe with its embedding columns; the "Linear Regression" section reads this file back.
df.to_csv('transformers_dataNew.csv', index=False)


## TF-IDF

In [None]:
# NOTE(review): absolute, user-specific path — this cell only works on a machine where this directory exists.
%cd /Users/Hannah/Documents/VU/Msc/Thesis/Coding/Pipeline

In [None]:
# NOTE(review): this loads 'Indicator-Desc_Data.csv', whereas the transformers section loads
# 'Indicator-Desc_DataNew.csv' — confirm which file is intended here.
df = pd.read_csv('Indicator-Desc_Data.csv')

In [None]:
# Dutch stopword set, loaded once on first use instead of on every call
_DUTCH_STOPWORDS = None


def clean_and_tokenize(text):
    """Strip punctuation, lowercase and tokenize `text`, dropping Dutch stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean.

    Returns
    -------
    list of str
        The remaining tokens, in their original order.
    """
    global _DUTCH_STOPWORDS
    if _DUTCH_STOPWORDS is None:
        _DUTCH_STOPWORDS = frozenset(stopwords.words('dutch'))

    # Remove the characters in string.punctuation, then convert to lowercase
    cleaned = text.translate(str.maketrans("", "", string.punctuation)).lower()

    # Tokenize and keep only the tokens that are not stopwords
    return [token for token in word_tokenize(cleaned) if token not in _DUTCH_STOPWORDS]

In [None]:
# Missing posts become empty strings; a plain astype(str) would turn NaN into the literal text 'nan',
# which would then be tokenized as a real word.
df['content'] = df['content'].fillna('').astype(str)
# clean_and_tokenize returns a list of tokens; join it into one space-separated string
# instead of storing the list's Python repr ("['a', 'b']") as the text to vectorize.
df['prep_content'] = df['content'].apply(clean_and_tokenize).str.join(' ')


In [None]:
## Tf-Idf (advanced variant of BoW)
vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1,2))

# Fit and transform the vectorizer on the text column
X = vectorizer.fit_transform(df['prep_content'])

# Convert the sparse matrix to a dense numpy array
X_array = X.toarray()

# Create a dataframe with the tf-idf features. It reuses df's index so that the concat below
# lines the rows up one-to-one even when df does not carry the default 0..n-1 index
# (otherwise concat(axis=1) would align on labels and introduce NaN rows).
# Columns are labelled 0..n_features-1; the term behind column j is
# vectorizer.get_feature_names_out()[j].
tfidf_df = pd.DataFrame(X_array, index=df.index)

# Add the new dataframe as columns to the original dataframe
df = pd.concat([df, tfidf_df], axis=1)


## GloVe

In [None]:
import numpy as np
import fasttext as ft
from sklearn.linear_model import LinearRegression
import gensim.downloader as api
from gensim import corpora
from gensim import models
from gensim.models import KeyedVectors



In [None]:
# Load the 50-dimensional GloVe vectors from their text file.
# `gensim` itself is never bound as a name in this notebook (the imports above bind only `api`,
# `corpora`, `models` and `KeyedVectors`), so call the imported class directly.
# no_header=True (gensim >= 4.0): raw GloVe files do not start with the "<vocab_size> <dim>" line
# that the word2vec text format expects.
# NOTE(review): if this file was already converted to word2vec format, drop no_header.
glove_model = KeyedVectors.load_word2vec_format('glove.6B.50d.txt', binary=False, no_header=True)

In [None]:
def get_text_embeddings(texts, glove_model):
    """
    Convert input texts into a matrix of averaged GloVe embeddings.

    Parameters
    ----------
    texts : iterable
        One entry per document. An entry is either a sequence of tokens or a
        plain string; strings are split on whitespace first, so that words
        (not single characters) are looked up.
    glove_model : object
        Embedding lookup supporting `word in glove_model`,
        `glove_model[word]` and a `vector_size` attribute.

    Returns
    -------
    np.ndarray
        Array of shape (number of texts, glove_model.vector_size). A document
        with no known word is represented by a zero vector.
    """
    dim = glove_model.vector_size
    embeddings = []
    for text in texts:
        # Iterating a str yields characters, so split it into words first
        words = text.split() if isinstance(text, str) else text
        # Collect the vectors of the words the model knows
        word_embeddings = [glove_model[word] for word in words if word in glove_model]
        # Combine word embeddings into a single document embedding
        if word_embeddings:
            embeddings.append(np.mean(word_embeddings, axis=0))
        else:
            embeddings.append(np.zeros(dim))
    if not embeddings:
        # Keep the result two-dimensional even when there are no texts
        return np.zeros((0, dim))
    return np.array(embeddings)

## Linear Regression

In [63]:
# Alternative input: 'transformers_data-High_RC.csv' holds only the posts with > 40 reactions.
# Here: read the file with all posts and keep the rows with more than 30 reactions.
df = pd.read_csv('transformers_dataNew.csv').loc[
    lambda posts: posts['reactions_count'] > 30
]

In [65]:
# Drop rows without a target BEFORE selecting the features, so that X and y are built from
# exactly the same rows. Slicing X first leaves it with the rows that dropna removes and makes
# train_test_split fail on inconsistent lengths. Reassigning (rather than inplace=True) also
# avoids mutating the filtered slice produced by the previous cell.
df = df.dropna(subset=['entropy'])

# Split the data into X and y.
# The last column is skipped and the 768 columns before it are used as features.
X = df.iloc[:, -769:-1]
#X = df['prep_content']
y = df['entropy']

# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [66]:
# (GloVe alternative, currently disabled: embed X_train / X_test with
#  get_text_embeddings(..., glove_model) before fitting.)

# Fit a linear regression on the training split; fit() returns the fitted estimator
model = LinearRegression().fit(X_train, y_train)

# Predict the target for the held-out rows
y_pred = model.predict(X_test)

# Score the predictions against the held-out targets
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

# TODO: add loss function / insights? Maybe check overfitting with a validation set.
print(f'Mean Squared Error: {mse:.4f}')
print(f'Mean Absolute Error: {mae:.2f}')
print(f'R-squared: {r2:.2f}')

Mean Squared Error: 0.1211
Mean Absolute Error: 0.28
R-squared: -0.03


In [61]:
# Display the predictions made for the test split
y_pred

array([0.5220547 , 0.41011461, 0.50753803, ..., 0.46926745, 0.2164918 ,
       0.60698682])

## Explainer

In [None]:
from lime.lime_text import LimeTextExplainer

# Render a sequence of values as one space-separated string
def embeddings_to_text(embeddings):
    """Return the values of `embeddings`, each passed through str(), joined by single spaces."""
    return ' '.join(map(str, embeddings))

# Define LimeTextExplainer
# NOTE(review): X holds numeric embedding columns; a text explainer presumably expects raw text
# and a prediction function that accepts strings — confirm this is the intended explainer.
explainer = LimeTextExplainer()

# Turn every row of X into one string of space-separated numbers (one string per row)
sample_text = X.apply(embeddings_to_text, axis=1)
#sample_text = embeddings_to_text(sample)

In [None]:
# Select a sample from the data: the string built from the first row of X
sample = sample_text.values[0]

In [None]:
from lime.lime_text import LimeTextExplainer

# The regressor was fitted on numeric embedding columns, so it is explained with LIME's tabular
# explainer in regression mode. LimeTextExplainer perturbs a raw text string and needs a
# prediction function that accepts strings, which does not match this model.
from lime.lime_tabular import LimeTabularExplainer

# Name of the regression target
class_names = ['entropy']

# Define the explainer on the training features; the seed makes the explanation repeatable
explainer = LimeTabularExplainer(
    X_train.to_numpy(),
    mode='regression',
    feature_names=list(X.columns),
    random_state=42,
)

# Take the first row of X directly as a float vector. The previous round trip through a
# space-joined string was split on ',' and therefore could not be parsed back into floats.
sample_array = X.iloc[0].to_numpy(dtype=float)
sample_reshaped = sample_array.reshape(1, -1)  # 2-D (1, n_features) view of the same sample


def predict_entropy(rows):
    """Predict with the fitted regressor, restoring the column names it was trained with."""
    return model.predict(pd.DataFrame(rows, columns=X.columns))


# Explain the instance (LIME expects the 1-D row here)
exp = explainer.explain_instance(sample_array, predict_entropy, num_features=6)

# Generate the explanation in HTML format
html = exp.as_html()


In [None]:
from lime.lime_text import LimeTextExplainer

# The regressor was fitted on numeric embedding columns, so it is explained with LIME's tabular
# explainer in regression mode rather than with the text explainer.
from lime.lime_tabular import LimeTabularExplainer

# Define the explainer on the training features; the seed makes the explanation repeatable
explainer = LimeTabularExplainer(
    X_train.to_numpy(),
    mode='regression',
    feature_names=list(X.columns),
    random_state=42,
)

# Select the instance to explain by position. X[5] would look up a *column* labelled 5
# and raise KeyError, so use .iloc for the sixth row instead.
#instance_idx = np.random.randint(len(X))
instance = X.iloc[5].to_numpy(dtype=float)

# Wrap predict so the perturbed rows carry the column names the model was trained with
exp = explainer.explain_instance(
    instance,
    lambda rows: model.predict(pd.DataFrame(rows, columns=X.columns)),
    num_features=10,
)

# Print the explanation
print(exp.as_list())
