<a href="https://colab.research.google.com/github/jimenasalinas/RedditGoesGreen/blob/main/models/Word2Vec_final_threshold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import pandas as pd
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import numpy as np
from google.colab import drive
from datasets import Dataset
import random
import csv
import nltk
from nltk.corpus import stopwords
from gensim.models import KeyedVectors
from sklearn.metrics import f1_score

In [4]:
# Directory on the mounted drive that holds the project's input files
data_path = "/content/drive/My Drive/group_project/archive/"

# Mount Google Drive so that the files under data_path become readable
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install datasets

In [5]:
# Load the threshold-filtered comments; the 'label' column is forced to int.
dtype_dict = dict(label=int)
comments = pd.read_csv(
    data_path + "comments_filtered_by_threshold.csv",
    quoting=csv.QUOTE_NONNUMERIC,
    dtype=dtype_dict,
)

In [6]:
# Target label, and every remaining column as the feature frame
y = comments['label']
X = comments.drop(columns=['label'])

# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=120938,
)


In [7]:
# Number of observations for the smaller datasets
train_obs = 15000
test_obs = 1500

# Training set
# Shuffle the features once and reuse that ordering for the labels, so the
# feature/label pairing never depends on two separate .sample() calls producing
# the same permutation. The .loc lookups rely on index labels being unique,
# which holds for the default index read_csv assigns when the data is loaded.
train_order = X_train.sample(n=len(X_train), random_state=42).index
X_train_shuffled = X_train.loc[train_order].reset_index(drop=True)
y_train_shuffled = y_train.loc[train_order].reset_index(drop=True)
# Selecting the first 'train_obs' samples. .copy() makes the subsets independent
# objects, so adding columns to them later does not raise SettingWithCopyWarning.
small_X_train = X_train_shuffled.head(train_obs).copy()
small_y_train = y_train_shuffled.head(train_obs).copy()

# Testing set
# Same procedure: one shuffle of the features, labels follow the same order.
test_order = X_test.sample(n=len(X_test), random_state=42).index
X_test_shuffled = X_test.loc[test_order].reset_index(drop=True)
y_test_shuffled = y_test.loc[test_order].reset_index(drop=True)
# Selecting the first 'test_obs' samples
small_X_test = X_test_shuffled.head(test_obs).copy()
small_y_test = y_test_shuffled.head(test_obs).copy()

In [None]:
# Preprocessing
nltk.download('stopwords')
# Kept as a set: remove_stopwords tests every token of every comment for
# membership, and a set lookup is constant-time whereas a list is scanned
# element by element.
stop_words = set(stopwords.words('english'))

def remove_stopwords(transcript):
    """Drop stopwords from a piece of text.

    The text is split on whitespace, every token that appears in the global
    `stop_words` collection is discarded, and the remaining tokens are joined
    back together with single spaces. Matching is exact (case-sensitive).
    """
    kept_tokens = []
    for token in transcript.split():
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def clean_transcript(df):
    """Return a copy of `df` with an added 'clean_transcript' text column.

    The 'body' column is lower-cased, stripped of every character that is
    neither a word character nor whitespace, and passed through
    `remove_stopwords`.

    The input frame is left untouched; callers use the returned frame.
    Missing bodies become the empty string instead of the literal token
    'nan' that `astype(str)` would otherwise produce. No rows are dropped,
    so the result has the same length and row order as `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'body' column.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` plus the 'clean_transcript' column.
    """
    cleaned = df.copy()
    # Fill before casting: astype(str) would turn NaN/None into the word 'nan'.
    body_text = cleaned['body'].fillna('').astype(str)
    cleaned['clean_transcript'] = (
        body_text.str.lower()
        .str.replace(r'[^\w\s]', '', regex=True)
        .apply(remove_stopwords)
    )
    return cleaned

# Clean both subsets (lower-casing, punctuation stripping, stopword removal)
small_X_train = clean_transcript(small_X_train)
small_X_test = clean_transcript(small_X_test)

# Split each cleaned transcript on whitespace to get its token list
for frame in (small_X_train, small_X_test):
    frame['tokenized'] = frame['clean_transcript'].map(str.split)

In [9]:
# Fit Word2Vec embeddings on the tokenized training comments only
w2v_params = dict(vector_size=300, window=5, min_count=1, workers=4)
model = Word2Vec(sentences=small_X_train['tokenized'], **w2v_params)

# Generate a feature vector for each document
def comment_vector(comment):
    """Average the Word2Vec vectors of a comment's in-vocabulary tokens.

    Parameters
    ----------
    comment : iterable of str
        Tokens of one comment.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of the tokens known to the global
        Word2Vec `model`, or a zero vector of length `model.vector_size`
        when none of the tokens is in the vocabulary.
    """
    word_vectors = model.wv
    # key_to_index is a dict, so each membership test is a hash lookup;
    # index_to_key is a list and would be scanned for every token.
    known_tokens = [word for word in comment if word in word_vectors.key_to_index]
    if not known_tokens:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors[known_tokens], axis=0)

# One averaged embedding per comment, stored next to its tokens
for frame in (small_X_train, small_X_test):
    frame['doc_vector'] = frame['tokenized'].apply(comment_vector)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_X_train['doc_vector'] = small_X_train['tokenized'].apply(comment_vector)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_X_test['doc_vector'] = small_X_test['tokenized'].apply(comment_vector)


In [10]:
# Stack the per-comment vectors into one 2-D array per subset
X_train_vectors, X_test_vectors = (
    np.stack(frame['doc_vector'].values)
    for frame in (small_X_train, small_X_test)
)

In [12]:
# Logistic regression on the self-trained document vectors (fit returns the estimator)
lr_model = LogisticRegression(max_iter=1000).fit(X_train_vectors, small_y_train)

# Predicted labels for the held-out comments
lr_predictions = lr_model.predict(X_test_vectors)

# Score the predictions against the true test labels
lr_accuracy = accuracy_score(small_y_test, lr_predictions)
lr_f1 = f1_score(small_y_test, lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)
print("F1 score:", lr_f1)


Logistic Regression Accuracy: 0.6213333333333333
F1 score: 0.5977337110481586


In [13]:
# Random forest with 100 trees on the self-trained document vectors;
# random_state pins the forest's own randomness
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_vectors, small_y_train
)

# Predicted labels for the held-out comments
rf_predictions = rf_model.predict(X_test_vectors)

# Overall accuracy plus the per-class precision/recall/F1 table
rf_accuracy = accuracy_score(small_y_test, rf_predictions)
rf_report = classification_report(small_y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report:", rf_report)

Random Forest Accuracy: 0.5946666666666667
Classification Report:               precision    recall  f1-score   support

           0       0.59      0.70      0.64       767
           1       0.61      0.48      0.54       733

    accuracy                           0.59      1500
   macro avg       0.60      0.59      0.59      1500
weighted avg       0.60      0.59      0.59      1500



In [14]:
from sklearn.svm import SVC

# Linear-kernel support vector classifier on the self-trained document vectors
svm_model = SVC(kernel='linear').fit(X_train_vectors, small_y_train)
svm_predictions = svm_model.predict(X_test_vectors)

# Overall accuracy plus the per-class precision/recall/F1 table
svm_accuracy = accuracy_score(small_y_test, svm_predictions)
svm_report = classification_report(small_y_test, svm_predictions)
print("SVM Accuracy:", svm_accuracy)
print("Classification Report:", svm_report)

SVM Accuracy: 0.6286666666666667
Classification Report:               precision    recall  f1-score   support

           0       0.63      0.68      0.65       767
           1       0.63      0.57      0.60       733

    accuracy                           0.63      1500
   macro avg       0.63      0.63      0.63      1500
weighted avg       0.63      0.63      0.63      1500



In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Defining the model
# Two hidden ReLU layers feeding a single sigmoid unit for the binary label.
# It is bound to `nn_model` rather than `model`, because `model` already holds
# the Word2Vec embeddings that `comment_vector` reads through `model.wv`;
# reusing that name would leave the function pointing at a Keras network.
nn_model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_vectors.shape[1],)),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model
nn_model.fit(X_train_vectors, small_y_train, epochs=10, batch_size=32)

# Evaluate the model: evaluate() yields the loss followed by the compiled metric
loss, accuracy = nn_model.evaluate(X_test_vectors, small_y_test)
print("Neural Network Accuracy:", accuracy)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.5773333311080933


## Pretrained model: Google News Word2Vec embeddings

In [13]:
# The Google News Word2Vec file sits in the same drive folder as the comment data
corpus_path = f"{data_path}GoogleNews-vectors-negative300.bin.gz"

# Read the vectors; binary=True selects the binary word2vec file format
model_pretrained = KeyedVectors.load_word2vec_format(corpus_path, binary=True)

# Generate a feature vector for each document
def comment_vector(comment):
    """Average the pretrained vectors of a comment's in-vocabulary tokens.

    Tokens missing from `model_pretrained.key_to_index` are ignored. If no
    token remains, a zero vector of length `model_pretrained.vector_size`
    is returned; otherwise the element-wise mean of the remaining tokens'
    vectors.
    """
    vocabulary = model_pretrained.key_to_index
    known_tokens = [token for token in comment if token in vocabulary]
    if known_tokens:
        # Look each token up individually, then average across tokens
        token_vectors = [model_pretrained[token] for token in known_tokens]
        return np.mean(token_vectors, axis=0)
    return np.zeros(model_pretrained.vector_size)

# One averaged pretrained embedding per comment, stored next to its tokens
for frame in (small_X_train, small_X_test):
    frame['doc_vector_pretrained'] = frame['tokenized'].apply(comment_vector)

# Stack the per-comment vectors into one 2-D array per subset
X_train_vectors_pretrained, X_test_vectors_pretrained = (
    np.stack(frame['doc_vector_pretrained'].values)
    for frame in (small_X_train, small_X_test)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_X_train['doc_vector_pretrained'] = small_X_train['tokenized'].apply(comment_vector)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  small_X_test['doc_vector_pretrained'] = small_X_test['tokenized'].apply(comment_vector)


In [14]:
# Logistic regression on the pretrained document vectors (fit returns the estimator)
lr_model = LogisticRegression(max_iter=1000).fit(
    X_train_vectors_pretrained, small_y_train
)

# Predicted labels for the held-out comments
lr_predictions = lr_model.predict(X_test_vectors_pretrained)

# Score the predictions against the true test labels
lr_accuracy = accuracy_score(small_y_test, lr_predictions)
lr_f1 = f1_score(small_y_test, lr_predictions)
print("Logistic Regression Accuracy:", lr_accuracy)
print("F1 Score:", lr_f1)

Logistic Regression Accuracy: 0.7273333333333334
F1 Score: 0.7181254307374225


In [18]:
# Random forest with 100 trees on the pretrained document vectors;
# random_state pins the forest's own randomness
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_vectors_pretrained, small_y_train
)

# Predicted labels for the held-out comments
rf_predictions = rf_model.predict(X_test_vectors_pretrained)

# Overall accuracy plus the per-class precision/recall/F1 table
rf_accuracy = accuracy_score(small_y_test, rf_predictions)
rf_report = classification_report(small_y_test, rf_predictions)
print("Random Forest Accuracy:", rf_accuracy)
print("Classification Report:", rf_report)

Random Forest Accuracy: 0.666
Classification Report:               precision    recall  f1-score   support

           0       0.65      0.76      0.70       767
           1       0.69      0.57      0.62       733

    accuracy                           0.67      1500
   macro avg       0.67      0.66      0.66      1500
weighted avg       0.67      0.67      0.66      1500



In [19]:
# Defining the model
# Same architecture as the earlier network, trained on the pretrained document
# vectors. It gets its own name instead of rebinding the global `model`, which
# was first assigned the Word2Vec model that the first `comment_vector` reads.
nn_model_pretrained = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_vectors_pretrained.shape[1],)),
    Dense(256, activation='relu'),
    Dense(1, activation='sigmoid')
])

nn_model_pretrained.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model
nn_model_pretrained.fit(X_train_vectors_pretrained, small_y_train, epochs=10, batch_size=32)

# Evaluate the model: evaluate() yields the loss followed by the compiled metric
loss, accuracy = nn_model_pretrained.evaluate(X_test_vectors_pretrained, small_y_test)
print("Neural Network Accuracy:", accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Neural Network Accuracy: 0.6666666865348816
