In [1]:
#!pip install --upgrade networkx==2.6
#!pip install --upgrade scipy==1.8.0
#!pip install praw
#!pip install stanza
#!pip install --upgrade scikit-learn
#!pip install wordcloud
#!pip install scikit-optimize
#pip install xgboost

In [1]:
import re
import os
import io
import praw
import nltk
import time
import math
import stanza
import string

from xgboost import XGBClassifier

import numpy as np
import pandas as pd
import datetime as dt
#use scipy v1.8.0, networkx v2.6
import networkx as nx 

from time import time
from heapq import nlargest

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

from sklearn import svm
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score,  classification_report, confusion_matrix, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud
from nltk.tokenize import word_tokenize
from string import punctuation

import warnings
warnings.filterwarnings("ignore")

<h3>2.1. Data Cleaning and Splitting</h3>

In [4]:
#load data
df = pd.read_csv('to_label.csv')
df_2 = pd.read_csv('to_label_2.csv')
df_3 = pd.read_csv('to_label_3.csv')
df_4 = pd.read_csv('to_label_4.csv')
df_5 = pd.read_csv('to_label_4.csv')

#merge all labelled dataframes
df = pd.concat([df, df_2, df_3, df_4, df_5], axis = 0).reset_index(drop = True)

In [5]:
#data cleaning
pattern = r'^https?:\/\/.*[\r\n]*'
#remove comments with only hyperlinks
df = df[~df['content'].str.contains(pattern)]
#remove comments which were deleted
df = df.loc[df['content'] != '[deleted]']

In [6]:
#define some more stop words ontop of the default stop word list
stop_words = set(stopwords.words('english'))
#stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

# Function to preprocess text
def preprocess_text(text):
    #define regex for urls
    url_pattern = re.compile(r'https?://\S+|www\.\S+')
    text = url_pattern.sub('', text) # Remove URLs
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    tokens = nltk.word_tokenize(text)
    # lemmatize tokens
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    # Return cleaned text after removing Nones
    tokens = list(filter(lambda x: x is not None, tokens))
    return ' '.join(tokens)

In [7]:
#preprocess all comments/posts into new column
df['cleaned_content'] = df['content'].apply(preprocess_text)

In [8]:
#nltk.download('averaged_perceptron_tagger')
# Split comments into sentences and perform POS tagging on each sentence
sentences = [nltk.sent_tokenize(content) for content in df['cleaned_content']]
pos_tagged_sentences = [[nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in comment] for comment in sentences]

# Extract POS features for each comment; idea is to count the number of tokens belonging to each
#pos tag within each sentence
pos_features = []
for content in pos_tagged_sentences:
    pos_dict = {}
    for sentence in content:
        for word, tag in sentence:
            pos_dict[tag] = pos_dict.get(tag, 0) + 1
    pos_features.append(pos_dict)
    
# Convert POS features to a sparse matrix
pos_vectorizer = DictVectorizer(sparse=False)
pos_features = pos_vectorizer.fit_transform(pos_features)

# Define a function for stemming
stemmer = nltk.stem.PorterStemmer()
def tokenize_and_stem(text):
    tokens = nltk.word_tokenize(text)
    stems = [stemmer.stem(token) for token in tokens]
    return stems

#fit tfidf_vectorizer with corpus
tfidf = TfidfVectorizer(
    stop_words = 'english',
    ngram_range = (1, 2),
    tokenizer = tokenize_and_stem,
    min_df = 2, #ignore if ngram appears in less than 2 documents
    max_df = 0.9 #ignore if ngram appears in more than 89% of the documents
)

tfidf_features = tfidf.fit_transform(df['cleaned_content'])

#fit countvectorizer with corpus
bow = CountVectorizer()
bow_features = bow.fit_transform(df['cleaned_content'])

#combine tfidf, bow  and pos features into a single array
tfidf_pos = np.concatenate((tfidf_features.toarray(), bow_features.toarray(), pos_features), axis = 1)

#add score to the array
tfidf_pos = np.c_[tfidf_pos, df['score']]

#specify encoding sequence
sequence = ['negative', 'neutral', 'positive']

#load label encoder
le = LabelEncoder()

#encode labels
labels = le.fit(sequence).transform(df['sentiment'].str.lower())

In [9]:
#split into train and test set
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_pos,
    labels, 
    test_size=0.2, 
    random_state=42
)

<h3> 2.2. Random Forest Tuning </h3>

In [15]:
# initialize randomforest classifier
classifier = RandomForestClassifier(random_state=42)

#gridsearch to determine optimal n_estimator with cross-validation; best params: max_features= sqrt, n_estimators = 80
params_rf = {
    'n_estimators': list(range(30, 100, 10)),
    'max_features': ['sqrt', 'log2']
}

gs = BayesSearchCV(
    n_jobs = -1,
    estimator = classifier,
    search_spaces = params_rf,
    cv = 5,
    scoring = 'f1_weighted',
    random_state= 42
)

#start time
start = time()

#fit
gs.fit(X_train, y_train)

BayesSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42),
              n_jobs=-1, random_state=42, scoring='f1_weighted',
              search_spaces={'max_features': ['sqrt', 'log2'],
                             'n_estimators': [30, 40, 50, 60, 70, 80, 90]})

In [16]:
# Print the best parameters and test F1 score
print(f"Best parameters: {gs.best_params_}")

#score
y_pred = gs.predict(X_test)
print(f"took {time() - start:.3f} seconds")

#confusion Matrix
cf = pd.DataFrame(
        confusion_matrix(y_test, y_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf)

print("\nClassification report:")
print(classification_report(y_test, y_pred))

Best parameters: OrderedDict([('max_features', 'sqrt'), ('n_estimators', 90)])
took 314.822 seconds
               predicted neutral  predicted negative  predicted positive
true neutral                  41                  66                  11
true negative                  5                 206                  17
true positive                  2                  40                  86

Classification report:
              precision    recall  f1-score   support

           0       0.85      0.35      0.49       118
           1       0.66      0.90      0.76       228
           2       0.75      0.67      0.71       128

    accuracy                           0.70       474
   macro avg       0.76      0.64      0.66       474
weighted avg       0.73      0.70      0.68       474



<h3> 2.3. Logistic Regression Tuning </h3>

In [12]:
penalty_pairs = {
    'none': 'none',
    'l1': 'liblinear',
    'l2': 'lbfgs'
}

#define parameters
params_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
best_model_lr = ""
best_f1 = 0

#iterate through each penalty and solver pair
for p, s in penalty_pairs.items():
    if p == 'none':
        logreg = LogisticRegression(penalty = 'none')
    else:
        logreg = LogisticRegression(penalty = p, solver = s)

    gs_logreg = BayesSearchCV(
        n_jobs = -1,
        estimator = logreg,
        search_spaces = params_lr,
        cv = 5,
        scoring = 'f1_weighted',
        random_state= 42
    )

    #start time
    start = time()
    #fit gridsearch to training data
    gs_logreg.fit(X_train, y_train)

    # Print the best parameters and test F1 score
    print(f"Best parameters: {gs_logreg.best_params_}")


    #score
    y_pred = gs_logreg.predict(X_test)
    
    f1_test = f1_score(y_test, y_pred, average = 'weighted')

    #weighted - puts more emphasis on classes with lower data points
    print(f"[penalty type - {p}] f1 test: {f1_test} | took {time() - start:.3f} seconds")
    
    if f1_test > best_f1:
        best_model_lr = gs_logreg
        best_f1 = f1_test

Best parameters: OrderedDict([('C', 0.1)])
[penalty type - none] f1 test: 0.5148194539186893 | took 496.028 seconds
Best parameters: OrderedDict([('C', 10.0)])
[penalty type - l1] f1 test: 0.6749020729219407 | took 33.964 seconds
Best parameters: OrderedDict([('C', 100.0)])
[penalty type - l2] f1 test: 0.5492880375397172 | took 492.680 seconds


In [13]:
# Print the best parameters and test F1 score
print(f"Best parameters: {best_model_lr.best_params_}")

#score
y_pred = best_model_lr.predict(X_test)
print(f"took {time() - start:.3f} seconds")

#confusion Matrix
cf = pd.DataFrame(
        confusion_matrix(y_test, y_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf)

print("\nClassification report:")
print(classification_report(y_test, y_pred))

Best parameters: OrderedDict([('C', 10.0)])
took 492.699 seconds
               predicted neutral  predicted negative  predicted positive
true neutral                  56                  45                  17
true negative                 20                 183                  25
true positive                 14                  30                  84

Classification report:
              precision    recall  f1-score   support

           0       0.62      0.47      0.54       118
           1       0.71      0.80      0.75       228
           2       0.67      0.66      0.66       128

    accuracy                           0.68       474
   macro avg       0.67      0.64      0.65       474
weighted avg       0.68      0.68      0.67       474



<h3> 2.4. XGBoost Tuning </h3>

In [23]:
#start time
start = time()

#train the model
xgbm = XGBClassifier(n_estimators = 100)

#define parameters; result: best max_depth is 9, learning rate is 0.9
params_xgbm = {
    'max_depth': Integer(3, 10),
    'learning_rate': Real(0.001, 1.0, prior='log-uniform'),
    'reg_alpha': Real(1e-9, 1.0, prior='log-uniform'),
}

gs_xgbm = BayesSearchCV(
    n_jobs = -1,
    estimator = xgbm,
    search_spaces = params_xgbm,
    cv = 5,
    scoring = 'f1_weighted',
    random_state= 42
).fit(X_train, y_train)

In [30]:
# Print the best parameters and test F1 score
print(f"Best parameters: {gs_xgbm.best_params_}")

#score
y_pred = gs_xgbm.predict(X_test)
print(f"took {time() - start:.3f} seconds")

#confusion Matrix
cf = pd.DataFrame(
        confusion_matrix(y_test, y_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf)

print("\nClassification report:")
print(classification_report(y_test, y_pred))

Best parameters: OrderedDict([('learning_rate', 0.9893064224892779), ('max_depth', 3), ('reg_alpha', 0.0205428306991576)])
took 7492.519 seconds
               predicted neutral  predicted negative  predicted positive
true neutral                  62                  39                  17
true negative                 27                 176                  25
true positive                 14                  21                  93

Classification report:
              precision    recall  f1-score   support

           0       0.60      0.53      0.56       118
           1       0.75      0.77      0.76       228
           2       0.69      0.73      0.71       128

    accuracy                           0.70       474
   macro avg       0.68      0.67      0.68       474
weighted avg       0.69      0.70      0.70       474



<h3> 2.5. MLP Tuning </h3>

In [21]:
#initialize model
mlp = MLPClassifier(random_state=42, learning_rate = 'adaptive')

#define parameters
params_mlp = {
    'hidden_layer_sizes': [10, 20, 30, 60, 120],
    'activation': ['relu', 'tanh', 'logistic']
}

gs_mlp = BayesSearchCV(
    estimator = mlp, 
    search_spaces = params_mlp, 
    cv=5, 
    n_jobs = -1, 
    random_state= 42
)

#start time
start = time()

#fit gridsearch to training data
gs_mlp.fit(X_train, y_train)

BayesSearchCV(cv=5,
              estimator=MLPClassifier(learning_rate='adaptive',
                                      random_state=42),
              n_jobs=-1, random_state=42,
              search_spaces={'activation': ['relu', 'tanh', 'logistic'],
                             'hidden_layer_sizes': [10, 20, 30, 60, 120]})

In [22]:
# Print the best parameters and test F1 score
print(f"Best parameters: {gs_mlp.best_params_}")

#score
y_pred = gs_mlp.predict(X_test)
print(f"took {time() - start:.3f} seconds")

#confusion Matrix
cf = pd.DataFrame(
        confusion_matrix(y_test, y_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )
print(cf)

print("\nClassification report:")
print(classification_report(y_test, y_pred))

Best parameters: OrderedDict([('activation', 'logistic'), ('hidden_layer_sizes', 20)])
took 3926.872 seconds
               predicted neutral  predicted negative  predicted positive
true neutral                  61                  37                  20
true negative                 36                 163                  29
true positive                 14                  28                  86

Classification report:
              precision    recall  f1-score   support

           0       0.55      0.52      0.53       118
           1       0.71      0.71      0.71       228
           2       0.64      0.67      0.65       128

    accuracy                           0.65       474
   macro avg       0.63      0.63      0.63       474
weighted avg       0.65      0.65      0.65       474



<h3> 2.6. Compare against Pre-tuned Models</h3>

In [29]:
#check against vader's performance
from nltk.sentiment.vader import SentimentIntensityAnalyzer

#train test split
X_train, X_test, y_train, y_test = train_test_split(
    df['content'], 
    labels, 
    test_size = 0.2, 
    random_state= 42
)

In [32]:
#empty lists to store output
vader_labels = []

#initialize analyzer
analyzer = SentimentIntensityAnalyzer()

#iterate and label each comment/post
for i, comment in enumerate(X_test):
    sent = analyzer.polarity_scores(comment)
    compound_score = sent['compound']
    
    #introduce a threshold of -0.05~0.05 for 'neutral' comments; 
    #-1 is very negative, 0 is very neutral, 1 is very positive
    if compound_score > 0.05:
        sent = 2
    elif compound_score < -0.05:
        sent = 1
    else:
        sent = 0
    
    #store output
    vader_labels.append(sent)
    
#view weighted f1 score
print(f"Test F1 score: {f1_score(y_test, vader_labels, average = 'weighted')}")

Test F1 score: 0.30695827836035205


In [33]:
#confusion Matrix
cf = pd.DataFrame(
        confusion_matrix(y_test, vader_labels),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )

print(cf)

#classification report
print("\nClassification report:")
print(classification_report(y_test, vader_labels))

               predicted neutral  predicted negative  predicted positive
true neutral                  16                  49                  53
true negative                 91                  44                  93
true positive                 14                  12                 102

Classification report:
              precision    recall  f1-score   support

           0       0.13      0.14      0.13       118
           1       0.42      0.19      0.26       228
           2       0.41      0.80      0.54       128

    accuracy                           0.34       474
   macro avg       0.32      0.38      0.31       474
weighted avg       0.35      0.34      0.31       474



In [34]:
#compare against stanza's sentiment analyzer
#train test split
X_train, X_test, y_train, y_test = train_test_split(
    df['content'], 
    labels, 
    test_size = 0.2, 
    random_state= 42
)

In [35]:
nlp = stanza.Pipeline('en', processors='tokenize, sentiment')

# define a function to classify sentiment of a comment
def classify_sentiment(comment):
    # analyze the sentiment of the comment
    doc = nlp(comment)
    for i, sentence in enumerate(doc.sentences):
        if sentence.sentiment == 0:
            return 1
        elif sentence.sentiment == 1:
            return 0
        else:
            return 2

# classify the sentiment of each comment
stanza_pred = [classify_sentiment(comment) for comment in X_test]

2023-03-28 13:10:03 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.4.1.json:   0%|   …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/tokenize/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/sentiment/sstplus.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/pretrain/combined.pt:   0%|    …

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/forward_charlm/1billion.pt:   0…

Downloading https://huggingface.co/stanfordnlp/stanza-en/resolve/v1.4.1/models/backward_charlm/1billion.pt:   …

2023-03-28 13:10:20 INFO: Loading these models for language: en (English):
| Processor | Package  |
------------------------
| tokenize  | combined |
| sentiment | sstplus  |

2023-03-28 13:10:20 INFO: Use device: gpu
2023-03-28 13:10:20 INFO: Loading: tokenize
2023-03-28 13:10:24 INFO: Loading: sentiment
2023-03-28 13:10:24 INFO: Done loading processors!


In [36]:
#confusion Matrix
cf = pd.DataFrame(
        confusion_matrix(y_test, stanza_pred),
        index = ['true neutral', 'true negative', 'true positive'],
        columns = ['predicted neutral', 'predicted negative', 'predicted positive']
    )

print(cf)

#classification report
print("\nClassification report:")
print(classification_report(y_test, stanza_pred))

               predicted neutral  predicted negative  predicted positive
true neutral                  42                  65                  11
true negative                159                  46                  23
true positive                 48                  26                  54

Classification report:
              precision    recall  f1-score   support

           0       0.17      0.36      0.23       118
           1       0.34      0.20      0.25       228
           2       0.61      0.42      0.50       128

    accuracy                           0.30       474
   macro avg       0.37      0.33      0.33       474
weighted avg       0.37      0.30      0.31       474



<h3> 3. Labelling the Main Dataset with the Best Model</h3>

In [28]:
##summary table of the optimum xgboost
#form xgboost model with the optimum parameters
xgbm = XGBClassifier(n_estimators = 100, max_depth = 9, learning_rate = 0.9).fit(X_train, y_train)

In [14]:
#same preprocessing
main_df = pd.read_csv('top_posts_year.csv')

#data cleaning
pattern = r'^https?:\/\/.*[\r\n]*'
#remove comments with only hyperlinks
main_df = main_df[~main_df['content'].str.contains(pattern)]
#remove comments which were deleted
main_df = main_df.loc[main_df['content'] != '[deleted]']

#preprocess all comments/posts into new column
main_df['cleaned_content'] = main_df['content'].apply(preprocess_text)

# Split comments into sentences and perform POS tagging on each sentence
sentences = [nltk.sent_tokenize(content) for content in main_df['cleaned_content']]
pos_tagged_sentences = [[nltk.pos_tag(nltk.word_tokenize(sentence)) for sentence in comment] for comment in sentences]

# Extract POS features for each comment; idea is to count the number of tokens belonging to each
#pos tag within each sentence
pos_features = []
for content in pos_tagged_sentences:
    pos_dict = {}
    for sentence in content:
        for word, tag in sentence:
            pos_dict[tag] = pos_dict.get(tag, 0) + 1
    pos_features.append(pos_dict)
    
# Convert POS features to a sparse matrix
pos_features = pos_vectorizer.transform(pos_features)
tfidf_features = tfidf.transform(main_df['cleaned_content'])
bow_features = bow.transform(main_df['cleaned_content'])

#combine tfidf, bow  and pos features into a single array
tfidf_pos = np.concatenate((tfidf_features.toarray(), bow_features.toarray(), pos_features), axis = 1)

#add score to the array
tfidf_pos = np.c_[tfidf_pos, main_df['score']]

In [16]:
#predict
sents = []
for i in xgbm.predict(tfidf_pos):
    sent = ''
    if i == 0:
        sent = 'negative'
    elif i == 1:
        sent = 'neutral'
    else:
        sent = 'positive'
    sents.append(sent)
    
main_df['pred_sentiment'] = sents

#save dont run as it'll overwrite the current file
#main_df.to_csv('predict_output.csv', index = False)