In [17]:
import pandas as pd
import numpy as np
import nltk
import string
import re
from nltk import word_tokenize, FreqDist
from nltk.corpus import stopwords
from sklearn.utils import resample
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import train_test_split
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from project_functions import *
import spacy
from spacy.lang.en import English
spacy.load('en')
parser = English()

### Import DF

In [18]:
df = pd.read_csv('csv/sqr_comments_sentiment.csv')

---

### Clean Text

In [19]:
# Clean text function from project_functions file. Removes punctuation, whitespace, numbers, and makes text lowercase
cleanText(df, 'comments')

---

### Upsample minority class to address class imbalance

In [22]:
# separate minority and majority classes
negative = df[df.compound_binary==0]
positive = df[df.compound_binary==1]

# upsample minority
negative_upsampled = resample(negative,
                          replace=True, # sample with replacement
                          n_samples=len(positive), # match number in majority class
                          random_state=23) # reproducible results

# combine majority and upsampled minority
df = pd.concat([positive, negative_upsampled])

---

### Train Test Split

In [5]:
#TRAIN/TEST SPLIT
X = df['comments'].values
y = df['compound_binary'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

---

### Instantiate, fit, and encode using TfidfVectorizer

In [6]:
vectorizer = TfidfVectorizer()
tf_idf_data_train = vectorizer.fit_transform(X_train)
tf_idf_data_test = vectorizer.transform(X_test)

---

### Baseline: Dummy Classifier

In [9]:
#Fitting & predicting the Dummy Classifier (Baseline Model)
dclf = DummyClassifier() 
dclf.fit(tf_idf_data_train, y_train)
dummy_test_preds = dclf.predict(X_test)



In [10]:
# Get scores
dummy_prec_test_score = precision_score(y_test, dummy_test_preds)
dummy_recall_test_score = recall_score(y_test, dummy_test_preds)
dummy_f1_test_score = f1_score(y_test, dummy_test_preds, average='macro')
dummy_scores = pd.DataFrame({'Model':['Dummy Classifier'], 'Precision':[dummy_prec_test_score], 'Recall':[dummy_recall_test_score], 'Test F1':[dummy_f1_test_score]})
dummy_scores

Unnamed: 0,Model,Precision,Recall,Test F1
0,Dummy Classifier,0.81,0.81407,0.511278


---

### Naive Bayes Classifier

In [11]:
# Instantiate Naieve Bayes Classifier
nb_classifier = MultinomialNB(alpha=.85, fit_prior=True)

# Predict using Naieve Bayes Classifier
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

# Get scores
nb_prec_train_score = precision_score(y_train, nb_train_preds)
nb_prec_test_score = precision_score(y_test, nb_test_preds)
nb_recall_train_score = recall_score(y_train, nb_train_preds)
nb_recall_test_score = recall_score(y_test, nb_test_preds)
nb_f1_train_score = f1_score(y_train, nb_train_preds, average='macro')
nb_f1_test_score = f1_score(y_test, nb_test_preds, average='macro')
nb_scores = pd.DataFrame({'Model':['Naieve Bayes'], 'Train Precision':[nb_prec_train_score], 'Test Precision':[nb_prec_test_score], 'Train Recall':[nb_recall_train_score], 'Test Recall':[nb_recall_test_score], 'Train F1':[nb_f1_train_score], 'Test F1':[nb_f1_test_score]})
nb_scores

Unnamed: 0,Model,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Naieve Bayes,0.803523,0.805668,1.0,1.0,0.44553,0.446188


### Random Forest Classifier

In [12]:
# Instantiate Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
# rf_classifier = RandomForestClassifier(max_depth=20, n_estimators=1000, min_samples_leaf=.001, n_jobs=-1)

# Predict using Random Forest Classifier
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

# Get scores
rf_prec_train_score = precision_score(y_train, rf_train_preds)
rf_prec_test_score = precision_score(y_test, rf_test_preds)
rf_recall_train_score = recall_score(y_train, rf_train_preds)
rf_recall_test_score = recall_score(y_test, rf_test_preds)
rf_f1_train_score = f1_score(y_train, rf_train_preds, average='macro')
rf_f1_test_score = f1_score(y_test, rf_test_preds, average='macro')
rf_scores = pd.DataFrame({'Model':['Random Forest'], 'Train Precision':[rf_prec_train_score], 'Test Precision':[rf_prec_test_score], 'Train Recall':[rf_recall_train_score], 'Test Recall':[rf_recall_test_score], 'Train F1':[rf_f1_train_score], 'Test F1':[rf_f1_test_score]})
rf_scores

Unnamed: 0,Model,Train Precision,Test Precision,Train Recall,Test Recall,Train F1,Test F1
0,Random Forest,1.0,0.815574,1.0,1.0,1.0,0.508033


In [13]:
# Train test split
train, test = train_test_split(df, random_state=333)

In [14]:
# Spacy's standard transformer
STOPLIST = set(stopwords.words('english') + list(ENGLISH_STOP_WORDS))
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-", "...", "”", "”"]

class CleanTextTransformer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
    
def cleanText(text):
    text = text.strip().replace("\n", " ").replace("\r", " ")
    text = text.lower()
    return text

def tokenizeText(sample):
    tokens = parser(sample)
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas
    tokens = [tok for tok in tokens if tok not in STOPLIST]
    tokens = [tok for tok in tokens if tok not in SYMBOLS]
    return tokens

In [15]:
# Instantiate vectorizer, classifier, and pipeline
vectorizer = TfidfVectorizer(tokenizer=tokenizeText)
clf = LinearSVC(tol=1e-3, C=.5, dual=False, max_iter=2000)
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

# Create training and testing dependent/independent variables
train1 = train['comments'].tolist()
print(train1)
labelsTrain1 = train['compound_binary'].tolist()

test1 = test['comments'].tolist()
labelsTest1 = test['compound_binary'].tolist()


# Fit the LinearSVC pipeline to the training data
pipe.fit(train1, labelsTrain1)

# Training predictions 
train_preds = pipe.predict(train1)
print("Train Precision:", precision_score(labelsTrain1, train_preds))
print("Train Recall:", recall_score(labelsTrain1, train_preds))
print("Train F1:", f1_score(labelsTrain1, train_preds, average='macro'))

# Testing predictions
preds = pipe.predict(test1)
print("Test Precision:", precision_score(labelsTest1, preds))
print("Test Recall:", recall_score(labelsTest1, preds))
print("Test F1:", f1_score(labelsTest1, preds, average='macro'))

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



Train Precision: 0.9867986798679867
Train Recall: 1.0
Train F1: 0.9819718585108462
Test Precision: 0.803347280334728
Test Recall: 0.9896907216494846
Test F1: 0.5417786695945179
