In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.dummy import DummyClassifier
from project_functions import *

### Import DF

In [2]:
# Load the comments/ratings dataset from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer='sqr_comments_sentiment.csv')

---

### Clean Text

In [3]:
# cleanText is provided by the project_functions module (star-imported in the first cell).
# Per the author's note it removes punctuation, whitespace, and numbers, and lowercases the text.
# NOTE(review): the return value is discarded here, so this presumably rewrites
# df['comments'] in place — confirm against project_functions.
cleanText(df, 'comments')

---

### Train Test Split

In [4]:
# Train/test split.
# Features are the values of the 'comments' column; the target is 'sqr_rating'.
X, y = df['comments'].values, df['sqr_rating'].values

# Default split proportions; random_state fixes the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=4,
)

---

### Instantiate, fit, and encode using TfidfVectorizer

In [5]:
# TF-IDF vectorizer with all-default settings; it is fitted in the next cell.
vectorizer = TfidfVectorizer()

In [6]:
# Fit the vectorizer on the training comments only and encode them in the same step.
tf_idf_data_train = vectorizer.fit_transform(X_train)

In [7]:
# transform (not fit_transform): the test comments are encoded with the vectorizer
# already fitted on X_train, so nothing is re-fitted on the test split.
tf_idf_data_test = vectorizer.transform(X_test)

---

### Baseline: Dummy Classifier

In [8]:
# Baseline model: fit and predict with a DummyClassifier so the real models
# have a floor to be compared against.
# random_state makes any sampling-based strategy repeatable across runs
# (same seed as the train/test split).
# NOTE(review): no strategy is passed, so the library default applies; that
# default differs between scikit-learn releases — pin `strategy=` explicitly
# if the baseline must be comparable across environments.
dclf = DummyClassifier(random_state=4)
dclf.fit(tf_idf_data_train, y_train)

# Predict on the vectorized test matrix — the same representation used in fit
# and by every other model in this notebook — rather than the raw text array.
dummy_test_preds = dclf.predict(tf_idf_data_test)



In [9]:
# Score the baseline on the test split: plain accuracy and macro-averaged F1.
dummy_acc_test_score = accuracy_score(y_test, dummy_test_preds)
dummy_f1_test_score = f1_score(y_test, dummy_test_preds, average='macro')

# One-row summary table; the bare name on the last line displays it.
dummy_scores = pd.DataFrame(
    {
        'Model': ['Dummy Classifier'],
        'Test Accuracy': [dummy_acc_test_score],
        'Test F1': [dummy_f1_test_score],
    }
)
dummy_scores

Unnamed: 0,Model,Test Accuracy,Test F1
0,Dummy Classifier,0.214575,0.138562


---

### Naive Bayes Classifier

In [15]:
# Multinomial Naive Bayes classifier (alpha=0.85, fit_prior=True).
nb_classifier = MultinomialNB(alpha=0.85, fit_prior=True)

# Fit on the TF-IDF training matrix.
nb_classifier.fit(tf_idf_data_train, y_train)

# Predict on both splits so train and test performance can be compared.
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

# Accuracy and macro-averaged F1 for each split.
nb_acc_train_score = accuracy_score(y_train, nb_train_preds)
nb_f1_train_score = f1_score(y_train, nb_train_preds, average='macro')
nb_acc_test_score = accuracy_score(y_test, nb_test_preds)
nb_f1_test_score = f1_score(y_test, nb_test_preds, average='macro')

# One-row summary table; the bare name on the last line displays it.
nb_scores = pd.DataFrame(
    {
        'Model': ['Naieve Bayes'],
        'Train Accuracy': [nb_acc_train_score],
        'Test Accuracy': [nb_acc_test_score],
        'Train F1': [nb_f1_train_score],
        'Test F1': [nb_f1_test_score],
    }
)
nb_scores

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1,Test F1
0,Naieve Bayes,0.584011,0.315789,0.235356,0.129129


### Random Forest Classifier

In [54]:
# Random forest of 1000 trees, trained on all available cores (n_jobs=-1).
# random_state pins the forest's internal randomness so the scores below are
# reproducible on a fresh run (same seed as the train/test split).
# A more constrained variant (max_depth=20, min_samples_leaf=.001) was
# previously tried here as an alternative configuration.
rf_classifier = RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=4)

# Fit on the TF-IDF training matrix, then predict on both splits so train and
# test performance can be compared.
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

# Accuracy and macro-averaged F1 for each split.
rf_acc_train_score = accuracy_score(y_train, rf_train_preds)
rf_acc_test_score = accuracy_score(y_test, rf_test_preds)
rf_f1_train_score = f1_score(y_train, rf_train_preds, average='macro')
rf_f1_test_score = f1_score(y_test, rf_test_preds, average='macro')

# One-row summary table; the bare name on the last line displays it.
rf_scores = pd.DataFrame(
    {
        'Model': ['Random Forest'],
        'Train Accuracy': [rf_acc_train_score],
        'Test Accuracy': [rf_acc_test_score],
        'Train F1': [rf_f1_train_score],
        'Test F1': [rf_f1_test_score],
    }
)
rf_scores

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1,Test F1
0,Random Forest,0.99729,0.315789,0.998572,0.1541
