In [39]:
import pandas as pd
import numpy as np
import nltk
from nltk import word_tokenize, FreqDist
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from project_functions import *

### Import DF

In [2]:
df = pd.read_csv('sqr_comments_sentiment.csv')

In [6]:
df.sqr_rating.value_counts()

0.750    321
0.875    278
1.000    221
0.625    132
0.500     39
0.375     13
0.250      1
Name: sqr_rating, dtype: int64

---

### Convert sqr_rating to integers

In [7]:
df = df.replace({'sqr_rating':{0.250:1, 0.375:2, 0.500:3, 0.625:4, 0.750:5, 0.875:6, 1:7}})

---

In [9]:
# Clean text function from project_functions file. Removes punctuation, whitespace, numbers, and makes text lowercase
cleanText(df, 'comments')

In [11]:
# Create list of all documents
data = list(df.comments)
# Target variable label names
label_names = list(df['sqr_rating'].unique())
# Create numpy array of target values
target = df['sqr_rating'].values
# target = np.array(df['sqr_rating'].values)

In [15]:
# Create function to tokenize documents
def process_article(article):
    tokens = nltk.word_tokenize(article)
    return tokens 

In [16]:
# Create tokens for each document
processed_data = list(map(process_article, data))

In [17]:
# Tokens for each document
processed_data[1]

['wrong',
 'person',
 'the',
 'negative',
 'comments',
 'dont',
 'do',
 'justice',
 'to',
 'ps',
 'our',
 'kid',
 'is',
 'in',
 'asd',
 'nest',
 'program',
 'and',
 'its',
 'been',
 'great',
 'experience',
 'the',
 'school',
 'is',
 'small',
 'and',
 'welcome',
 'the',
 'class',
 'size',
 'is',
 'small',
 'the',
 'teachers',
 'are',
 'experienced',
 'and',
 'all',
 'the',
 'services',
 'for',
 'special',
 'ed',
 'are',
 'in',
 'place',
 'the',
 'science',
 'teacher',
 'mr',
 'walsh',
 'is',
 'excellent',
 'the',
 'school',
 'is',
 'in',
 'the',
 'process',
 'of',
 'building',
 'a',
 'new',
 'playground',
 'starting',
 'this',
 'school',
 'year',
 'there',
 'is',
 'no',
 'uniform',
 'policy',
 'only',
 'thing',
 'that',
 'it',
 'could',
 'improve',
 'is',
 'communication',
 'between',
 'the',
 'school',
 'the',
 'teachers',
 'with',
 'parents',
 'its',
 'a',
 'true',
 'hidden',
 'gem',
 'of',
 'district',
 'i',
 'suggest',
 'that',
 'all',
 'children',
 'go',
 'to',
 'ps',
 'that',
 'li

---

---

### Train Test Split

In [23]:
#TRAIN/TEST SPLIT
X = df['comments'].values
y = df['sqr_rating'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=333)

---

### Instantiate, fit, and encode using TfidfVectorizer

In [24]:
vectorizer = TfidfVectorizer()

In [25]:
tf_idf_data_train = vectorizer.fit_transform(X_train)

In [26]:
tf_idf_data_test = vectorizer.transform(X_test)

---

### Naive Bayes Classifier

In [55]:
# Instantiate Naieve Bayes Classifier
nb_classifier = MultinomialNB()

# Predict using Naieve Bayes Classifier
nb_classifier.fit(tf_idf_data_train, y_train)
nb_train_preds = nb_classifier.predict(tf_idf_data_train)
nb_test_preds = nb_classifier.predict(tf_idf_data_test)

# Get scores
nb_acc_train_score = accuracy_score(y_train, nb_train_preds)
nb_acc_test_score = accuracy_score(y_test, nb_test_preds)
nb_f1_train_score = f1_score(y_train, nb_train_preds, average='macro')
nb_f1_test_score = f1_score(y_test, nb_test_preds, average='macro')
nb_scores = pd.DataFrame({'Model':['Naieve Bayes'], 'Train Accuracy':[nb_acc_train_score], 'Test Accuracy':[nb_acc_test_score], 'Train F1':[nb_f1_train_score], 'Test F1':[nb_f1_test_score]})
nb_scores

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1,Test F1
0,Naieve Bayes,0.504648,0.357143,0.215321,0.106787


In [59]:
# Instantiate Random Forest Classifier
rf_classifier = RandomForestClassifier()

# Predict using Random Forest Classifier
rf_classifier.fit(tf_idf_data_train, y_train)
rf_train_preds = rf_classifier.predict(tf_idf_data_train)
rf_test_preds = rf_classifier.predict(tf_idf_data_test)

# Get scores
rf_acc_train_score = accuracy_score(y_train, rf_train_preds)
rf_acc_test_score = accuracy_score(y_test, rf_test_preds)
rf_f1_train_score = f1_score(y_train, rf_train_preds, average='micro')
rf_f1_test_score = f1_score(y_test, rf_test_preds, average='micro')
rf_scores = pd.DataFrame({'Model':['Random Forest'], 'Train Accuracy':[rf_acc_train_score], 'Test Accuracy':[rf_acc_test_score], 'Train F1':[rf_f1_train_score], 'Test F1':[rf_f1_test_score]})
rf_scores

Unnamed: 0,Model,Train Accuracy,Test Accuracy,Train F1,Test F1
0,Random Forest,0.998672,0.325397,0.998672,0.325397
