In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score, f1_score
from project_functions import *
import spacy
from spacy.lang.en import English
# NOTE(review): the object returned by spacy.load('en') is not assigned to anything,
# so this call only matters if loading has side effects (or is meant to fail fast when
# the model is not installed) — TODO confirm whether it is still needed.
spacy.load('en')
# `parser` is the English language object that tokenizeText uses to tokenize comments.
parser = English()



In [2]:
# Read the comment data from a CSV file (relative path, resolved against the working directory).
# Later cells use the 'comments' and 'sqr_rating' columns of this frame.
df = pd.read_csv('sqr_comments_sentiment.csv')

In [3]:
# Run the two-argument cleanText helper on the 'comments' column. The return value is
# discarded, so presumably the helper modifies df in place — TODO confirm.
# NOTE(review): this helper is not defined in the notebook (presumably it comes from the
# `project_functions` star import). A later cell defines a one-argument cleanText(text)
# under the same name, so re-running this cell after that one will raise a TypeError.
cleanText(df, 'comments')

In [6]:
# Split df into train and test partitions. No test_size/train_size is passed, so
# scikit-learn's default proportions apply; random_state=333 pins the shuffle so the
# same split is produced on every run.
train, test = train_test_split(df, random_state=333)

In [11]:
# Stop words dropped by tokenizeText: NLTK's English stop words combined with
# scikit-learn's ENGLISH_STOP_WORDS, stored as a set for fast membership checks.
STOPLIST = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS)
# Punctuation tokens dropped by tokenizeText: every ASCII punctuation character, plus an
# ellipsis and BOTH curly double quotes. (The list previously contained the closing quote
# twice and never the opening one, so opening curly quotes slipped through the filter.)
SYMBOLS = list(string.punctuation) + ["-", "...", "“", "”"]

class CleanTextTransformer(TransformerMixin):
    """Pipeline step that applies ``cleanText`` to every document it receives.

    The step keeps no state: fitting learns nothing and it reports no parameters.
    """

    def fit(self, X, y=None, **fit_params):
        """Nothing to learn; hand back this transformer unchanged."""
        return self

    def transform(self, X, **transform_params):
        """Return a list holding ``cleanText(item)`` for each item of ``X``, in order."""
        return list(map(cleanText, X))

    def get_params(self, deep=True):
        """Report an empty parameter dict, regardless of ``deep``."""
        return {}
    
def cleanText(text):
    """Normalize a single comment string.

    Leading/trailing whitespace is stripped, every remaining newline or carriage
    return is replaced by a space, and the result is lower-cased.

    Note: the name ``cleanText`` is also called earlier in the notebook with two
    arguments (``cleanText(df, 'comments')``); once this one-argument definition
    has run, that earlier call can no longer be re-executed as written.
    """
    flattened = text.strip()
    for line_break in ("\n", "\r"):
        flattened = flattened.replace(line_break, " ")
    return flattened.lower()

def tokenizeText(sample):
    """Tokenize ``sample`` with the module-level ``parser`` and return filtered lemmas.

    Each token contributes its lemma, lower-cased and stripped of surrounding
    whitespace. Tokens whose lemma is the ``"-PRON-"`` placeholder contribute their
    lower-cased surface form instead.

    A lemma is dropped when it is:
      * empty — whitespace-only tokens collapse to ``""`` after stripping, and would
        otherwise reach the vectorizer as a meaningless empty-string feature;
      * present in ``STOPLIST``;
      * present in ``SYMBOLS``.

    Parameters
    ----------
    sample : str
        Text to tokenize.

    Returns
    -------
    list of str
        The surviving lemmas, in document order.
    """
    lemmas = (
        tok.lower_ if tok.lemma_ == "-PRON-" else tok.lemma_.lower().strip()
        for tok in parser(sample)
    )
    # Single pass: reject empties first so "" never becomes a vocabulary entry.
    return [
        lemma
        for lemma in lemmas
        if lemma and lemma not in STOPLIST and lemma not in SYMBOLS
    ]

In [12]:
# Build the three pipeline stages: text cleaning -> bag-of-words counts (unigrams only,
# tokenized by tokenizeText) -> linear SVM classifier.
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1, 1))
clf = LinearSVC(tol=1e-3, C=.5, dual=False, max_iter=2000)
pipe = Pipeline(
    [
        ('cleanText', CleanTextTransformer()),
        ('vectorizer', vectorizer),
        ('clf', clf),
    ]
)

# Extract inputs (comment text) and targets (sqr_rating) from each split as plain lists.
train1, labelsTrain1 = train['comments'].tolist(), train['sqr_rating'].tolist()
test1, labelsTest1 = test['comments'].tolist(), test['sqr_rating'].tolist()

# Train the whole pipeline on the training split.
pipe.fit(train1, labelsTrain1)

# Score on the data the model was fitted on.
train_preds = pipe.predict(train1)
print("Train Accuracy:", accuracy_score(labelsTrain1, train_preds))
print("Train F1:", f1_score(labelsTrain1, train_preds, average='macro'))

# Score on the held-out split.
preds = pipe.predict(test1)
print("Test Accuracy:", accuracy_score(labelsTest1, preds))
print("Test F1:", f1_score(labelsTest1, preds, average='macro'))

Train Accuracy: 0.997289972899729
Train F1: 0.9986190977557803
Test Accuracy: 0.2874493927125506
Test F1: 0.17911887395758364
Top 10 features used to predict: 
