In [6]:
import pandas as pd
import numpy as np
import re
import pyprind

import sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer, HashingVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.decomposition import LatentDirichletAllocation

import nltk
from nltk.stem.porter import PorterStemmer
nltk.download('stopwords')
nltk.download('vader_lexicon')
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer

stop = stopwords.words('english')
sia = SentimentIntensityAnalyzer()

print('Setup Complete')

Setup Complete


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/jordansamek/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/jordansamek/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [7]:
# Load the reviews together with their previously computed sentiment columns
# (the path is relative to the notebook's working directory) and preview them.
df = pd.read_csv('reviews_sentiment.csv')
df.head()

Unnamed: 0,review_stars,text,rev_length,sentiment_score,sentiment
0,3,Decent food at reasonable prices. Ambiance is ...,57,-0.3182,neutral
1,4,"Good food, and good sized portions for the pri...",777,0.9392,positive
2,4,The name of the venue is intriguing but after ...,686,0.97,positive
3,4,Really good Thai food. Had been looking a dece...,238,0.9243,positive
4,5,Bookmarked AND the hubby recommended it. Our f...,3392,0.9989,positive


In [8]:
# Discard the old star/length/sentiment columns; the sentiment is recomputed below.
# Rebinding `df` (rather than mutating it in place) keeps only the remaining columns.
df = df.drop(columns=['review_stars', 'rev_length', 'sentiment_score', 'sentiment'])

In [9]:
def compound_score(txt):
  """Return the 'compound' polarity score that the module-level `sia` analyzer gives `txt`."""
  scores = sia.polarity_scores(txt)
  return scores['compound']

def sentiment(score):
  """Map a polarity score to a binary label.

  Scores greater than or equal to zero are "positive" (so an exactly-zero
  score counts as positive) and scores below zero are "negative". If neither
  comparison holds (e.g. the score is NaN) an empty string is returned.
  """
  if score >= 0:
    return "positive"
  if score < 0:
    return "negative"
  return ""

In [10]:
# Score every review; the column is cast to str first so that non-string cells
# can be passed to the analyzer as well.
polarity_scores = df['text'].astype("str").apply(compound_score)
df["sentiment_score"] = polarity_scores

# Turn each compound score into a "positive"/"negative" label.
df["sentiment"] = df["sentiment_score"].apply(sentiment)

In [11]:
df.head()

Unnamed: 0,text,sentiment_score,sentiment
0,Decent food at reasonable prices. Ambiance is ...,-0.3182,negative
1,"Good food, and good sized portions for the pri...",0.9392,positive
2,The name of the venue is intriguing but after ...,0.97,positive
3,Really good Thai food. Had been looking a dece...,0.9243,positive
4,Bookmarked AND the hubby recommended it. Our f...,0.9989,positive


In [15]:
df.sentiment.value_counts()

positive    375530
negative     51260
Name: sentiment, dtype: int64

In [16]:
# The numeric score was only needed to derive the label; keep text + sentiment.
df = df.drop(columns=['sentiment_score'])

In [19]:
df.shape

(426790, 2)

In [20]:
# At this point `sentiment` still holds the string labels (integer encoding happens
# in a later cell); index=False keeps the row index out of the file.
df.to_csv('revised_review_sentiments.csv', index=False) # saving as new csv

In [21]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integers. LabelEncoder orders its classes
# alphabetically, so 'negative' -> 0 and 'positive' -> 1.
lab_enc = LabelEncoder()
df['sentiment'] = lab_enc.fit_transform(df['sentiment'])

In [23]:
df.sample(5)

Unnamed: 0,text,sentiment
9881,I seated myself at the bar hoping for a quick ...,1
372862,This was my second venture to Sunset Burger an...,1
142982,Since moving to the West End I've been trying ...,1
203241,"It's pretty good, all things considering. I l...",1
70581,Cute bright little Taiwanese cafe just a block...,1


In [28]:
df.loc[184, 'text'][:1000]

"My favorite Greek food and pizza place in Burnaby by a long shot. Portions are huge. We order regularly and it's always consistent and delicious! The staff there are always great, and you can tell they genuinely take pride in what they do."

In [24]:
from nltk.tokenize import word_tokenize

In [33]:
# Shared resources for preprocess_df, built lazily on first use. Re-running this
# cell resets the cache, so changes to `stop` are picked up after a re-run.
_PREPROCESS_CACHE = {}


def _stemmer_and_stopwords():
    """Return the shared (PorterStemmer, stop-word frozenset) pair, building it once."""
    if not _PREPROCESS_CACHE:
        _PREPROCESS_CACHE['stemmer'] = PorterStemmer()
        # `stop` is a list; a frozenset makes every membership test O(1)
        # instead of a linear scan per token.
        _PREPROCESS_CACHE['stop_words'] = frozenset(stop)
    return _PREPROCESS_CACHE['stemmer'], _PREPROCESS_CACHE['stop_words']


def preprocess_df(text):
    """Normalise one review for vectorisation.

    The text is lower-cased, every run of non-word characters is replaced by a
    single space, the result is tokenised, stop words are dropped and the
    remaining tokens are Porter-stemmed.

    Parameters
    ----------
    text : str
        Raw review text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces.
    """
    # Reuse one stemmer and one stop-word set across calls instead of rebuilding
    # the stemmer and scanning the stop-word list for every token of every row.
    porter, stop_words = _stemmer_and_stopwords()
    text = text.lower() # convert to lowercase
    text = re.sub(r'[\W]+', ' ', text) # remove all non-word characters
    # tokenize, remove stop words, then stem what is left
    return ' '.join(porter.stem(word) for word in word_tokenize(text) if word not in stop_words)

In [35]:
%%time
# Overwrites the raw review text with its preprocessed form, so this cell is not
# idempotent: running it again would preprocess the already-stemmed text.
df['text'] = df['text'].apply(preprocess_df)

CPU times: user 11min 38s, sys: 2.06 s, total: 11min 40s
Wall time: 11min 41s


In [36]:
df.sample(5)

Unnamed: 0,text,sentiment
246246,stop randomli tonight restaur brand new realli...,1
191476,cool everyth place made smile love everi minut...,1
257693,came sunday brunch offer brunch lunch menu man...,1
203903,guy work total jerk friend look forward date t...,0
52201,absolut delici lemon grass chicken vermicelli ...,1


In [37]:
from sklearn.model_selection import train_test_split

In [48]:
# Features are the preprocessed review text; the target is the integer-encoded
# sentiment label.
X = df['text']
y = df['sentiment']
print(X.shape, y.shape)

(426790,) (426790,)


In [50]:
# Train split: keep 75% for training and hold out 25%, stratified on the label so
# both parts keep the positive/negative ratio of the full data set.
X_train, X_rem, y_train, y_rem = train_test_split(X, y, test_size=0.25, stratify=y, random_state=2022)

# Test / validation split of the held-out 25%: the first returned part (85%) is
# used as the test set and the `test_size` part (15%) as the validation set.
# Stratify here too -- the classes are heavily imbalanced, so an unstratified
# split can give the two sets different class ratios than the training set.
X_test, X_valid, y_test, y_valid = train_test_split(X_rem, y_rem, test_size=0.15, stratify=y_rem, random_state=2022)

print(X_train.shape, y_train.shape)
print(X_test.shape, y_test.shape)
print(X_valid.shape, y_valid.shape)

(320092,) (320092,)
(90693,) (90693,)
(16005,) (16005,)


In [52]:
%%time
# Grid search over a TF-IDF + logistic-regression pipeline.
# lowercase=False: the text was already lower-cased during preprocessing.
tfidf = TfidfVectorizer(strip_accents=None,
                        lowercase=False,
                        preprocessor=None)

# Two grids, both unigram-only and both trying l1/l2 penalties with three C values:
#   1) the vectorizer's default weighting/normalisation;
#   2) use_idf=False and norm=None, i.e. un-normalised term counts.
# 6 candidates per grid -> 12 candidates in total.
param_grid = [{'vect__ngram_range': [(1, 1)],
               'clf__penalty': ['l1', 'l2'],
               'clf__C': [1.0, 10.0, 100.0]},
             {'vect__ngram_range': [(1, 1)],
              'vect__use_idf': [False],
              'vect__norm': [None],
              'clf__penalty': ['l1', 'l2'],
              'clf__C': [1.0, 10.0, 100.0]}
             ]

# liblinear is used because the grid includes both the l1 and the l2 penalty.
lr_tfidf = Pipeline([('vect', tfidf),
                     ('clf', LogisticRegression(random_state=2022, 
                                                solver='liblinear'))])

# NOTE(review): scoring is plain accuracy although the labels are heavily skewed
# towards the positive class -- confirm this is the intended selection metric.
# NOTE(review): 12 candidates x 7 folds = 84 fits on the full training set; the
# recorded run was interrupted before finishing -- consider a smaller cv or a
# subsample for iteration.
gs_lr_tfidf = GridSearchCV(lr_tfidf, param_grid,
                        scoring='accuracy',
                        cv=7, verbose=2,
                        n_jobs=-1)

gs_lr_tfidf.fit(X_train, y_train)

Fitting 7 folds for each of 12 candidates, totalling 84 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:  4.2min


KeyboardInterrupt: 