### Step 0: Clean the Reviews dataset

In [None]:
import re
def _normalise_review_line(line):
    """Rewrite one raw record line so the result can be parsed as JSON.

    Every single quote becomes a double quote and the literals ``True`` /
    ``False`` are lower-cased. That leaves stray double quotes inside review
    texts, so for each ``"review"`` key the text up to the character before
    the next ``}`` has its double quotes turned back into single quotes.
    Finally all backslashes are removed.

    These are plain text substitutions applied to the whole line, so
    ``True`` / ``False`` inside a review are lower-cased as well, and a
    review that itself contains ``}`` is only repaired up to that brace.
    """
    cleaned = line.replace("'", '"').replace("True", "true").replace("False", "false")

    # Collect the key positions up front: the edits below swap characters one
    # for one, so these offsets stay valid while the string is rebuilt.
    review_key_positions = [m.start() for m in re.finditer('"review"', cleaned)]
    for key_position in review_key_positions:
        # Skip past `"review": "` (11 characters) to the first character of the text.
        text_start = key_position + 11
        closing_brace_offset = cleaned[text_start:].find("}")
        # The character right before the brace is taken to be the closing
        # quote of the review, so the segment stops one short of it.
        text_end = text_start + closing_brace_offset - 1
        if text_end <= text_start:
            # No brace found (find() returned -1) or nothing between the quotes.
            continue
        review_text = cleaned[text_start:text_end]
        cleaned = cleaned[:text_start] + review_text.replace('"', "'") + cleaned[text_end:]

    return cleaned.replace("\\", "")


def clean_review_data(
    input_path="/Users/akshay/Downloads/australian_user_reviews.json",
    output_path="/Users/akshay/Downloads/australian_user_reviews_cleaned.json",
):
    """Clean the raw reviews file line by line and write the result to disk.

    Args:
        input_path: File holding one raw record per line. Defaults to the
            path this notebook originally read from.
        output_path: Destination for the cleaned lines; it is opened in
            write mode, so existing content is replaced. Defaults to the
            path this notebook originally wrote to.

    Returns:
        None.

    Warns:
        RuntimeWarning: if an error interrupts processing part-way through.
            The lines written so far are kept and the warning reports how
            many there were, so a truncated output does not go unnoticed.
    """
    import warnings

    lines_written = 0
    with open(input_path, "r") as raw_input, open(output_path, "w") as cleaned_file:
        try:
            for line in raw_input:
                cleaned_file.write(_normalise_review_line(line))
                lines_written += 1
        except Exception as exc:
            # Still best effort, but no longer silent. Unlike a bare `except`,
            # this lets KeyboardInterrupt and SystemExit propagate.
            warnings.warn(
                "clean_review_data stopped after %d line(s): %r" % (lines_written, exc),
                RuntimeWarning,
            )
# Run the cleaning step when this code is executed as the main module.
if __name__ == "__main__":
    clean_review_data()

### Step 1: Install the required packages

In [4]:
#!pip3 install -U spacy

# Download spaCy's pretrained statistical models for the English language.
# We download "en_core_web_lg", an English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl.
# The download for this pretrained model is 746MB. Its word vectors comprise 685k keys and 685k unique vectors (300 dimensions).

#!python3 -m spacy download en
#!python3 -m spacy download en_core_web_lg

### Step 2: Generate the formatted training dataset from raw data

From the cleaned reviews data, generate training data in the following format:

|review_text                                | sentiment 	|
|-----------------------------------------	|-----------	|
|I liked this game                       	| 1         	|
|I found the game a little underwhelming 	| 0         	|
|the best sequel to the Star Wars game!  	| 1         	|

In [17]:
import json
def generate_test_data(
    input_path="/Users/akshay/Downloads/australian_user_reviews_cleaned.json",
    output_path="/Users/akshay/Downloads/experiment_2_sentiment_dataset.tsv",
):
    """Build the tab-separated sentiment dataset from the cleaned reviews file.

    Each input line is parsed as JSON. For every entry of its ``"reviews"``
    list one output row is produced: the entry's ``"review"`` text, a tab,
    the entry's ``"recommend"`` value converted with ``int``, and a newline.

    Args:
        input_path: Cleaned reviews file with one JSON document per line.
            Defaults to the path this notebook originally read from.
        output_path: Destination TSV file; opened in write mode, so existing
            content is replaced. Defaults to the path this notebook
            originally wrote to.

    Returns:
        None.

    Warns:
        RuntimeWarning: if one or more input lines could not be converted.
            Those lines are skipped; the warning reports how many.
    """
    import warnings

    rows = []
    skipped_lines = 0
    with open(input_path, "r") as cleaned_file:
        for line in cleaned_file:
            try:
                for review in json.loads(line)["reviews"]:
                    # Format the row here so a malformed entry is skipped with
                    # its line instead of failing later, half-way through
                    # writing the output file.
                    rows.append(review["review"] + "\t" + str(int(review["recommend"])) + "\n")
            except Exception:
                # Best effort: give up on the rest of this line. Rows already
                # collected from it are kept.
                skipped_lines += 1

    if skipped_lines:
        warnings.warn(
            "generate_test_data skipped %d input line(s) that could not be converted" % skipped_lines,
            RuntimeWarning,
        )

    with open(output_path, "w") as sentiment_dataset:
        sentiment_dataset.writelines(rows)
                

# Build the TSV dataset when this code is executed as the main module.
if __name__ == "__main__":
    generate_test_data()

In [18]:
import pandas as pd
# Column labels for the header-less TSV written by generate_test_data.
columns_name = ['review_text', 'sentiment']

# Load the tab-separated dataset; the file has no header row.
review_df = pd.read_csv(
    "/Users/akshay/Downloads/experiment_2_sentiment_dataset.tsv",
    sep='\t',
    header=None,
)
review_df.columns = columns_name

# Preview the first rows.
review_df.head()

Unnamed: 0,review_text,sentiment
0,Simple yet with great replayability. In my opi...,1
1,It's unique and worth a playthrough.,1
2,Great atmosphere. The gunplay can be a bit chu...,1
3,I know what you think when you see this title ...,1
4,For a simple (it's actually not all that simpl...,1


In [19]:
# Number of rows and columns in the loaded dataset.
review_df.shape

(59278, 2)

In [20]:
# Count how many reviews carry each sentiment label.
review_df['sentiment'].value_counts()

1    52449
0     6829
Name: sentiment, dtype: int64

### Step 3: Create the NLP Pipeline

In [35]:
import spacy
import string

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.svm import LinearSVC

from spacy.lang.en.stop_words import STOP_WORDS
from spacy import displacy

# Load the en_core_web_lg model once; review_tokenizer calls this shared `nlp` object for every review.
nlp = spacy.load('en_core_web_lg')

In [43]:
def review_tokenizer(review):
    """Turn a review into a list of normalised tokens.

    The review is converted with ``str()`` and passed to the module-level
    ``nlp`` object. Each token contributes its lemma, lower-cased and
    stripped of surrounding whitespace; a token whose lemma is the
    placeholder ``"-PRON-"`` contributes its lower-cased text instead.
    Tokens found in ``STOP_WORDS`` or in ``string.punctuation`` are dropped.

    Args:
        review: The review text; any object is accepted because it is
            passed through ``str()`` first.

    Returns:
        list[str]: The remaining tokens, in document order.
    """
    document = nlp(str(review))

    cleaned_tokens = []
    for token in document:
        # Pronouns need an extra check: their lemma is a placeholder, so
        # fall back to the lower-cased surface form.
        if token.lemma_ != "-PRON-":
            normalised = token.lemma_.lower().strip()
        else:
            normalised = token.lower_

        # Membership is checked against STOP_WORDS itself rather than a fresh
        # list copy built on every call.
        if normalised in STOP_WORDS:
            continue
        # string.punctuation is a single str, so this is a substring test:
        # besides single punctuation marks it also rejects the empty string
        # (for example a whitespace token after strip()).
        if normalised in string.punctuation:
            continue
        cleaned_tokens.append(normalised)
    return cleaned_tokens

In [None]:
# Sanity-check the tokenizer on a single sample review.
review_tokenizer("Simple yet with great replayability. In my opinion does 'zombie' hordes and team work better than left 4 dead plus has a global leveling system. Alot of down to earth 'zombie' splattering fun for the whole family. Amazed this sort of FPS is so rare.")

In [45]:
# Linear support-vector classifier that predicts the sentiment label.
classifier = LinearSVC()

# TF-IDF features computed over the tokens produced by review_tokenizer.
tfidf = TfidfVectorizer(tokenizer=review_tokenizer)

# Chain both stages so raw review text goes in and a label comes out.
sentiment_pipeline = Pipeline(
    [
        ('tfidf', tfidf),
        ('clf', classifier),
    ]
)

### Step 4: Train the Model

In [48]:
# Discard rows with missing values before splitting.
review_df = review_df.dropna()

# Features are the review texts, targets the sentiment labels.
X, y = review_df['review_text'], review_df['sentiment']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=256,
)

# Fit the vectoriser and the classifier on the training portion.
sentiment_pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidf',
                 TfidfVectorizer(tokenizer=<function review_tokenizer at 0x15c833160>)),
                ('clf', LinearSVC())])

In [50]:
# Predict sentiment labels for the held-out test reviews.
y_pred = sentiment_pipeline.predict(X_test)

In [51]:
# Per-class precision, recall and F1 on the test split.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.67      0.39      0.49      1392
           1       0.92      0.97      0.95     10458

    accuracy                           0.91     11850
   macro avg       0.80      0.68      0.72     11850
weighted avg       0.89      0.91      0.89     11850



In [52]:
# Confusion matrix on the test split: rows are true labels, columns are predicted labels.
confusion_matrix(y_test, y_pred)

array([[  541,   851],
       [  262, 10196]])