Versions

numpy==1.26.3

pandas==2.1.4

nltk==3.8.1

seaborn==0.13.1

bs4==0.0.1

scikit-learn==1.3.2

torch==2.1.2

torchaudio==2.1.2

torchvision==0.16.2

gensim==4.3.2

In [47]:
import pandas as pd
import numpy as np
import nltk
import re
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import torch

Reading Data

In [48]:
original_df = pd.read_csv(
    "amazon_reviews_us_Office_Products_v1_00.tsv",
    sep="\t",
    on_bad_lines="skip",
    low_memory=False
)

In [49]:
df = original_df[:]

In [50]:
df.head(3)

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,US,43081963,R18RVCKGH1SSI9,B001BM2MAC,307809868,"Scotch Cushion Wrap 7961, 12 Inches x 100 Feet",Office Products,5,0.0,0.0,N,Y,Five Stars,Great product.,2015-08-31
1,US,10951564,R3L4L6LW1PUOFY,B00DZYEXPQ,75004341,"Dust-Off Compressed Gas Duster, Pack of 4",Office Products,5,0.0,1.0,N,Y,"Phffffffft, Phfffffft. Lots of air, and it's C...",What's to say about this commodity item except...,2015-08-31
2,US,21143145,R2J8AWXWTDX2TF,B00RTMUHDW,529689027,Amram Tagger Standard Tag Attaching Tagging Gu...,Office Products,5,0.0,0.0,N,Y,but I am sure I will like it.,"Haven't used yet, but I am sure I will like it.",2015-08-31


In [51]:
df = df.dropna(subset="review_headline")
df = df.dropna(subset="review_body")

df["review_text"] = df["review_headline"] + " " + df["review_body"]

columns = ["star_rating", "review_text"]

df = df[columns]

df.loc[:, ("star_rating")] = pd.to_numeric(df["star_rating"], errors='coerce')

df = df.dropna(subset="star_rating")
df = df.dropna(subset="review_text")

In [52]:
df.head(3)

Unnamed: 0,star_rating,review_text
0,5,Five Stars Great product.
1,5,"Phffffffft, Phfffffft. Lots of air, and it's C..."
2,5,but I am sure I will like it. Haven't used yet...


In [53]:
samples_per_rating = 50000
# samples_per_rating = 50

df1 = df.loc[df["star_rating"] == 1]
df1 = df1[:samples_per_rating]

df2 = df.loc[df["star_rating"] == 2]
df2 = df2[:samples_per_rating]

df3 = df.loc[df["star_rating"] == 3]
df3 = df3[:samples_per_rating]

df4 = df.loc[df["star_rating"] == 4]
df4 = df4[:samples_per_rating]

df5 = df.loc[df["star_rating"] == 5]
df5 = df5[:samples_per_rating]

df = pd.concat([df1, df2, df3, df4, df5], ignore_index=True)
df = df.sort_index()

df.head()

Unnamed: 0,star_rating,review_text
0,1,and the shredder was dirty and the bin was par...
1,1,One Star worked about a month then died
2,1,One Star The phone did not work. No Dial Tone...
3,1,One Star Not laminated and no reinforced holes...
4,1,"One Star Cartridge was over filled, black smea..."


In [54]:
df["sentiment_score"] = 3

df.loc[df["star_rating"] > 3, "sentiment_score"] = 2
df.loc[df["star_rating"] < 3, "sentiment_score"] = 1

In [55]:
df.head()

Unnamed: 0,star_rating,review_text,sentiment_score
0,1,and the shredder was dirty and the bin was par...,1
1,1,One Star worked about a month then died,1
2,1,One Star The phone did not work. No Dial Tone...,1
3,1,One Star Not laminated and no reinforced holes...,1
4,1,"One Star Cartridge was over filled, black smea...",1


Data Cleaning

In [56]:
# Transform string to lower case
df["review_text"] = df["review_text"].str.lower()

# Remove html tags
df["review_text"] = df["review_text"].apply(
    lambda x: BeautifulSoup(x).get_text()
)

# Remove urls
df["review_text"] = df["review_text"].apply(
    lambda x: re.sub('http[s]?://\S+', '', x)
)

# remove any non-alphabetic charcters
df["review_text"] = df["review_text"].apply(
    lambda x: re.sub(r'[^a-z\s]+', '', x)
)

# remove any extra spaces
df["review_text"] = df["review_text"].apply(
    lambda x: re.sub(r' +', ' ', x)
)

  lambda x: BeautifulSoup(x).get_text()


In [57]:
contractions = {
    re.compile(r"\bain't\b", re.I | re.U): "are not",
    re.compile(r"\ba'ight\b", re.I | re.U): "alright",
    re.compile(r"\bamn't\b", re.I | re.U): "am not",
    re.compile(r"\bn\b", re.I | re.U): "and",
    re.compile(r"\barencha\b", re.I | re.U): "are not you",
    re.compile(r"\baren't\b", re.I | re.U): "are not",
    re.compile(r"\b'bout\b", re.I | re.U): "about",
    re.compile(r"\bcan't\b", re.I | re.U): "cannot",
    re.compile(r"\bcap'n\b", re.I | re.U): "captain",
    re.compile(r"\bcause\b", re.I | re.U): "because",
    re.compile(r"\bcuz\b", re.I | re.U): "because",
    re.compile(r"\bcept\b", re.I | re.U): "except",
    re.compile(r"\bc'mon\b", re.I | re.U): "come on",
    re.compile(r"\bcould've\b", re.I | re.U): "could have",
    re.compile(r"\bcouldn't\b", re.I | re.U): "could not",
    re.compile(r"\bcouldn't've\b", re.I | re.U): "could not have",
    re.compile(r"\bcuppa\b", re.I | re.U): "cup of",
    re.compile(r"\bdaren't\b", re.I | re.U): "dare not",
    re.compile(r"\bdaresn't\b", re.I | re.U): "dare not",
    re.compile(r"\bdasn't\b", re.I | re.U): "dare not",
    re.compile(r"\bdidn't\b", re.I | re.U): "did not",
    re.compile(r"\bdoesn't\b", re.I | re.U): "does not",
    re.compile(r"\bdon't\b", re.I | re.U): "do not",
    re.compile(r"\bdunno\b", re.I | re.U): "do not know",
    re.compile(r"\bd'ya\b", re.I | re.U): "did you",
    re.compile(r"\be'en\b", re.I | re.U): "even",
    re.compile(r"\be'er\b", re.I | re.U): "ever",
    re.compile(r"\beverybody's\b", re.I | re.U): "everybody is",
    re.compile(r"\beveryone's\b", re.I | re.U): "everyone is",
    re.compile(r"\beverything's\b", re.I | re.U): "everything is",
    re.compile(r"\b'em\b", re.I | re.U): "them",
    re.compile(r"\bfinna\b", re.I | re.U): "fixing to",
    re.compile(r"\bfo'c'sle\b", re.I | re.U): "forecastle",
    re.compile(r"\bgainst\b", re.I | re.U): "against",
    re.compile(r"\bg'day\b", re.I | re.U): "good day",
    re.compile(r"\bgimme\b", re.I | re.U): "give me",
    re.compile(r"\bgiv'n\b", re.I | re.U): "given",
    re.compile(r"\bgi'z\b", re.I | re.U): "give us",
    re.compile(r"\bgonna\b", re.I | re.U): "going to",
    re.compile(r"\bgon't\b", re.I | re.U): "go not",
    re.compile(r"\bgotta\b", re.I | re.U): "got to",
    re.compile(r"\bhadn't\b", re.I | re.U): "had not",
    re.compile(r"\bhad've\b", re.I | re.U): "had have",
    re.compile(r"\bhasn't\b", re.I | re.U): "has not",
    re.compile(r"\bhaven't\b", re.I | re.U): "have not",
    re.compile(r"\bhelluva\b", re.I | re.U): "hell of a",
    re.compile(r"\bhe'd\b", re.I | re.U): "he would",
    re.compile(r"\bhe'll\b", re.I | re.U): "he will",
    re.compile(r"\bhe's\b", re.I | re.U): "he is",
    re.compile(r"\byes'nt\b", re.I | re.U): "no",
    re.compile(r"\bhow'd\b", re.I | re.U): "how did",
    re.compile(r"\bhere's\b", re.I | re.U): "here is",
    re.compile(r"\bhowdy\b", re.I | re.U): "how do you do",
    re.compile(r"\bhow'll\b", re.I | re.U): "how will",
    re.compile(r"\bhow're\b", re.I | re.U): "how are",
    re.compile(r"\bI'd've\b", re.I | re.U): "I would have",
    re.compile(r"\bI'd'nt\b", re.I | re.U): "I would not",
    re.compile(r"\bI'd'nt've\b", re.I | re.U): "I would not have",
    re.compile(r"\bIf'n\b", re.I | re.U): "If and when",
    re.compile(r"\bI'm\b", re.I | re.U): "I am",
    re.compile(r"\bImma\b", re.I | re.U): "I am going to",
    re.compile(r"\bI'm'o\b", re.I | re.U): "I am going to",
    re.compile(r"\binnit\b", re.I | re.U): "is it not",
    re.compile(r"\bIon\b", re.I | re.U): "I do not",
    re.compile(r"\bI've\b", re.I | re.U): "I have",
    re.compile(r"\bisn't\b", re.I | re.U): "is not",
    re.compile(r"\bit'd\b", re.I | re.U): "it would",
    re.compile(r"\bIdunno\b", re.I | re.U): "I do not know",
    re.compile(r"\bkinda\b", re.I | re.U): "kind of",
    re.compile(r"\blet's\b", re.I | re.U): "let us",
    re.compile(r"\bloven't\b", re.I | re.U): "love not",
    re.compile(r"\bma'am\b", re.I | re.U): "madam",
    re.compile(r"\bmayn't\b", re.I | re.U): "may not",
    re.compile(r"\bmay've\b", re.I | re.U): "may have",
    re.compile(r"\bmethinks\b", re.I | re.U): "I think",
    re.compile(r"\bmightn't\b", re.I | re.U): "might not",
    re.compile(r"\bmight've\b", re.I | re.U): "might have",
    re.compile(r"\bmine's\b", re.I | re.U): "mine is",
    re.compile(r"\bmustn't\b", re.I | re.U): "must not",
    re.compile(r"\bmustn't've\b", re.I | re.U): "must not have",
    re.compile(r"\bmust've\b", re.I | re.U): "must have",
    re.compile(r"\b'neath\b", re.I | re.U): "beneath",
    re.compile(r"\bneedn't\b", re.I | re.U): "need not",
    re.compile(r"\bnal\b", re.I | re.U): "and all",
    re.compile(r"\bne'er\b", re.I | re.U): "never",
    re.compile(r"\bo'clock\b", re.I | re.U): "of the clock",
    re.compile(r"\bo'er\b", re.I | re.U): "over",
    re.compile(r"\bol'\b", re.I | re.U): "old",
    re.compile(r"\bought've\b", re.I | re.U): "ought have",
    re.compile(r"\boughtn't\b", re.I | re.U): "ought not",
    re.compile(r"\boughtn't've\b", re.I | re.U): "ought not have",
    re.compile(r"\b'round\b", re.I | re.U): "around",
    re.compile(r"\b's\b", re.I | re.U): "is",
    re.compile(r"\bshalln't\b", re.I | re.U): "shall not",
    re.compile(r"\bshan'\b", re.I | re.U): "shall not",
    re.compile(r"\bshan't\b", re.I | re.U): "shall not",
    re.compile(r"\bshould've\b", re.I | re.U): "should have",
    re.compile(r"\bshouldn't\b", re.I | re.U): "should not",
    re.compile(r"\bshouldn't've\b", re.I | re.U): "should not have",
    re.compile(r"\bso're\b", re.I | re.U): "so are",
    re.compile(r"\bso've\b", re.I | re.U): "so have",
    re.compile(r"\bthat're\b", re.I | re.U): "that are",
    re.compile(r"\bthere're\b", re.I | re.U): "there are",
    re.compile(r"\bthese're\b", re.I | re.U): "these are",
    re.compile(r"\bthese've\b", re.I | re.U): "these have",
    re.compile(r"\bthey've\b", re.I | re.U): "they have",
    re.compile(r"\bthose're\b", re.I | re.U): "those are",
    re.compile(r"\bthose've\b", re.I | re.U): "those have",
    re.compile(r"\b'thout\b", re.I | re.U): "without",
    re.compile(r"\b'til\b", re.I | re.U): "until",
    re.compile(r"\b'tis\b", re.I | re.U): "it is",
    re.compile(r"\bto've\b", re.I | re.U): "to have",
    re.compile(r"\btryna\b", re.I | re.U): "trying to",
    re.compile(r"\b'twas\b", re.I | re.U): "it was",
    re.compile(r"\b'tween\b", re.I | re.U): "between",
    re.compile(r"\b'twere\b", re.I | re.U): "it were",
    re.compile(r"\bw'all\b", re.I | re.U): "we all",
    re.compile(r"\bw'at\b", re.I | re.U): "we at",
    re.compile(r"\bwanna\b", re.I | re.U): "want to",
    re.compile(r"\bwasn't\b", re.I | re.U): "was not",
    re.compile(r"\bwe'd've\b", re.I | re.U): "we would have",
    re.compile(r"\bwe're\b", re.I | re.U): "we are",
    re.compile(r"\bwe've\b", re.I | re.U): "we have",
    re.compile(r"\bweren't\b", re.I | re.U): "were not",
    re.compile(r"\bwhatcha\b", re.I | re.U): "what are you",
    re.compile(r"\bwhat'd\b", re.I | re.U): "what did",
    re.compile(r"\bwhat've\b", re.I | re.U): "what have",
    re.compile(r"\bwhen'd\b", re.I | re.U): "when did",
    re.compile(r"\bwhere'd\b", re.I | re.U): "where did",
    re.compile(r"\bwhere're\b", re.I | re.U): "where are",
    re.compile(r"\bwhere've\b", re.I | re.U): "where have",
    re.compile(r"\bwhich're\b", re.I | re.U): "which are",
    re.compile(r"\bwhich've\b", re.I | re.U): "which have",
    re.compile(r"\bwho'd've\b", re.I | re.U): "who would have",
    re.compile(r"\bwho're\b", re.I | re.U): "who are",
    re.compile(r"\bwho've\b", re.I | re.U): "who have",
    re.compile(r"\bwhy'd\b", re.I | re.U): "why did",
    re.compile(r"\bwhy're\b", re.I | re.U): "why are",
    re.compile(r"\bwilln't\b", re.I | re.U): "will not",
    re.compile(r"\bwon't\b", re.I | re.U): "will not",
    re.compile(r"\bwonnot\b", re.I | re.U): "will not",
    re.compile(r"\bwould've\b", re.I | re.U): "would have",
    re.compile(r"\bwouldn't\b", re.I | re.U): "would not",
    re.compile(r"\bwouldn't've\b", re.I | re.U): "would not have",
    re.compile(r"\by'all\b", re.I | re.U): "you all",
    re.compile(r"\by'all'd've\b", re.I | re.U): "you all would have",
    re.compile(r"\by'all'dn't've\b", re.I | re.U): "you all would not have",
    re.compile(r"\by'all're\b", re.I | re.U): "you all are",
    re.compile(r"\by'all'ren't\b", re.I | re.U): "you all are not",
    re.compile(r"\by'at\b", re.I | re.U): "you at",
    re.compile(r"\byes'm\b", re.I | re.U): "yes madam",
    re.compile(r"\byever\b", re.I | re.U): "have you ever",
    re.compile(r"\by'know\b", re.I | re.U): "you know",
    re.compile(r"\byessir\b", re.I | re.U): "yes sir",
    re.compile(r"\byou're\b", re.I | re.U): "you are",
    re.compile(r"\byou've\b", re.I | re.U): "you have",
    re.compile(r"\bhow's\b", re.I | re.U): "is",
    re.compile(r"\bI'd\b", re.I | re.U): "I would",
    re.compile(r"\bI'll\b", re.I | re.U): "I will",
    re.compile(r"\bit'll\b", re.I | re.U): "it will",
    re.compile(r"\bit's\b", re.I | re.U): "it is",
    re.compile(r"\bshe'd\b", re.I | re.U): "she would",
    re.compile(r"\bshe'll\b", re.I | re.U): "she will",
    re.compile(r"\bshe's\b", re.I | re.U): "she is",
    re.compile(r"\bsomebody's\b", re.I | re.U): "somebody is",
    re.compile(r"\bsomeone's\b", re.I | re.U): "someone is",
    re.compile(r"\bsomething's\b", re.I | re.U): "something is",
    re.compile(r"\bso's\b", re.I | re.U): "so is",
    re.compile(r"\bthat'll\b", re.I | re.U): "that will",
    re.compile(r"\bthat's\b", re.I | re.U): "that is",
    re.compile(r"\bthat'd\b", re.I | re.U): "that would",
    re.compile(r"\bthere'd\b", re.I | re.U): "there would",
    re.compile(r"\bthere'll\b", re.I | re.U): "there will",
    re.compile(r"\bthere's\b", re.I | re.U): "there is",
    re.compile(r"\bthey'd\b", re.I | re.U): "they would",
    re.compile(r"\bthey'd've\b", re.I | re.U): "they would have",
    re.compile(r"\bthey'll\b", re.I | re.U): "they will",
    re.compile(r"\bthey're\b", re.I | re.U): "they are",
    re.compile(r"\bthis's\b", re.I | re.U): "this is",
    re.compile(r"\bwe'd\b", re.I | re.U): "we would",
    re.compile(r"\bwe'll\b", re.I | re.U): "we will",
    re.compile(r"\bwhat'll\b", re.I | re.U): "what will",
    re.compile(r"\bwhat're\b", re.I | re.U): "what are",
    re.compile(r"\bwhat's\b", re.I | re.U): "what is",
    re.compile(r"\bwhen's\b", re.I | re.U): "when is",
    re.compile(r"\bwhere'll\b", re.I | re.U): "where will",
    re.compile(r"\bwhere's\b", re.I | re.U): "where is",
    re.compile(r"\bwhich'd\b", re.I | re.U): "which would",
    re.compile(r"\bwhich'll\b", re.I | re.U): "which will",
    re.compile(r"\bwhich's\b", re.I | re.U): "which is",
    re.compile(r"\bwho'd\b", re.I | re.U): "who would",
    re.compile(r"\bwho'll\b", re.I | re.U): "who will",
    re.compile(r"\bwho's\b", re.I | re.U): "who is",
    re.compile(r"\bwhy's\b", re.I | re.U):  "why is'",
    re.compile(r"\by'ain't\b", re.I | re.U): "you are not",
    re.compile(r"\byou'd\b", re.I | re.U): "you would",
    re.compile(r"\byou'll\b", re.I | re.U): "you will"
}

In [58]:
# expand any contractions present in text
def expand_contractions(text):
    for pattern, substitute in contractions.items():
        updated_text = pattern.sub(substitute, text)

    return updated_text


df["review_text"] = df["review_text"].apply(expand_contractions)

df.head()

Unnamed: 0,star_rating,review_text,sentiment_score
0,1,and the shredder was dirty and the bin was par...,1
1,1,one star worked about a month then died,1
2,1,one star the phone did not work no dial tone n...,1
3,1,one star not laminated and no reinforced holes...,1
4,1,one star cartridge was over filled black smear...,1


Preprocessing

In [59]:
from nltk.tokenize import word_tokenize

df["tokenised_review_text"] = df["review_text"].apply(word_tokenize)

df.head()

Unnamed: 0,star_rating,review_text,sentiment_score,tokenised_review_text
0,1,and the shredder was dirty and the bin was par...,1,"[and, the, shredder, was, dirty, and, the, bin..."
1,1,one star worked about a month then died,1,"[one, star, worked, about, a, month, then, died]"
2,1,one star the phone did not work no dial tone n...,1,"[one, star, the, phone, did, not, work, no, di..."
3,1,one star not laminated and no reinforced holes...,1,"[one, star, not, laminated, and, no, reinforce..."
4,1,one star cartridge was over filled black smear...,1,"[one, star, cartridge, was, over, filled, blac..."


Word2Vec

Google W2V

In [60]:
import gensim.downloader as api
from gensim.models import Word2Vec, KeyedVectors

google_w2v: Word2Vec | KeyedVectors = api.load("word2vec-google-news-300")

In [61]:
print(
    google_w2v.most_similar(
        positive=["nice", "happy"],
        negative=["bad"], topn=5
    )
)

[('glad', 0.6133241057395935), ('thrilled', 0.5700709223747253), ('excited', 0.537404477596283), ('delighted', 0.5340354442596436), ('pleased', 0.5313177108764648)]


In [62]:
print(google_w2v.similarity("best", "better"))

0.52264345


Custom W2V

In [63]:
custom_w2v = Word2Vec(
    sentences=df["tokenised_review_text"],
    workers=1,
    vector_size=300,
    window=11
)
custom_w2v = custom_w2v.wv

In [64]:
print(
    custom_w2v.most_similar(
        positive=["nice", "happy"],
        negative=["bad"], topn=5
    )
)

[('pleased', 0.580795407295227), ('satisfied', 0.5139424800872803), ('thrilled', 0.47956201434135437), ('disapointed', 0.45218977332115173), ('lovely', 0.4362533390522003)]


In [65]:
print(custom_w2v.similarity("best", "better"))

0.3370654


In [66]:
def getMeanW2VFeatures(tokens, w2v_model: Word2Vec | KeyedVectors):
    if (len(tokens) == 0):
        v = torch.zeros(300)
        v.to(torch.float32)
        return v

    return w2v_model.get_mean_vector(tokens, ignore_missing=True)


def getConcatW2VFeatures(tokens, w2v_model: Word2Vec | KeyedVectors):
    series = []
    for i in tokens:
        if i in w2v_model:
            series.append(w2v_model[i])

        if len(series) == 10:
            break

    if len(series) != 10:
        remaining_len = 10 - len(series)
        for i in range(remaining_len):
            v = torch.zeros(300)
            v.to(torch.float32)
            series.append(v)

    series = np.array([j for i in series for j in i])

    return series


def getFeaturesDataFrame(series: pd.Series):
    return pd.DataFrame.from_records(series.values)


def filterNeutralReviewsMeanFeatures(df: pd.DataFrame, w2v_model: Word2Vec | KeyedVectors):
    df = df.loc[df["sentiment_score"] != 3]
    X = df["tokenised_review_text"].apply(
        lambda x: getMeanW2VFeatures(x, w2v_model)
    )
    Y = df["sentiment_score"]
    return X, Y


def filterNeutralReviewsConcatFeatures(df: pd.DataFrame, w2v_model: Word2Vec | KeyedVectors):
    df = df.loc[df["sentiment_score"] != 3]
    X = df["tokenised_review_text"].apply(
        lambda x: getConcatW2VFeatures(x, w2v_model)
    )

    Y = df["sentiment_score"]
    return X, Y


def getAllReviewsMeanFeatures(df: pd.DataFrame, w2v_model: Word2Vec | KeyedVectors):
    X = df["tokenised_review_text"].apply(
        lambda x: getMeanW2VFeatures(x, w2v_model)
    )

    Y = df["sentiment_score"]
    return X, Y


def getAllReviewsConcatFeatures(df: pd.DataFrame, w2v_model: Word2Vec | KeyedVectors):
    X = df["tokenised_review_text"].apply(
        lambda x: getConcatW2VFeatures(x, w2v_model)
    )

    Y = df["sentiment_score"]
    return X, Y


def getTrainTestSplit(X, Y):
    return train_test_split(X, Y, test_size=0.2, random_state=10)


def get_accuracy(actual, predicted):
    report = classification_report(actual, predicted, output_dict=True)
    accuracy = report["accuracy"]

    return accuracy

In [67]:
report_columns = [
    "model",
    "classification_type",
    "vector_type",
    "w2v_model_type",
    "accuracy"
]
final_report = pd.DataFrame(columns=report_columns)


def add_report_entry(model, classification_type, vector_type, w2v_model_type, accuracy):
    global final_report
    data = {
        "model": model,
        "classification_type": classification_type,
        "vector_type": vector_type,
        "w2v_model_type": w2v_model_type,
        "accuracy": accuracy
    }

    final_report.loc[len(final_report)] = data

Binary Classification

Perceptron

Perceptron - Pretrained

In [68]:
from sklearn.linear_model import Perceptron

In [69]:
X, Y = filterNeutralReviewsMeanFeatures(df, google_w2v)
X = getFeaturesDataFrame(X)

X_train, X_test, Y_train, Y_test = getTrainTestSplit(X, Y)

classifier = Perceptron(random_state=5)

classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)

accuracy = get_accuracy(Y_test, Y_pred)

add_report_entry("Perceptron", "Binary", "Mean", "Pretrained", accuracy)

print("Perceptron Scores: ")
print("Pretained model: ")
print("Accuracy: ", accuracy)

Perceptron Scores: 
Pretained model: 
Accuracy:  0.7333


Perceptron - Custom

In [70]:
X, Y = filterNeutralReviewsMeanFeatures(df, custom_w2v)
X = getFeaturesDataFrame(X)

X_train, X_test, Y_train, Y_test = getTrainTestSplit(X, Y)

classifier = Perceptron(random_state=5)

classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)

accuracy = get_accuracy(Y_test, Y_pred)

add_report_entry("Perceptron", "Binary", "Mean", "Custom", accuracy)

print("Perceptron Scores: ")
print("Pretained model: ")
print("Accuracy: ", accuracy)

Perceptron Scores: 
Pretained model: 
Accuracy:  0.8861


SVM

In [71]:
from sklearn.svm import LinearSVC

SVM - Pretrained

In [72]:
X, Y = filterNeutralReviewsMeanFeatures(df, google_w2v)
X = getFeaturesDataFrame(X)

X_train, X_test, Y_train, Y_test = getTrainTestSplit(X, Y)

classifier = LinearSVC(random_state=5, dual=True)

classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)

accuracy = get_accuracy(Y_test, Y_pred)

add_report_entry("SVM", "Binary", "Mean", "Pretrained", accuracy)

print("SVM Scores: ")
print("Pretained model: ")
print("Accuracy: ", accuracy)

SVM Scores: 
Pretained model: 
Accuracy:  0.88445


SVM - Custom

In [73]:
X, Y = filterNeutralReviewsMeanFeatures(df, custom_w2v)
X = getFeaturesDataFrame(X)

X_train, X_test, Y_train, Y_test = getTrainTestSplit(X, Y)

classifier = LinearSVC(random_state=5, dual=True)

classifier.fit(X_train, Y_train)

Y_pred = classifier.predict(X_test)

accuracy = get_accuracy(Y_test, Y_pred)

add_report_entry("SVM", "Binary", "Mean", "Custom", accuracy)

print("SVM Scores: ")
print("Custom model: ")
print("Accuracy: ", accuracy)

SVM Scores: 
Custom model: 
Accuracy:  0.910525


DataLoader

In [103]:
import torch
from torch.utils.data import Dataset, DataLoader


class CustomDataset(Dataset):
    def __init__(self, X, Y):
        self.X = X
        self.Y = Y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        return self.X[index], self.Y[index]

In [104]:
def getDatasetAndDataLoader(X, Y):
    X_train, X_test, Y_train, Y_test = getTrainTestSplit(X, Y)

    train_dataset = CustomDataset(X_train.reset_index(drop=True), Y_train.reset_index(drop=True))
    test_dataset = CustomDataset(X_test.reset_index(drop=True), Y_test.reset_index(drop=True))
    
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    return train_dataloader, test_dataloader

FFNN

In [105]:
import torch.nn as nn


class FFNN(nn.Module):
    def __init__(self, input_features, output_features):
        super(FFNN, self).__init__()
        hidden_layer1_features = 50
        hidden_layer2_features = 10
        output_layer_features = output_features * 2

        self.input = nn.Linear(
            in_features=input_features,
            out_features=hidden_layer1_features
        )

        self.hidden1 = nn.Linear(
            in_features=hidden_layer1_features,
            out_features=hidden_layer2_features
        )

        self.hidden2 = nn.Linear(
            in_features=hidden_layer2_features,
            out_features=output_layer_features
        )

        self.output = nn.Linear(
            in_features=output_layer_features,
            out_features=output_features
        )

    def forward(self, x):
        x = self.input(x)
        x = nn.functional.relu(x)
        x = self.hidden1(x)
        x = nn.functional.relu(x)
        x = self.hidden2(x)
        x = nn.functional.relu(x)
        x = self.output(x)

        return x

In [106]:
def train_ffnn_model(dataloader, classes, input_feature_len=300):
    model = FFNN(input_feature_len, classes)

    learning_rate = 0.01
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    no_epochs = 100

    for epoch in range(no_epochs):
        train_loss = 0.0

        model.train()
        
        for data, target in dataloader:
            optimizer.zero_grad()

            output = model(data)

            loss = criterion(output, target - 1)

            loss.backward()

            optimizer.step()

            train_loss += loss.item() * data.size(0)

        train_loss = train_loss/len(dataloader.dataset)

        print(
            'Epoch: {} \tTraining Loss: {:.6f}'.format(
                epoch+1,
                train_loss,
            )
        )

    return model

In [107]:
def predict(model, dataloader):
    y_actual = []
    y_pred = []
    for data, actual in dataloader:
        outputs = model(data) 
        _, predicted = torch.max(outputs.data, 1)
        y_pred.append(predicted.cpu().item() + 1)
        y_actual.append(actual) 

    return np.array(y_actual), np.array(y_pred)

Binary Classification with mean vector (Pretrained)

In [108]:
X, Y = filterNeutralReviewsMeanFeatures(df, google_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 2)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Binary", "Mean", "Pretrained", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 0.692940
Epoch: 2 	Training Loss: 0.691403
Epoch: 3 	Training Loss: 0.686447
Epoch: 4 	Training Loss: 0.629459
Epoch: 5 	Training Loss: 0.446910
Epoch: 6 	Training Loss: 0.358433
Epoch: 7 	Training Loss: 0.331845
Epoch: 8 	Training Loss: 0.320533
Epoch: 9 	Training Loss: 0.313366
Epoch: 10 	Training Loss: 0.307571
Epoch: 11 	Training Loss: 0.302515
Epoch: 12 	Training Loss: 0.298090
Epoch: 13 	Training Loss: 0.293761
Epoch: 14 	Training Loss: 0.289751
Epoch: 15 	Training Loss: 0.285840
Epoch: 16 	Training Loss: 0.281922
Epoch: 17 	Training Loss: 0.278182
Epoch: 18 	Training Loss: 0.274485
Epoch: 19 	Training Loss: 0.270882
Epoch: 20 	Training Loss: 0.267308
Epoch: 21 	Training Loss: 0.264146
Epoch: 22 	Training Loss: 0.261026
Epoch: 23 	Training Loss: 0.258227
Epoch: 24 	Training Loss: 0.255778
Epoch: 25 	Training Loss: 0.253437
Epoch: 26 	Training Loss: 0.251016
Epoch: 27 	Training Loss: 0.249163
Epoch: 28 	Training Loss: 0.247307
Epoch: 29 	Training Loss: 0.2

Ternary Classification with mean vector (Pretrained)

In [109]:
X, Y = getAllReviewsMeanFeatures(df, google_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 3)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Ternary", "Mean", "Pretrained", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 1.056475
Epoch: 2 	Training Loss: 1.054947
Epoch: 3 	Training Loss: 1.054061
Epoch: 4 	Training Loss: 1.050595
Epoch: 5 	Training Loss: 1.015088
Epoch: 6 	Training Loss: 0.837232
Epoch: 7 	Training Loss: 0.746839
Epoch: 8 	Training Loss: 0.726208
Epoch: 9 	Training Loss: 0.715880
Epoch: 10 	Training Loss: 0.707310
Epoch: 11 	Training Loss: 0.699763
Epoch: 12 	Training Loss: 0.691826
Epoch: 13 	Training Loss: 0.685205
Epoch: 14 	Training Loss: 0.678978
Epoch: 15 	Training Loss: 0.672775
Epoch: 16 	Training Loss: 0.665975
Epoch: 17 	Training Loss: 0.658729
Epoch: 18 	Training Loss: 0.650618
Epoch: 19 	Training Loss: 0.640998
Epoch: 20 	Training Loss: 0.628896
Epoch: 21 	Training Loss: 0.616085
Epoch: 22 	Training Loss: 0.604748
Epoch: 23 	Training Loss: 0.595516
Epoch: 24 	Training Loss: 0.587288
Epoch: 25 	Training Loss: 0.580301
Epoch: 26 	Training Loss: 0.574120
Epoch: 27 	Training Loss: 0.568939
Epoch: 28 	Training Loss: 0.564610
Epoch: 29 	Training Loss: 0.5

Binary Classification with Concat Vector (Pretrained)

In [110]:
X, Y = filterNeutralReviewsConcatFeatures(df, google_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 2, 3000)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Binary", "Concat", "Pretrained", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 0.675285
Epoch: 2 	Training Loss: 0.335751
Epoch: 3 	Training Loss: 0.263348
Epoch: 4 	Training Loss: 0.248903
Epoch: 5 	Training Loss: 0.237827
Epoch: 6 	Training Loss: 0.227216
Epoch: 7 	Training Loss: 0.216635
Epoch: 8 	Training Loss: 0.205671
Epoch: 9 	Training Loss: 0.193936
Epoch: 10 	Training Loss: 0.181323
Epoch: 11 	Training Loss: 0.167864
Epoch: 12 	Training Loss: 0.153571
Epoch: 13 	Training Loss: 0.138774
Epoch: 14 	Training Loss: 0.123703
Epoch: 15 	Training Loss: 0.108493
Epoch: 16 	Training Loss: 0.093050
Epoch: 17 	Training Loss: 0.079160
Epoch: 18 	Training Loss: 0.065893
Epoch: 19 	Training Loss: 0.059706
Epoch: 20 	Training Loss: 0.057576
Epoch: 21 	Training Loss: 0.053008
Epoch: 22 	Training Loss: 0.048771
Epoch: 23 	Training Loss: 0.041851
Epoch: 24 	Training Loss: 0.036055
Epoch: 25 	Training Loss: 0.031889
Epoch: 26 	Training Loss: 0.029433
Epoch: 27 	Training Loss: 0.027157
Epoch: 28 	Training Loss: 0.022913
Epoch: 29 	Training Loss: 0.0

Ternary Classification with Concat Vector (Pretrained)

In [111]:
X, Y = getAllReviewsConcatFeatures(df, google_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 3, 3000)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Ternary", "Concat", "Pretrained", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 1.057946
Epoch: 2 	Training Loss: 0.747765
Epoch: 3 	Training Loss: 0.597455
Epoch: 4 	Training Loss: 0.530667
Epoch: 5 	Training Loss: 0.497497
Epoch: 6 	Training Loss: 0.476916
Epoch: 7 	Training Loss: 0.460385
Epoch: 8 	Training Loss: 0.445046
Epoch: 9 	Training Loss: 0.430345
Epoch: 10 	Training Loss: 0.415754
Epoch: 11 	Training Loss: 0.401227
Epoch: 12 	Training Loss: 0.387014
Epoch: 13 	Training Loss: 0.372539
Epoch: 14 	Training Loss: 0.358643
Epoch: 15 	Training Loss: 0.344914
Epoch: 16 	Training Loss: 0.331007
Epoch: 17 	Training Loss: 0.318146
Epoch: 18 	Training Loss: 0.304832
Epoch: 19 	Training Loss: 0.292612
Epoch: 20 	Training Loss: 0.280477
Epoch: 21 	Training Loss: 0.269755
Epoch: 22 	Training Loss: 0.259040
Epoch: 23 	Training Loss: 0.249535
Epoch: 24 	Training Loss: 0.241876
Epoch: 25 	Training Loss: 0.233010
Epoch: 26 	Training Loss: 0.224701
Epoch: 27 	Training Loss: 0.217165
Epoch: 28 	Training Loss: 0.211158
Epoch: 29 	Training Loss: 0.2

Binary Classification Mean Vector (Custom W2V)

In [112]:
X, Y = filterNeutralReviewsMeanFeatures(df, custom_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 2)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Binary", "Mean", "Custom", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 0.693985
Epoch: 2 	Training Loss: 0.692407
Epoch: 3 	Training Loss: 0.690630
Epoch: 4 	Training Loss: 0.676900
Epoch: 5 	Training Loss: 0.509735
Epoch: 6 	Training Loss: 0.327292
Epoch: 7 	Training Loss: 0.274400
Epoch: 8 	Training Loss: 0.258408
Epoch: 9 	Training Loss: 0.250640
Epoch: 10 	Training Loss: 0.246103
Epoch: 11 	Training Loss: 0.242816
Epoch: 12 	Training Loss: 0.239970
Epoch: 13 	Training Loss: 0.237517
Epoch: 14 	Training Loss: 0.235152
Epoch: 15 	Training Loss: 0.233216
Epoch: 16 	Training Loss: 0.231195
Epoch: 17 	Training Loss: 0.229397
Epoch: 18 	Training Loss: 0.227720
Epoch: 19 	Training Loss: 0.226091
Epoch: 20 	Training Loss: 0.224625
Epoch: 21 	Training Loss: 0.223275
Epoch: 22 	Training Loss: 0.222011
Epoch: 23 	Training Loss: 0.220729
Epoch: 24 	Training Loss: 0.219540
Epoch: 25 	Training Loss: 0.218470
Epoch: 26 	Training Loss: 0.217391
Epoch: 27 	Training Loss: 0.216410
Epoch: 28 	Training Loss: 0.215495
Epoch: 29 	Training Loss: 0.2

Ternary Classification Mean Vector (Custom W2V)

In [113]:
X, Y = getAllReviewsMeanFeatures(df, custom_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 3)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Ternary", "Mean", "Custom", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 1.055723
Epoch: 2 	Training Loss: 1.055381
Epoch: 3 	Training Loss: 1.055116
Epoch: 4 	Training Loss: 1.054350
Epoch: 5 	Training Loss: 1.050653
Epoch: 6 	Training Loss: 0.997074
Epoch: 7 	Training Loss: 0.762368
Epoch: 8 	Training Loss: 0.668196
Epoch: 9 	Training Loss: 0.652143
Epoch: 10 	Training Loss: 0.643720
Epoch: 11 	Training Loss: 0.636172
Epoch: 12 	Training Loss: 0.630059
Epoch: 13 	Training Loss: 0.624016
Epoch: 14 	Training Loss: 0.617206
Epoch: 15 	Training Loss: 0.611068
Epoch: 16 	Training Loss: 0.604781
Epoch: 17 	Training Loss: 0.597926
Epoch: 18 	Training Loss: 0.592089
Epoch: 19 	Training Loss: 0.588060
Epoch: 20 	Training Loss: 0.584192
Epoch: 21 	Training Loss: 0.581027
Epoch: 22 	Training Loss: 0.578001
Epoch: 23 	Training Loss: 0.575277
Epoch: 24 	Training Loss: 0.572727
Epoch: 25 	Training Loss: 0.570494
Epoch: 26 	Training Loss: 0.568179
Epoch: 27 	Training Loss: 0.566009
Epoch: 28 	Training Loss: 0.563898
Epoch: 29 	Training Loss: 0.5

Binary Classification with Concat Vector (Custom W2V)

In [114]:
X, Y = filterNeutralReviewsConcatFeatures(df, custom_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 2, 3000)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Binary", "Concat", "Custom", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 0.292561
Epoch: 2 	Training Loss: 0.220199
Epoch: 3 	Training Loss: 0.200544
Epoch: 4 	Training Loss: 0.184193
Epoch: 5 	Training Loss: 0.168443
Epoch: 6 	Training Loss: 0.153185
Epoch: 7 	Training Loss: 0.137913
Epoch: 8 	Training Loss: 0.122785
Epoch: 9 	Training Loss: 0.108976
Epoch: 10 	Training Loss: 0.097310
Epoch: 11 	Training Loss: 0.089066
Epoch: 12 	Training Loss: 0.083358
Epoch: 13 	Training Loss: 0.075546
Epoch: 14 	Training Loss: 0.068061
Epoch: 15 	Training Loss: 0.061479
Epoch: 16 	Training Loss: 0.058741
Epoch: 17 	Training Loss: 0.053324
Epoch: 18 	Training Loss: 0.047577
Epoch: 19 	Training Loss: 0.043699
Epoch: 20 	Training Loss: 0.041699
Epoch: 21 	Training Loss: 0.036995
Epoch: 22 	Training Loss: 0.034438
Epoch: 23 	Training Loss: 0.032354
Epoch: 24 	Training Loss: 0.029490
Epoch: 25 	Training Loss: 0.026532
Epoch: 26 	Training Loss: 0.023851
Epoch: 27 	Training Loss: 0.022361
Epoch: 28 	Training Loss: 0.022567
Epoch: 29 	Training Loss: 0.0

Ternary Classification with Concat Vector (Custom W2V)

In [115]:
X, Y = getAllReviewsConcatFeatures(df, custom_w2v)

train, test = getDatasetAndDataLoader(X, Y)

classifier = train_ffnn_model(train, 3, 3000)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("FFNN", "Ternary", "Concat", "Custom", accuracy)

print("FFNN Scores: ")
print("Google model: ")
print("Average Vector: ")
print("Accuracy: ", accuracy)

Epoch: 1 	Training Loss: 0.574646
Epoch: 2 	Training Loss: 0.475269
Epoch: 3 	Training Loss: 0.451373
Epoch: 4 	Training Loss: 0.432924
Epoch: 5 	Training Loss: 0.416508
Epoch: 6 	Training Loss: 0.400882
Epoch: 7 	Training Loss: 0.385926
Epoch: 8 	Training Loss: 0.371548
Epoch: 9 	Training Loss: 0.357348
Epoch: 10 	Training Loss: 0.344097
Epoch: 11 	Training Loss: 0.330990
Epoch: 12 	Training Loss: 0.318943
Epoch: 13 	Training Loss: 0.307344
Epoch: 14 	Training Loss: 0.297223
Epoch: 15 	Training Loss: 0.287606
Epoch: 16 	Training Loss: 0.278713
Epoch: 17 	Training Loss: 0.270143
Epoch: 18 	Training Loss: 0.263040
Epoch: 19 	Training Loss: 0.256589
Epoch: 20 	Training Loss: 0.250314
Epoch: 21 	Training Loss: 0.242613
Epoch: 22 	Training Loss: 0.236626
Epoch: 23 	Training Loss: 0.232268
Epoch: 24 	Training Loss: 0.226819
Epoch: 25 	Training Loss: 0.222134
Epoch: 26 	Training Loss: 0.215604
Epoch: 27 	Training Loss: 0.210329
Epoch: 28 	Training Loss: 0.205621
Epoch: 29 	Training Loss: 0.2

CNN

In [133]:
class CustomCNN(nn.Module):
    def __init__(self, input_length, embedding_features, output_features):
        super(CustomCNN, self).__init__()
        layer1_features = 50
        layer2_features = 10

        self.output_features = output_features

        self.conv1 = nn.Conv1d(
            in_channels=embedding_features,
            out_channels=layer1_features,
            kernel_size=3,
            padding=1
        )

        self.pool1 = nn.MaxPool1d(kernel_size=2)

        self.conv2 = nn.Conv1d(
            in_channels=layer1_features,
            out_channels=layer2_features,
            kernel_size=3,
            padding=1
        )

        self.pool2 = nn.MaxPool1d(kernel_size=2)

        self.output = nn.Linear(
            in_features=layer2_features * (input_length // 4),
            out_features=output_features
        )

        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = x.permute(0, 2, 1) 

        x = self.conv1(x)
        x = nn.functional.relu(x)
        x = self.pool1(x)

        x = self.conv2(x)
        x = nn.functional.relu(x)
        x = self.pool2(x)

        x = x.view(x.size(0), -1)

        x = self.output(x)

        if self.output_features == 2:
            x = self.sigmoid(x)
        else:
            x = self.softmax(x)

        return x

In [134]:
def train_cnn_model(dataloader, input_length = 50, embedding_features=300, output_features=2):
    model = CustomCNN(input_length, embedding_features, output_features)

    learning_rate = 0.003
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    no_epochs = 100

    for epoch in range(no_epochs):
        train_loss = 0.0

        model.train()

        for data, target in dataloader:
            optimizer.zero_grad()

            output = model(data)

            loss = criterion(output, target - 1)

            loss.backward()

            optimizer.step()

            train_loss += loss.item() * data.size(0)

        train_loss = train_loss/len(dataloader.dataset)

        print(
            'Epoch: {} \tTraining Loss: {:.6f}'.format(
                epoch+1,
                train_loss,
            )
        )

    return model

In [135]:
def getCNNVectorEmbedding(tokens_series, w2v_model):
  word_embeddings = []
  for tokens_list in tokens_series:
    tokens = []
    
    if len(tokens_list) > 50:
      tokens = tokens_list[:50]
    else:
      current_len = len(tokens_list)
      remaining_len = 50 - current_len
      remaining_list = [''] * remaining_len
      tokens = tokens_list + remaining_list

    current_embedding = []
    for token in tokens:
      if token in w2v_model:
        current_embedding.append(w2v_model[token])
      else:
        current_embedding.append(np.zeros(300))
    
    word_embeddings.append(current_embedding)

  return torch.tensor(word_embeddings, dtype=torch.float32)

In [136]:
def getCNNDataLoader(X, Y):
    X_train, X_test, Y_train, Y_test = getTrainTestSplit(X, Y)

    train_dataset = CustomDataset(X_train, Y_train)
    test_dataset = CustomDataset(X_test, Y_test)
    
    train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=False)
    test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=False)

    return train_dataloader, test_dataloader

CNN Binary Classification (Pretrained)

In [137]:
X = df.loc[df["sentiment_score"] != 3, "tokenised_review_text"]
X = getCNNVectorEmbedding(X, google_w2v)
Y = df.loc[df["sentiment_score"] != 3, "sentiment_score"]
Y = torch.tensor(Y, dtype=torch.long)

train, test = getCNNDataLoader(X, Y)

classifier = train_cnn_model(train)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("CNN", "Binary", "50x300", "Pretrained", accuracy)

print(accuracy)

Epoch: 1 	Training Loss: 0.676508
Epoch: 2 	Training Loss: 0.486140
Epoch: 3 	Training Loss: 0.413393
Epoch: 4 	Training Loss: 0.400295
Epoch: 5 	Training Loss: 0.394057
Epoch: 6 	Training Loss: 0.389876
Epoch: 7 	Training Loss: 0.386685
Epoch: 8 	Training Loss: 0.384098
Epoch: 9 	Training Loss: 0.381912
Epoch: 10 	Training Loss: 0.379989
Epoch: 11 	Training Loss: 0.378261
Epoch: 12 	Training Loss: 0.376696
Epoch: 13 	Training Loss: 0.375267
Epoch: 14 	Training Loss: 0.373947
Epoch: 15 	Training Loss: 0.372668
Epoch: 16 	Training Loss: 0.371482
Epoch: 17 	Training Loss: 0.370360
Epoch: 18 	Training Loss: 0.369312
Epoch: 19 	Training Loss: 0.368312
Epoch: 20 	Training Loss: 0.367360
Epoch: 21 	Training Loss: 0.366435
Epoch: 22 	Training Loss: 0.365532
Epoch: 23 	Training Loss: 0.364651
Epoch: 24 	Training Loss: 0.363822
Epoch: 25 	Training Loss: 0.363022
Epoch: 26 	Training Loss: 0.362257
Epoch: 27 	Training Loss: 0.361524
Epoch: 28 	Training Loss: 0.360855
Epoch: 29 	Training Loss: 0.3

CNN Binary Classification (Custom)

In [138]:
X = df.loc[df["sentiment_score"] != 3, "tokenised_review_text"]
X = getCNNVectorEmbedding(X, custom_w2v)
Y = df.loc[df["sentiment_score"] != 3, "sentiment_score"]
Y = torch.tensor(Y, dtype=torch.long)

train, test = getCNNDataLoader(X, Y)

classifier = train_cnn_model(train)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("CNN", "Binary", "50x300", "Custom", accuracy)

print(accuracy)

Epoch: 1 	Training Loss: 0.437417
Epoch: 2 	Training Loss: 0.382830
Epoch: 3 	Training Loss: 0.376285
Epoch: 4 	Training Loss: 0.372148
Epoch: 5 	Training Loss: 0.369024
Epoch: 6 	Training Loss: 0.366459
Epoch: 7 	Training Loss: 0.364244
Epoch: 8 	Training Loss: 0.362228
Epoch: 9 	Training Loss: 0.360408
Epoch: 10 	Training Loss: 0.358835
Epoch: 11 	Training Loss: 0.357646
Epoch: 12 	Training Loss: 0.356532
Epoch: 13 	Training Loss: 0.355769
Epoch: 14 	Training Loss: 0.355216
Epoch: 15 	Training Loss: 0.354662
Epoch: 16 	Training Loss: 0.353713
Epoch: 17 	Training Loss: 0.352966
Epoch: 18 	Training Loss: 0.352665
Epoch: 19 	Training Loss: 0.351545
Epoch: 20 	Training Loss: 0.351643
Epoch: 21 	Training Loss: 0.350668
Epoch: 22 	Training Loss: 0.350253
Epoch: 23 	Training Loss: 0.349516
Epoch: 24 	Training Loss: 0.348925
Epoch: 25 	Training Loss: 0.348492
Epoch: 26 	Training Loss: 0.348157
Epoch: 27 	Training Loss: 0.347545
Epoch: 28 	Training Loss: 0.347262
Epoch: 29 	Training Loss: 0.3

CNN Ternary Classification (Pretrained)

In [139]:
X = getCNNVectorEmbedding(df["tokenised_review_text"], google_w2v)
Y = torch.tensor(df["sentiment_score"], dtype=torch.long)

train, test = getCNNDataLoader(X, Y)

classifier = train_cnn_model(train, output_features=3)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("CNN", "Ternary", "50x300", "Pretrained", accuracy)

print(accuracy)

Epoch: 1 	Training Loss: 0.984988
Epoch: 2 	Training Loss: 0.830215
Epoch: 3 	Training Loss: 0.814236
Epoch: 4 	Training Loss: 0.780836
Epoch: 5 	Training Loss: 0.748311
Epoch: 6 	Training Loss: 0.737056
Epoch: 7 	Training Loss: 0.730275
Epoch: 8 	Training Loss: 0.725284
Epoch: 9 	Training Loss: 0.721329
Epoch: 10 	Training Loss: 0.718084
Epoch: 11 	Training Loss: 0.715435
Epoch: 12 	Training Loss: 0.713132
Epoch: 13 	Training Loss: 0.711093
Epoch: 14 	Training Loss: 0.709252
Epoch: 15 	Training Loss: 0.707556
Epoch: 16 	Training Loss: 0.705957
Epoch: 17 	Training Loss: 0.704509
Epoch: 18 	Training Loss: 0.703131
Epoch: 19 	Training Loss: 0.701749
Epoch: 20 	Training Loss: 0.700552
Epoch: 21 	Training Loss: 0.699552
Epoch: 22 	Training Loss: 0.698885
Epoch: 23 	Training Loss: 0.698664
Epoch: 24 	Training Loss: 0.698677
Epoch: 25 	Training Loss: 0.698108
Epoch: 26 	Training Loss: 0.696847
Epoch: 27 	Training Loss: 0.695214
Epoch: 28 	Training Loss: 0.693951
Epoch: 29 	Training Loss: 0.6

CNN Ternary Classification (Custom)

In [140]:
X = getCNNVectorEmbedding(df["tokenised_review_text"], custom_w2v)
Y = torch.tensor(df["sentiment_score"], dtype=torch.long)

train, test = getCNNDataLoader(X, Y)

classifier = train_cnn_model(train, output_features=3)

actual, prediction = predict(classifier, test)

accuracy = get_accuracy(actual, prediction)

add_report_entry("CNN", "Ternary", "50x300", "Custom", accuracy)

print(accuracy)

Epoch: 1 	Training Loss: 0.790708
Epoch: 2 	Training Loss: 0.723843
Epoch: 3 	Training Loss: 0.714784
Epoch: 4 	Training Loss: 0.709414
Epoch: 5 	Training Loss: 0.705198
Epoch: 6 	Training Loss: 0.701657
Epoch: 7 	Training Loss: 0.698935
Epoch: 8 	Training Loss: 0.696338
Epoch: 9 	Training Loss: 0.694440
Epoch: 10 	Training Loss: 0.692637
Epoch: 11 	Training Loss: 0.691433
Epoch: 12 	Training Loss: 0.689899
Epoch: 13 	Training Loss: 0.689247
Epoch: 14 	Training Loss: 0.687790
Epoch: 15 	Training Loss: 0.686532
Epoch: 16 	Training Loss: 0.685216
Epoch: 17 	Training Loss: 0.683821
Epoch: 18 	Training Loss: 0.682526
Epoch: 19 	Training Loss: 0.681352
Epoch: 20 	Training Loss: 0.680586
Epoch: 21 	Training Loss: 0.679815
Epoch: 22 	Training Loss: 0.678739
Epoch: 23 	Training Loss: 0.679227
Epoch: 24 	Training Loss: 0.678325
Epoch: 25 	Training Loss: 0.677058
Epoch: 26 	Training Loss: 0.675638
Epoch: 27 	Training Loss: 0.674569
Epoch: 28 	Training Loss: 0.674393
Epoch: 29 	Training Loss: 0.6

In [147]:
final_report

Unnamed: 0,model,classification_type,vector_type,w2v_model_type,accuracy
0,Perceptron,Binary,Mean,Pretrained,0.7333
1,Perceptron,Binary,Mean,Custom,0.8861
2,SVM,Binary,Mean,Pretrained,0.88445
3,SVM,Binary,Mean,Custom,0.910525
4,FFNN,Binary,Mean,Pretrained,0.911825
5,FFNN,Ternary,Mean,Pretrained,0.7852
6,FFNN,Binary,Concat,Pretrained,0.88775
7,FFNN,Ternary,Concat,Pretrained,0.7361
8,FFNN,Binary,Mean,Custom,0.922875
9,FFNN,Ternary,Mean,Custom,0.80384
