In [1]:
import gensim
import pandas as pd

### Reading and Exploring the Dataset
The dataset we are using here is a subset of Amazon reviews from the Sports & Outdoors category. The data is stored as a JSON file and can be read using pandas.

Link to the Dataset: http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Sports_and_Outdoors_5.json.gz

In [4]:
df = pd.read_json(r"C:\Users\jonas\OneDrive\Desktop\Work\Coding\NLP\Data\Sports_and_Outdoors_5.json", lines=True)
df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014"
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012"
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012"
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012"
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013"


In [5]:
df.shape

(296337, 9)

### Simple Preprocessing & Tokenization
The first thing to do for any data science task is to clean the data.
For NLP, we apply various processing like converting all the words to lower case, trimming spaces, removing punctuations. 
This is something we will do over here too.

Additionally, we can also remove stop words like 'and', 'or', 'is', 'the', 'a', 'an' and convert words to their root forms like 'running' to 'run'.

In [6]:
review_text = df.reviewText.apply(gensim.utils.simple_preprocess)

In [7]:
review_text

0         [this, came, in, on, time, and, am, veru, happ...
1         [had, factory, glock, tool, that, was, using, ...
2         [if, you, don, have, punch, or, would, like, t...
3         [this, works, no, better, than, any, punch, yo...
4         [purchased, this, thinking, maybe, need, speci...
                                ...                        
296332    [this, is, water, bottle, done, right, it, is,...
296333    [if, you, re, looking, for, an, insulated, wat...
296334    [this, hydracentials, sporty, oz, double, insu...
296335    [as, usual, received, this, item, free, in, ex...
296336    [hydracentials, insulated, oz, water, bottle, ...
Name: reviewText, Length: 296337, dtype: object

### Training the Word2Vec Model

Train the model for reviews. Use a window of size 10 i.e. 10 words before the present word and 10 words ahead. A sentence with at least 2 words should only be considered, configure this using min_count parameter.

Workers define how many CPU threads to be used.

#### Initialize the model

In [54]:
model = gensim.models.Word2Vec(
    window=10,
    min_count=2,
    workers=4,
)


#### Build Vocabulary

In [55]:
model.build_vocab(review_text, progress_per=1000)

#### Train the Word2Vec Model

In [56]:
model.train(review_text, total_examples=model.corpus_count, epochs=model.epochs)

(91343193, 121496535)

### Finding Similar Words and Similarity between words
https://radimrehurek.com/gensim/models/word2vec.html

In [13]:
model.wv["apple"].shape

(100,)

### Classification

Turn all individual words into vector and sum all of them up into a single vector

In [22]:
import numpy as np

def text_to_vector(text):
    # Split the text into tokens
    tokens = text.split()
    
    # Initialize an empty vector
    vector = np.zeros(model.vector_size)
    
    # Iterate over each token and add its corresponding word vector
    for token in tokens:
        if token in model.wv:
            vector += model.wv[token]
    
    return vector

In [23]:
df["vector"] = df["reviewText"].apply(text_to_vector)

Resample the distribution size

In [25]:
df.overall.value_counts()

5    188208
4     64809
3     24071
2     10204
1      9045
Name: overall, dtype: int64

In [28]:
min_samples = 9045


df_1 = df[df.overall==1].sample(min_samples, random_state=2022)
df_2 = df[df.overall==2].sample(min_samples, random_state=2022)
df_3 = df[df.overall==3].sample(min_samples, random_state=2022)
df_4 = df[df.overall==4].sample(min_samples, random_state=2022)
df_5 = df[df.overall==5].sample(min_samples, random_state=2022)

df_balanced = pd.concat([df_1, df_2, df_3, df_4, df_5],axis=0)
df_balanced.overall.value_counts()

1    9045
2    9045
3    9045
4    9045
5    9045
Name: overall, dtype: int64

In [29]:
df_balanced.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,vector
91191,AYY4PYXVRKJ7D,B00152R8Q6,"Marty J. Leake ""Marty Leake""","[0, 5]",The spikes on my bicycle wasn't compatible. I ...,1,It didn't work for my bike,1370044800,"06 1, 2013","[-8.227951467037201, 22.20028881728649, 14.223..."
69259,A2XT3Q4KMMR6R,B000R4HSW2,OmLord,"[0, 4]",I have never before been so utterly disappoint...,1,I hate this item!!!!!!,1380758400,"10 3, 2013","[1.0154871715931222, -28.47037695394829, 18.41..."
263662,A1I601AN1HJEXP,B007XFORCC,Amazon Customer,"[3, 5]","I ordered two of these items, from two differe...",1,Product quit working immediately,1367452800,"05 2, 2013","[-23.494620902463794, -2.9773764219135046, 7.2..."
132166,A3EK5DN2JDH7AK,B001MYGM5A,Zorro,"[0, 0]",I returned mine. I wanted something a bit heav...,1,Not too sturdy,1396396800,"04 2, 2014","[10.416978031396866, 2.1522437827661633, -8.03..."
19746,A1Z7OL22XV3JT8,B0009PUQ8M,D. Evans,"[7, 8]",buy yourself a mallot and a small dust broom/p...,1,Not really worth it,1247097600,"07 9, 2009","[4.26598484069109, 11.463842380791903, 7.08866..."


Train Test Split

In [67]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["vector"].values,
    df_balanced["overall"],
    random_state=2022,
    stratify = df_balanced["overall"],
    test_size=0.2
)


In [68]:
X_train_2d = np.stack(X_train)
X_test_2d = np.stack(X_test)

X_train_2d.shape

(36180, 100)

Building different ML Models

In [69]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report



#doing scaling because Negative values will not pass into Naive Bayes models
scaler = MinMaxScaler()
scaled_train = scaler.fit_transform(X_train_2d)
scaled_test = scaler.transform(X_test_2d)


In [70]:
from  sklearn.neighbors import KNeighborsClassifier


#1. creating a KNN model object
knnCLF = KNeighborsClassifier()


#2. fit with all_train_embeddings and y_train
knnCLF.fit(scaled_train, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = knnCLF.predict(scaled_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.34      0.49      0.40      1809
           2       0.25      0.26      0.25      1809
           3       0.26      0.26      0.26      1809
           4       0.26      0.21      0.23      1809
           5       0.45      0.30      0.36      1809

    accuracy                           0.31      9045
   macro avg       0.31      0.31      0.30      9045
weighted avg       0.31      0.31      0.30      9045



In [71]:
from sklearn.ensemble import RandomForestClassifier


#1. creating a Random Forest model object
clf = RandomForestClassifier()


#2. fit with all_train_embeddings and y_train
clf.fit(scaled_train, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(scaled_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.40      0.52      0.46      1809
           2       0.30      0.25      0.27      1809
           3       0.29      0.28      0.29      1809
           4       0.29      0.23      0.25      1809
           5       0.46      0.51      0.48      1809

    accuracy                           0.36      9045
   macro avg       0.35      0.36      0.35      9045
weighted avg       0.35      0.36      0.35      9045



In [43]:
from sklearn.ensemble import GradientBoostingClassifier


#1. creating a GradientBoosting model object
clf = GradientBoostingClassifier()


#2. fit with all_train_embeddings and y_train
clf.fit(scaled_train, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(scaled_test)


#4. print the classfication report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           1       0.42      0.52      0.47      2261
           2       0.31      0.26      0.29      2262
           3       0.32      0.31      0.31      2261
           4       0.33      0.26      0.29      2261
           5       0.48      0.55      0.51      2262

    accuracy                           0.38     11307
   macro avg       0.37      0.38      0.38     11307
weighted avg       0.37      0.38      0.38     11307



Adding weight to each word using TF-IDF before summing them together

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
tfidf_vectors = vectorizer.fit_transform(df_balanced["reviewText"])

df_balanced["vector_id"] = [np.sum([model.wv[token] * tfidf_vectors[i, vectorizer.vocabulary_[token]]
                       for token in text.split() if token in model.wv], axis=0)
               for i, text in enumerate(df_balanced["reviewText"])]

Train Test Split

In [49]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    df_balanced["vector_id"],
    df_balanced["overall"],
    random_state=2022,
    stratify = df_balanced["overall"]
)

Training the different ML Models

In [51]:
#1. creating a KNN model object
model = KNeighborsClassifier()


#2. fit with all_train_embeddings and y_train
model.fit(X_train_2d, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = model.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.34      0.51      0.41      2261
           2       0.24      0.25      0.25      2262
           3       0.24      0.25      0.24      2261
           4       0.27      0.23      0.25      2261
           5       0.46      0.28      0.35      2262

    accuracy                           0.30     11307
   macro avg       0.31      0.30      0.30     11307
weighted avg       0.31      0.30      0.30     11307



In [52]:
#1. creating a Random Forest model object
model = RandomForestClassifier()


#2. fit with all_train_embeddings and y_train
model.fit(X_train_2d, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = model.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.41      0.50      0.45      2261
           2       0.27      0.24      0.25      2262
           3       0.29      0.29      0.29      2261
           4       0.30      0.24      0.27      2261
           5       0.46      0.51      0.49      2262

    accuracy                           0.36     11307
   macro avg       0.35      0.36      0.35     11307
weighted avg       0.35      0.36      0.35     11307



In [53]:
#1. creating a GradientBoosting model object
model = GradientBoostingClassifier()


#2. fit with all_train_embeddings and y_train
model.fit(X_train_2d, y_train)

#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = model.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           1       0.42      0.52      0.46      2261
           2       0.30      0.26      0.28      2262
           3       0.32      0.30      0.31      2261
           4       0.33      0.27      0.29      2261
           5       0.47      0.55      0.51      2262

    accuracy                           0.38     11307
   macro avg       0.37      0.38      0.37     11307
weighted avg       0.37      0.38      0.37     11307



Using google news api and mean Word2Vec instead of sum

In [62]:
import gensim.downloader as api
wv = api.load('word2vec-google-news-300')

[--------------------------------------------------] 0.4% 6.0/1662.8MB downloaded

KeyboardInterrupt: 

In [58]:
df.head(5)

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,vector
0,AIXZKN4ACSKI,1881509818,David Briner,"[0, 0]",This came in on time and I am veru happy with ...,5,Woks very good,1390694400,"01 26, 2014","[-22.2389500788413, 8.197773933410645, 10.2137..."
1,A1L5P841VIO02V,1881509818,Jason A. Kramer,"[1, 1]",I had a factory Glock tool that I was using fo...,5,Works as well as the factory tool,1328140800,"02 2, 2012","[-14.604479933157563, 4.497607171535492, 6.369..."
2,AB2W04NI4OEAD,1881509818,J. Fernald,"[2, 2]",If you don't have a 3/32 punch or would like t...,4,"It's a punch, that's all.",1330387200,"02 28, 2012","[21.604750588536263, -14.582560919225216, 8.98..."
3,A148SVSWKTJKU6,1881509818,"Jusitn A. Watts ""Maverick9614""","[0, 0]",This works no better than any 3/32 punch you w...,4,It's a punch with a Glock logo.,1328400000,"02 5, 2012","[-3.0750463902950287, -30.545307585038245, 3.6..."
4,AAAWJ6LW9WMOO,1881509818,Material Man,"[0, 0]",I purchased this thinking maybe I need a speci...,4,"Ok,tool does what a regular punch does.",1366675200,"04 23, 2013","[-4.436040550470352, 11.525207764469087, -9.03..."


In [60]:
import spacy
nlp = spacy.load("en_core_web_lg") # if this fails then run "python -m spacy download en_core_web_lg" to download that model

def preprocess_and_vectorize(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)
        
    return model.get_mean_vector(filtered_tokens)

  from .autonotebook import tqdm as notebook_tqdm


In [61]:
df['vector_mean'] = df['reviewText'].apply(lambda text: preprocess_and_vectorize(text))

AttributeError: 'Word2Vec' object has no attribute 'get_mean_vector'

In [None]:
from sklearn.model_selection import train_test_split


#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df.vector_mean.values, 
    df.overall, 
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.overall
)

In [None]:
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)


X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)

print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

#1. creating a GradientBoosting model object
clf = GradientBoostingClassifier()

#2. fit with all_train_embeddings and y_train
clf.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))