<a href="https://colab.research.google.com/github/karolinakuligowska/TMSMM_codes/blob/main/KK_TMSMM_class_4_text_categorization_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# upload the dataset - Amazon reviews
import pandas as pd
import string
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import math
import warnings



In [2]:
# first look at the data: read the reviews file and print its plain-text representation
raw_data = pd.read_csv(filepath_or_buffer="class4_amazon_reviews.csv")
print(raw_data)

                                 name  ...   standev
0    Philips Avent 3 Pack 9oz Bottles  ...  1.620651
1    Philips Avent 3 Pack 9oz Bottles  ...  1.620651
2    Philips Avent 3 Pack 9oz Bottles  ...  1.620651
3    Philips Avent 3 Pack 9oz Bottles  ...  1.620651
4    Philips Avent 3 Pack 9oz Bottles  ...  1.620651
..                                ...  ...       ...
186  Philips Avent 3 Pack 9oz Bottles  ...  1.620651
187  Philips Avent 3 Pack 9oz Bottles  ...  1.620651
188  Philips Avent 3 Pack 9oz Bottles  ...  1.620651
189  Philips Avent 3 Pack 9oz Bottles  ...  1.620651
190  Philips Avent 3 Pack 9oz Bottles  ...  1.620651

[191 rows x 5 columns]


In [3]:
# more elegant way
# silence deprecation and user warnings, then fix NumPy's global random seed
for ignored_category in (DeprecationWarning, UserWarning):
    warnings.filterwarnings("ignore", category=ignored_category)
np.random.seed(7)


In [9]:
# read the reviews again and preview the first ten rows as a rendered table
csv = "class4_amazon_reviews.csv"
df = pd.read_csv(filepath_or_buffer=csv)
df.head(n=10)

Unnamed: 0,name,review,rating,count,standev
0,Philips Avent 3 Pack 9oz Bottles,I was recommended to use these bottles by a gi...,5,191,1.620651
1,Philips Avent 3 Pack 9oz Bottles,If I had not been given a ton of Avent bottles...,2,191,1.620651
2,Philips Avent 3 Pack 9oz Bottles,Leaks! Especially difficult to get a tight sea...,1,191,1.620651
3,Philips Avent 3 Pack 9oz Bottles,I have been using the Avent bottle system for ...,5,191,1.620651
4,Philips Avent 3 Pack 9oz Bottles,I used Avent bottles with my son when he was t...,5,191,1.620651
5,Philips Avent 3 Pack 9oz Bottles,These bottles are simply the best out there. I...,5,191,1.620651
6,Philips Avent 3 Pack 9oz Bottles,I purchased these bottles for my older baby wh...,4,191,1.620651
7,Philips Avent 3 Pack 9oz Bottles,After reading the reviews of these bottles lea...,5,191,1.620651
8,Philips Avent 3 Pack 9oz Bottles,I really wanted to like these bottles because ...,1,191,1.620651
9,Philips Avent 3 Pack 9oz Bottles,"In the midst of all of my bottle testing, I wa...",5,191,1.620651


In [10]:
# more about ratings
data = df.copy()
data.describe()

Unnamed: 0,rating,count,standev
count,191.0,191.0,191.0
mean,3.204188,191.0,1.620651
std,1.620651,0.0,2.226282e-16
min,1.0,191.0,1.620651
25%,2.0,191.0,1.620651
50%,4.0,191.0,1.620651
75%,5.0,191.0,1.620651
max,5.0,191.0,1.620651


In [11]:
# column dtypes and non-null counts of the working copy
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 191 entries, 0 to 190
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     191 non-null    object 
 1   review   191 non-null    object 
 2   rating   191 non-null    int64  
 3   count    191 non-null    int64  
 4   standev  191 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 7.6+ KB


In [12]:
# the ratings for the Avent bottles are concentrated at the extremes of the scale
# it may be that people only write reviews when they are very excited about or very frustrated with a product
# we want this to be a simple classification exercise (below)

In [13]:
# drop reviews that have no rating, then store the rating as an integer
from sklearn.model_selection import StratifiedShuffleSplit
print("Before {}".format(len(data)))
# dropna and the cast are chained so the cast produces a fresh frame; assigning a
# column into the result of dropna() can raise SettingWithCopyWarning when rows
# were actually removed
dataAfter = data.dropna(subset=["rating"]).astype({"rating": int})

print("After {}".format(len(dataAfter)))

Before 191
After 191


In [14]:
# split into train and test subsets, stratified on the rating
# NOTE: five splits are generated but each iteration overwrites the previous one,
# so only the last split is kept in strat_train / strat_test
split = StratifiedShuffleSplit(n_splits=5, test_size=0.25)
for train_index, test_index in split.split(dataAfter,
                                           dataAfter["rating"]):
    # split() yields *positional* indices, so select rows with .iloc; reindex()
    # looks rows up by label and would return wrong or all-NaN rows as soon as the
    # index has gaps (e.g. after dropna removed a row).
    # .copy() makes the subsets independent frames so that adding columns to them
    # later does not trigger SettingWithCopyWarning.
    strat_train = dataAfter.iloc[train_index].copy()
    strat_test = dataAfter.iloc[test_index].copy()

In [15]:
# inspect the subsets

In [16]:
# number of reviews in the training subset
strat_train.shape[0]

143

In [17]:
# share of each rating value in the training subset
strat_train["rating"].value_counts().div(len(strat_train))

5    0.342657
1    0.237762
2    0.174825
4    0.153846
3    0.090909
Name: rating, dtype: float64

In [18]:
# number of reviews in the test subset
strat_test.shape[0]

48

In [19]:
# share of each rating value in the test subset
strat_test["rating"].value_counts().div(len(strat_test))

5    0.354167
1    0.229167
4    0.166667
2    0.166667
3    0.083333
Name: rating, dtype: float64

In [20]:
# independent copy of the training subset for exploration; show its first two rows
reviews = strat_train.copy(deep=True)
reviews.head(n=2)

Unnamed: 0,name,review,rating,count,standev
157,Philips Avent 3 Pack 9oz Bottles,I had asked for these bottles when I was pregn...,5,191,1.620651
68,Philips Avent 3 Pack 9oz Bottles,i love this brand. they work great. they wil...,5,191,1.620651


In [21]:
# column dtypes and non-null counts of the training copy
reviews.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 143 entries, 157 to 36
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   name     143 non-null    object 
 1   review   143 non-null    object 
 2   rating   143 non-null    int64  
 3   count    143 non-null    int64  
 4   standev  143 non-null    float64
dtypes: float64(1), int64(2), object(2)
memory usage: 6.7+ KB


In [22]:
# NOTE(review): this summarises the full dataset `data` again, while the cells just
# above inspect the training copy `reviews` — confirm whether `reviews.describe()`
# was intended here.
data.describe()

Unnamed: 0,rating,count,standev
count,191.0,191.0,191.0
mean,3.204188,191.0,1.620651
std,1.620651,0.0,2.226282e-16
min,1.0,191.0,1.620651
25%,2.0,191.0,1.620651
50%,4.0,191.0,1.620651
75%,5.0,191.0,1.620651
max,5.0,191.0,1.620651


In [23]:
# assign a 'Positive' label to great reviews (rating 4 or 5) and a 'Negative' label to bad reviews (rating 1 or 2)
# all reviews with a rating of 3 are considered 'Neutral'

In [24]:
def sentiments(rating):
    """Translate a star rating into a coarse sentiment label.

    Parameters
    ----------
    rating : number
        Star rating of a review; only the values 1 to 5 receive a label.

    Returns
    -------
    str or None
        "Positive" for 4 or 5, "Neutral" for 3, "Negative" for 1 or 2,
        and None for any other value.
    """
    if rating in (5, 4):
        return "Positive"
    if rating == 3:
        return "Neutral"
    if rating in (2, 1):
        return "Negative"
    # anything outside 1..5 gets no label
    return None
    
# label every review in both subsets with its sentiment, then preview the first 20 training labels
strat_train["Sentiment"] = strat_train["rating"].map(sentiments)
strat_test["Sentiment"] = strat_test["rating"].map(sentiments)
strat_train["Sentiment"].head(20)

157    Positive
68     Positive
179    Positive
54     Negative
82     Positive
119    Positive
7      Positive
28     Positive
117    Positive
103    Positive
96      Neutral
152    Positive
20     Negative
187    Positive
99     Positive
10     Negative
70      Neutral
107    Positive
165    Positive
181    Positive
Name: Sentiment, dtype: object

In [25]:
# model inputs are the review texts; the targets are the star ratings
X_train, X_train_targetRating = strat_train["review"], strat_train["rating"]
X_test, X_test_targetRating = strat_test["review"], strat_test["rating"]
print(len(X_train), len(X_test))

143 48


In [26]:
# replace missing review texts with a blank string (values are filled, no rows are dropped)
# so that every document passed to the vectorizer is a string
X_train = X_train.fillna(' ')
X_test = X_test.fillna(' ')
# The rating targets are deliberately NOT filled: putting ' ' into a numeric label
# column would create a bogus extra class of a different type. Rows without a
# rating are expected to have been removed earlier, so just verify that.
assert X_train_targetRating.notna().all(), "training targets contain missing ratings"
assert X_test_targetRating.notna().all(), "test targets contain missing ratings"

In [27]:
# bag-of-words: learn the vocabulary of the training reviews and count word occurrences
# the shape of the result is (number of reviews, number of distinct words)
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(raw_documents=X_train)
X_train_counts.shape

(143, 1939)

In [28]:
# term frequencies (Tf) - to divide the number of occurrences for each word by total number of words
# term frequencies times inverse document frequency (Tfidf) - to downscale the weights of each word (assign less value to unimportant stop words)

In [29]:
from sklearn.feature_extraction.text import TfidfTransformer
# use_idf=False turns off the inverse-document-frequency reweighting, so despite the
# variable name this matrix holds term frequencies only
tfidf_transformer = TfidfTransformer(use_idf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X=X_train_counts)
X_train_tfidf.shape

(143, 1939)

In [30]:
# try with SVM
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [31]:
from sklearn.svm import LinearSVC

# word counts -> TF-IDF weights -> linear SVM, chained in one pipeline so the test
# reviews go through the transformations fitted on the training reviews
svc_steps = [
    ("vect", CountVectorizer()),
    ("tfidf", TfidfTransformer()),
    ("clf_linearSVC", LinearSVC()),
]
clf_linearSVC_pipe = Pipeline(steps=svc_steps)
# the target is the star rating itself (not the Sentiment label)
clf_linearSVC_pipe.fit(X_train, X_train_targetRating)

predictedLinearSVC = clf_linearSVC_pipe.predict(X_test)
# accuracy: share of test reviews whose predicted rating equals the true rating
np.mean(predictedLinearSVC == X_test_targetRating)

0.5