### Animal Crossing - user reviews classification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import string
from sklearn.linear_model import LogisticRegression

# This dataset is sourced from: https://www.kaggle.com/datasets/jessemostipak/animal-crossing?select=user_reviews.csv


# Translation table that deletes every character listed in string.punctuation.
# It is built once here instead of on every call, because the function is
# applied to each review separately.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


#Function to remove punctuation
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character deleted.

    Characters are removed, not replaced by a space, so tokens separated only
    by punctuation are joined (e.g. ``"10/10"`` becomes ``"1010"``).
    Only the ASCII punctuation in ``string.punctuation`` is affected; other
    characters, such as typographic quotes, are left unchanged.

    Parameters
    ----------
    text : str
        The text to clean.

    Returns
    -------
    str
        The text without ASCII punctuation characters.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Load the user reviews; the path is relative, so the CSV must be in the working directory
ani_cros_df = pd.read_csv('user_reviews.csv')
# Preview the first rows of the loaded frame
ani_cros_df.head()

Unnamed: 0,grade,user_name,text,date
0,4,mds27272,My gf started playing before me. No option to ...,2020-03-20
1,5,lolo2178,"While the game itself is great, really relaxin...",2020-03-20
2,0,Roachant,My wife and I were looking forward to playing ...,2020-03-20
3,0,Houndf,We need equal values and opportunities for all...,2020-03-20
4,0,ProfessorFox,BEWARE! If you have multiple people in your h...,2020-03-20


In [None]:
ani_cros_df["grade"].value_counts()

Unnamed: 0_level_0,count
grade,Unnamed: 1_level_1
0,1158
10,752
1,255
9,253
2,131
4,105
3,98
8,91
5,78
6,44


In [None]:
# Swap every missing value for an empty string (rebinding the frame instead of mutating it in place)
ani_cros_df = ani_cros_df.replace(np.nan, "")

In [None]:
# Strip punctuation from every review text, one review at a time
ani_cros_df["text"] = ani_cros_df["text"].map(remove_punctuation)

In [None]:
#Dropping all entries with rating = 5, because they have neutral sentiment.
# .copy() makes the filtered frame independent of the original one, so that
# adding a column to it afterwards does not raise SettingWithCopyWarning.
ani_cros_df = ani_cros_df[ani_cros_df["grade"] != 5].copy()
#short test: no review with grade 5 should be left
sum(ani_cros_df["grade"] == 5)

0

In [None]:
# Binary sentiment label: grades of 5 and above become 1, all lower grades become -1
grade_is_positive = ani_cros_df["grade"] >= 5
ani_cros_df["grade_bin"] = np.where(grade_is_positive, 1, -1)

# Short test: number of positive labels, then number of negative labels
n_positive = ani_cros_df["grade_bin"].eq(1).sum()
n_negative = ani_cros_df["grade_bin"].eq(-1).sum()
print(n_positive, n_negative)

1174 1747


In [None]:
np.unique(ani_cros_df["grade_bin"])

array([-1,  1])

In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned review texts, targets are the binary sentiment labels
X, y = ani_cros_df["text"], ani_cros_df["grade_bin"]

# 80% of the reviews go to the training set; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=44,
)

In [None]:
#Transforming reviews into word-count vectors
from sklearn.feature_extraction.text import CountVectorizer

# The vocabulary is learned from the training reviews only
vectorizer = CountVectorizer()
reviews_train = vectorizer.fit_transform(X_train)

In [None]:
vectorizer.get_feature_names_out().shape

(12015,)

In [None]:
reviews_train.shape

(2336, 12015)

In [None]:
# Vectorize the test reviews with the vocabulary fitted on the training reviews
reviews_test = vectorizer.transform(X_test)
# The sparse matrix exposes .shape directly, so there is no need to build a
# dense copy of it just to read its dimensions
reviews_test.shape

(585, 12015)

In [None]:
#Training LogisticRegression model on training data
# max_iter=300 sets the limit on the number of solver iterations
model = LogisticRegression(max_iter=300)
model.fit(reviews_train, y_train)

In [None]:
len(model.coef_[0])

12015

In [None]:
len(vectorizer.get_feature_names_out())

12015

In [None]:
# Pair every coefficient with its word and order the pairs from the most
# negative coefficient to the most positive one
occurences = sorted(
    zip(model.coef_[0], vectorizer.get_feature_names_out()),
    key=lambda pair: pair[0],
)

### 10 words associated with negative rating

In [None]:
# The list is sorted in ascending order, so the first 10 entries hold the lowest coefficients
for weight, word in occurences[:10]:
    print(f'{ word }: { weight }')

no: -1.2335411328798491
unacceptable: -1.2082238473566747
nintendo: -1.188986158980012
money: -1.1730695067842971
boring: -1.157847435525759
second: -1.1283950084628975
bought: -0.9940427333512383
after: -0.9789475496246314
when: -0.923216572507646
greedy: -0.895171084418852


### 10 words associated with positive rating

In [None]:
# The list is sorted in ascending order, so the last 10 entries hold the highest coefficients
for weight, word in occurences[-10:]:
    print(f'{ word }: { weight }')

see: 0.926638738396134
rate: 0.941027795023186
love: 1.1650576670102757
amazing: 1.2379069438686616
fantastic: 1.3197464734451974
great: 1.3943782222876855
1010: 1.3965416481657664
relaxing: 1.4633170681352117
best: 1.5077657049848678
perfect: 1.6034595445236615


In [None]:
#Predicting the sentiment of test data reviews
model.predict(reviews_test)

array([ 1,  1,  1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1, -1, -1,
        1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1, -1, -1,  1, -1,  1, -1,
       -1,  1, -1,  1,  1, -1,  1, -1, -1, -1, -1, -1, -1,  1,  1, -1,  1,
       -1,  1, -1,  1, -1, -1, -1,  1,  1,  1, -1, -1,  1, -1,  1, -1, -1,
       -1, -1, -1,  1,  1, -1, -1, -1,  1,  1,  1,  1, -1,  1, -1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1,  1, -1, -1,  1, -1,  1,
       -1, -1,  1,  1, -1,  1, -1, -1, -1,  1, -1, -1,  1, -1, -1,  1,  1,
       -1,  1, -1,  1,  1, -1,  1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1, -1, -1, -1,  1, -1,
       -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1,  1, -1,  1, -1,  1,  1,
       -1, -1, -1,  1,  1,  1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1,  1,
       -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1,  1, -1,  1,  1,  1, -1,
       -1, -1, -1, -1,  1

In [None]:
#Predicting the sentiment of test data reviews in terms of probability
predicted = model.predict_proba(reviews_test)

In [None]:
predicted

array([[0.00313755, 0.99686245],
       [0.01139451, 0.98860549],
       [0.00346978, 0.99653022],
       ...,
       [0.99509788, 0.00490212],
       [0.03446724, 0.96553276],
       [0.52600876, 0.47399124]])

## Most negative reviews

In [None]:
# finding 5 rows with the highest probability of having a negative review
# (sorted in ascending order, so the most negative review comes last)
negative_idx = sorted(zip(predicted.T[0], X_test.index), key=lambda pair: pair[0])[-5:]
# getting index labels of those rows; a generator expression is used instead of
# unpacking into `_`, which would overwrite IPython's last-output variable
n_indices = tuple(label for _prob, label in negative_idx)

In [None]:
n_indices

(2652, 1974, 289, 1341, 2358)

In [None]:
X_test[X_test.index.isin(n_indices)]

Unnamed: 0,text
289,Terrible value for 360 Why 360 and not 60 Beca...
1341,I have 2 kids Both want to play this game so b...
1974,I just want to preface this with I am fine wit...
2652,The game only allows ONE ISLAND per switch con...
2358,DO NOT BUY THIS GAME IF YOU PLAN ON USING IT F...


### Worst review


In [None]:
# Look up the review by its index label; the last label has the highest negative probability
X_test.loc[n_indices[-1]]



## Most positive reviews

In [None]:
# finding 5 rows with the highest probability of having a positive review
# (sorted in ascending order, so the most positive review comes last)
positive_idx = sorted(zip(predicted.T[1], X_test.index), key=lambda pair: pair[0])[-5:]
# getting index labels of those rows; a generator expression is used instead of
# unpacking into `_`, which would overwrite IPython's last-output variable
p_indices = tuple(label for _prob, label in positive_idx)

In [None]:
p_indices

(242, 2273, 42, 189, 938)

In [None]:
X_test[X_test.index.isin(p_indices)]

Unnamed: 0,text
189,I mean I kind of understand why most of the ne...
938,This is the best game stop review bombing it T...
242,The user score is almost ridiculous Sure I und...
2273,Animal Crossing New Horizons is absolutely gor...
42,There are no words to describe the flawlessly ...


### Best review


In [None]:
# Look up the review by its index label; the last label has the highest positive probability
X_test.loc[p_indices[-1]]

'This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stopThis is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it  This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop review bombing it This is the best game stop 

## Accuracy

In [None]:
model.score(reviews_test, y_test)

0.9042735042735043

## Exercise 5
In this exercise we will limit the dictionary of CountVectorizer to the set of significant words, defined below.


a) Redo exercises 2-5 using the limited dictionary.   
b) Check the impact of every word in the dictionary.   
c) Compare the prediction accuracy and the evaluation time of both models.

In [None]:
# Redoing classification with limited dictionary:
# the 20 words below are the only features the reduced model is allowed to use
significant_words = [
    'love', 'great', 'easy', 'old', 'little',
    'perfect', 'loves', 'well', 'able', 'car',
    'broke', 'less', 'even', 'waste', 'disappointed',
    'work', 'product', 'money', 'would', 'return',
]

In [None]:
# Features are the cleaned review texts, targets are the binary sentiment labels
X, y = ani_cros_df["text"], ani_cros_df["grade_bin"]

# splitting data: 80% for training, with a fixed seed so the split is repeatable
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=44,
)

# with a fixed vocabulary the vectorizer only counts the words contained in significant_words
vectorizer_light = CountVectorizer(vocabulary=significant_words)
reviews_train_l = vectorizer_light.fit_transform(X_train_l)
vectorizer_light.get_feature_names_out()

array(['love', 'great', 'easy', 'old', 'little', 'perfect', 'loves',
       'well', 'able', 'car', 'broke', 'less', 'even', 'waste',
       'disappointed', 'work', 'product', 'money', 'would', 'return'],
      dtype=object)

In [None]:
reviews_test_l = vectorizer_light.transform(X_test_l)

In [None]:
# .shape is read from the sparse matrix directly, without building a dense copy
reviews_test_l.shape # we are working with 20 words, so this array has 20 columns

(585, 20)

In [None]:
#Training logistic regression on the counts of the 20 significant words
# max_iter=300 sets the limit on the number of solver iterations
model_light = LogisticRegression(max_iter=300)
model_light.fit(reviews_train_l, y_train_l)

In [None]:
# Pair each coefficient of the reduced model with its word and order the pairs
# from the lowest coefficient to the highest one
occurences = sorted(
    zip(model_light.coef_[0], vectorizer_light.get_feature_names_out()),
    key=lambda pair: pair[0],
)

In [None]:
# Print all 20 words with their coefficients, lowest coefficient first
for weight, word in occurences:
    print(f'{ word }: { weight }')

money: -1.4858414975309233
disappointed: -1.1768496752164448
waste: -1.0502978951568702
product: -0.6229546209846784
car: -0.527459164670572
return: -0.5270878059739598
loves: -0.4539100226058944
less: -0.44483683779708905
even: -0.3287510010288856
able: -0.2753043861636732
would: -0.2553909349372798
old: -0.13779444431635426
well: 0.20510290366465392
work: 0.2169415711713223
broke: 0.2959783215902726
love: 0.4844431580625215
little: 0.5512220352168566
easy: 0.5931776274219871
great: 0.6190669962848168
perfect: 1.380908679474875


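<ul>
<li>The 5 words with the most negative coefficients (top of the list) are the ones most strongly associated with negative reviews</li>
<li>The 5 words with the most positive coefficients (bottom of the list) are the ones most strongly associated with positive reviews</li>
</ul>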


In [None]:
#Predicting the sentiment of test data reviews
model_light.predict(reviews_test_l)

array([-1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1,  1, -1,  1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1,  1,
       -1,  1, -1, -1,  1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1,  1, -1, -1, -1,
       -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
        1,  1, -1, -1, -1, -1,  1,  1,  1, -1, -1,  1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1,  1, -1, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1,  1, -1, -1, -1, -1, -1, -1,  1, -1, -1, -1,
       -1, -1, -1, -1, -1, -1, -1, -1, -1,  1,  1, -1, -1, -1, -1, -1, -1,
       -1, -1, -1, -1,  1,  1,  1, -1, -1, -1, -1, -1,  1,  1,  1, -1, -1,
       -1,  1, -1,  1, -1

In [None]:
#Predicting the sentiment of test data reviews in terms of probability
predicted = model_light.predict_proba(reviews_test_l)
predicted

array([[0.68609899, 0.31390101],
       [0.59117996, 0.40882004],
       [0.43776896, 0.56223104],
       ...,
       [0.86467646, 0.13532354],
       [0.59117996, 0.40882004],
       [0.4545318 , 0.5454682 ]])

### Best reviews

In [None]:
#Finding indices of most positive reviews: the 5 highest positive-class
#probabilities, sorted in ascending order so the most positive review comes last
positive_idx = sorted(zip(predicted.T[1], X_test_l.index), key=lambda pair: pair[0])[-5:]
# A generator expression is used instead of unpacking into `_`, which would
# overwrite IPython's last-output variable
p_indices = tuple(label for _prob, label in positive_idx)

In [None]:
p_indices

(1532, 189, 2918, 2123, 2296)

In [None]:
X_test_l[X_test_l.index.isin(p_indices)]

Unnamed: 0,text
189,I mean I kind of understand why most of the ne...
2123,I am the only person playing on my Switch so t...
2296,I created an account on metacritic just to pos...
1532,it’s a great way to pass the time of being at ...
2918,Countering all the undeserved and blind hate ...


In [None]:
# The best review: looked up by index label; the last label has the highest positive probability
X_test_l.loc[p_indices[-1]]

'I created an account on metacritic just to post this review  The limit of one island per console is utterly ridiculous  Was there no thought put into this by Nintendo  My 9 year old daughter was the first to play this game and as one can expect the island is littered with items with no thought to building a perfect little world  This is great for her she enjoys herself but theI created an account on metacritic just to post this review  The limit of one island per console is utterly ridiculous  Was there no thought put into this by Nintendo  My 9 year old daughter was the first to play this game and as one can expect the island is littered with items with no thought to building a perfect little world  This is great for her she enjoys herself but the rest of the members in my household have to suffer  We cannot contribute anything to her island and we are unable to create islands ourselves to build OUR visions of the perfect utopia  This needs to be fixed immediately as only one person 

### Worst reviews

In [None]:
#Getting indices of worst reviews: the 5 highest negative-class probabilities,
#sorted in ascending order so the most negative review comes last
negative_idx = sorted(zip(predicted.T[0], X_test_l.index), key=lambda pair: pair[0])[-5:]
# A generator expression is used instead of unpacking into `_`, which would
# overwrite IPython's last-output variable
n_indices = tuple(label for _prob, label in negative_idx)

In [None]:
n_indices

(385, 1817, 1752, 1883, 595)

In [None]:
X_test_l[X_test_l.index.isin(n_indices)]

Unnamed: 0,text
1883,I don’t expect there to be different islands p...
595,As it is right now I have to give it a 0 since...
1817,The good is as most people say the graphics an...
1752,I bought this game exclusively for the local m...
385,Ive been excited for this game since the first...


In [None]:
# Worst review: looked up by index label; the last label has the highest negative probability
X_test_l.loc[n_indices[-1]]

'As it is right now I have to give it a 0 since you can only have 1 island per switch and my girlfriend is the primary player Since I can not create my own island without paying 360 for a new switch and new game it was a waste of money for me Theres absolutely no reason why in this day and age you should only be able to enjoy a game like this on a per console basis If they changeAs it is right now I have to give it a 0 since you can only have 1 island per switch and my girlfriend is the primary player Since I can not create my own island without paying 360 for a new switch and new game it was a waste of money for me Theres absolutely no reason why in this day and age you should only be able to enjoy a game like this on a per console basis If they change that so I can also make a new game then I will change my score but as it stands this seems borderline criminal anticonsumer and it appears to be a cash grab by Nintendo I can only imagine the problems this causes in households with chil

## Comparison of two models


In [None]:
#Simplified model
model_light.score(reviews_test_l, y_test_l)

0.694017094017094

In [None]:
#Original model (full vocabulary)
model.score(reviews_test, y_test)

0.9042735042735043

In [None]:
import sys, time

In [None]:
%%time
%%timeit
#Simplified model
model_light.predict(reviews_test_l)

163 µs ± 83.2 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
CPU times: user 9.95 s, sys: 12.7 ms, total: 9.96 s
Wall time: 12.7 s


In [None]:
%%time
%%timeit
#Original model
model.predict(reviews_test)
#After simplifying our model, the accuracy dropped from about 0.90 to about 0.69 (a relative decrease of roughly 23%), and prediction runs about 3.5 times faster per loop

572 µs ± 68.2 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
CPU times: user 3.54 s, sys: 77.4 ms, total: 3.61 s
Wall time: 4.72 s
