In [1]:
import numpy as np
import pandas as pd
import altair as alt
import function_deck
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from spacy.lang.en.stop_words import STOP_WORDS

In [2]:
reviews = function_deck.get_reviews('1448440')

In [3]:
reviews.shape


(2000, 26)

In [4]:
review_text = reviews[['review', 'review_type']]

In [5]:
review_text.head()

Unnamed: 0,review,review_type
0,"Was, is & probably still will be my biggest di...",Negative
1,Not as much replay value as Nioh at the moment...,Positive
2,smooth experience but way too easy in comparis...,Positive
3,Works on my machine ¯\_(ツ)_/¯,Positive
4,After finishing this game I was left feeling u...,Negative


In [6]:
list(STOP_WORDS)

['go',
 'such',
 'my',
 'whom',
 'nobody',
 'take',
 'ever',
 'therein',
 'she',
 'anyhow',
 'none',
 'so',
 'moreover',
 'must',
 'made',
 'never',
 'few',
 'since',
 'as',
 "'m",
 'became',
 'due',
 'something',
 'us',
 'am',
 'regarding',
 'several',
 'fifty',
 'here',
 'again',
 'say',
 'once',
 'just',
 'therefore',
 'have',
 'ourselves',
 'thence',
 'although',
 'why',
 'cannot',
 'third',
 'while',
 'front',
 'namely',
 'thru',
 'everything',
 'get',
 'towards',
 'off',
 'anyway',
 'serious',
 'toward',
 'within',
 '’ve',
 'until',
 'onto',
 'five',
 'however',
 'during',
 'further',
 '’re',
 'without',
 'them',
 'has',
 'otherwise',
 'indeed',
 'after',
 'part',
 'no',
 "'d",
 'somewhere',
 'thereupon',
 'elsewhere',
 'other',
 'from',
 'a',
 'and',
 'ten',
 'really',
 'he',
 'her',
 'sixty',
 'whereas',
 'not',
 'all',
 'else',
 'nine',
 'enough',
 'everyone',
 'mine',
 'n‘t',
 'at',
 'thereafter',
 'neither',
 're',
 'someone',
 'to',
 'beside',
 'than',
 'whereafter',
 'nowh

In [7]:
cv = CountVectorizer(min_df = 0.005, ngram_range = (2,3))
counts = cv.fit_transform(review_text['review'])

In [8]:

Stop_Words = STOP_WORDS.union({'ve', 'good', 'best', 'game', 'games', 'll', 'ass', 'feels' })
tfidf = TfidfVectorizer(min_df = 0.003, ngram_range = (2,3), stop_words = list(Stop_Words))
counts = tfidf.fit_transform(review_text['review'])

In [9]:
print(counts)

  (0, 663)	1.0
  (2, 651)	0.47114319779649483
  (2, 462)	0.5126611760708144
  (2, 105)	0.5023754813577905
  (2, 709)	0.5126611760708144
  (4, 75)	0.43282030556543133
  (4, 179)	0.4842901576561606
  (4, 395)	0.43761263283771684
  (4, 66)	0.4427357192331335
  (4, 103)	0.35603484735624646
  (4, 663)	0.2526837554169115
  (5, 618)	1.0
  (6, 728)	0.14144713519856358
  (6, 374)	0.14144713519856358
  (6, 473)	0.11913558740630696
  (6, 366)	0.11439318591418912
  (6, 725)	0.11439318591418912
  (6, 736)	0.1152607632498724
  (6, 370)	0.14144713519856358
  (6, 598)	0.1338668319526307
  (6, 236)	0.14144713519856358
  (6, 373)	0.14144713519856358
  (6, 384)	0.12829088466685407
  (6, 201)	0.09076321897547794
  (6, 570)	0.13610602669396743
  :	:
  (1994, 389)	0.2550691476721879
  (1994, 388)	0.2550691476721879
  (1994, 11)	0.242614653640606
  (1994, 659)	0.23318298082986424
  (1994, 642)	0.242614653640606
  (1994, 640)	0.242614653640606
  (1994, 514)	0.14883471186215788
  (1994, 82)	0.16806573701136757

In [10]:
nb = MultinomialNB()
nb.fit(counts, reviews['review_type'])

MultinomialNB()

In [11]:
words = tfidf.get_feature_names_out()
words = pd.Series(words).to_frame(name = 'word')
log_probs = pd.DataFrame(nb.feature_log_prob_)
polarity_score_table = (log_probs.iloc[1] - log_probs.iloc[0]).to_frame(name = 'polarity_score') 
polarity_score_table = pd.concat([words, polarity_score_table], axis = 1)
polarity_score_table = polarity_score_table.sort_values('polarity_score')

In [15]:
polarity_score_table.head(30)

Unnamed: 0,word,polarity_score
70,cant play,-2.403719
706,waste money,-1.986165
533,play nioh instead,-1.947908
707,waste time,-1.891165
704,wanted like,-1.883552
75,change review,-1.864851
668,terrible performance,-1.852693
166,don waste,-1.845874
167,don waste money,-1.712924
454,nioh instead,-1.710421


In [114]:
top_30_positive= polarity_score_table.tail(30)


                        word  polarity_score
64                 cant play       -2.406166
697              waste money       -1.988612
525        play nioh instead       -1.950355
698               waste time       -1.892194
695              wanted like       -1.885999
659     terrible performance       -1.862784
69             change review       -1.858507
160                don waste       -1.848321
161          don waste money       -1.715371
446             nioh instead       -1.712229
401             maybe better       -1.546411
524                play nioh       -1.525650
355          loading screens       -1.477354
468              online mode       -1.412662
130           despite having       -1.396199
379                love team       -1.393821
380          love team ninja       -1.393821
238              gets boring       -1.389749
418              morale rank       -1.373402
568  recommend current state       -1.364655
146               doesn work       -1.345382
505       

In [115]:
top_30_negative = polarity_score_table.head(30)

In [20]:

def get_polarities(reviews):
    review_text = reviews[['review', 'review_type']]
    Stop_Words = STOP_WORDS.union({'ve', 'good', 'best', 'game', 'games', 'll', 'ass', 'feels' })
    tfidf = TfidfVectorizer(min_df = 0.003, ngram_range = (2,3), stop_words = list(Stop_Words))
    counts = tfidf.fit_transform(review_text['review'])
    nb = MultinomialNB()
    nb.fit(counts, reviews['review_type'])
    words = tfidf.get_feature_names_out()
    words = pd.Series(words).to_frame(name = 'word')
    log_probs = pd.DataFrame(nb.feature_log_prob_)
    polarity_score_table = (log_probs.iloc[1] - log_probs.iloc[0]).to_frame(name = 'polarity_score') 
    polarity_score_table = pd.concat([words, polarity_score_table], axis = 1)
    polarity_score_table = polarity_score_table.sort_values('polarity_score')
    top_30_plus_minus = pd.concat([polarity_score_table.head(30), polarity_score_table.tail(30)])
    return top_30_plus_minus
    

In [17]:
def get_minus(polarities):
    return polarities.head(30)

def get_plus(polarities):
    return polarities.tail(30)

In [27]:
def make_polarity_chart(data):
    return alt.Chart(data).mark_bar().encode(x = 'polarity_score:Q', y = alt.X('word').sort('-x'),
                                             color = alt.condition(alt.datum.polarity_score > 0, 
                                                                   alt.value('steelblue'),
                                                                   alt.value('red')
                                                                  )).properties(height = 600)

In [21]:
get_minus(get_polarities(reviews))

Unnamed: 0,word,polarity_score
70,cant play,-2.403719
706,waste money,-1.986165
533,play nioh instead,-1.947908
707,waste time,-1.891165
704,wanted like,-1.883552
75,change review,-1.864851
668,terrible performance,-1.852693
166,don waste,-1.845874
167,don waste money,-1.712924
454,nioh instead,-1.710421


In [28]:
make_polarity_chart(get_minus(get_polarities(reviews)))

In [56]:
good_words = {'good', 'great', 'fantastic', 'awesome', 'best', 'terrific', 'amazing'}
bad_words = {'bad', 'terrible', 'ass', 'shit', 'trash', 'garbage', 'abysmal', 'awful', 'worst', 'worse', 'waste', 'horrible'
            'dissapointment', 'disappointed', 'disappointing'}
good_plus_bad = good_words.union(bad_words)

In [57]:
ug_stop_words = STOP_WORDS.union({'ve',  'game', 'games', 'll',  'feels' })
ug_stop_words = ug_stop_words.union(good_plus_bad)
ugmodel = TfidfVectorizer(min_df = 0.01, ngram_range = (1,1), stop_words = list(ug_stop_words))
ug_counts = ugmodel.fit_transform(review_text['review'])


In [58]:
nb.fit(ug_counts, review_text['review_type'])

MultinomialNB()

In [59]:
ug_words = ugmodel.get_feature_names_out()
ug_words = pd.Series(ug_words).to_frame(name = 'word')
ug_log_probs = pd.DataFrame(nb.feature_log_prob_)
ug_polarity_score_table = (ug_log_probs.iloc[1] - ug_log_probs.iloc[0]).to_frame(name = 'polarity_score') 
ug_polarity_score_table = pd.concat([ug_words, ug_polarity_score_table], axis = 1)
ug_polarity_score_table = ug_polarity_score_table.sort_values('polarity_score')

In [61]:
ug_polarity_score_table.tail(20)

Unnamed: 0,word,polarity_score
393,keyboard,1.050199
441,love,1.057936
254,fans,1.064862
125,complaining,1.073814
67,bit,1.079719
398,kingdoms,1.094171
237,excellent,1.094617
228,enjoying,1.097145
671,solid,1.100561
545,perfect,1.148573
