In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import string
import re

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['Devotion_Reviews.csv']


In [2]:
df = pd.read_csv('../input/Devotion_Reviews.csv')
df_review = df[['text', 'recommended']].copy()
df_review['recommended'] = df['recommended'].astype(dtype=np.int64)
df_review.head()

Unnamed: 0,text,recommended
0,Chinese people didn't like it 'cuz this game p...,1
1,I don't recommend this game. I don't care abou...,0
2,Deep describing of native Taiwan culture of 19...,1
3,Well at the risk of this review getting buried...,1
4,It's not a political satire nor a boring propa...,1


In [3]:
# remove Chinese character
printable = set(string.printable)
df_review['cleantext'] = df['text'].apply(lambda row: ''.join(filter(lambda x:x in printable,row)))

REPLACE = re.compile('[.;:!\'?,\"()\[\]]')
def pre_process(text):
    # lowercase
    text = text.lower()
    # tags
    text = re.sub('&lt;/?.*?&gt;',' &lt;&gt; ',text)
    # special characters and digits
    text=re.sub('(\\d|\\W)+',' ',text)
    # remove punctuation
    #text = re.sub('[.;:!\'?,\"()\[\]]', '', text)
    #text = [REPLACE.sub('', line) for line in text]
    
    return text

df_review['cleantext'] = df_review['cleantext'].apply(lambda x:pre_process(x))




Remove Stopwords

In [4]:
from nltk.corpus import stopwords
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS

#english_stop_words = stopwords.words('english')
english_stop_words = ENGLISH_STOP_WORDS
def remove_stop_words(corpus):
    removed_stop_words = []
    for review in corpus:
        removed_stop_words.append(
            ' '.join([word for word in review.split() 
                      if word not in english_stop_words])
        )
    return removed_stop_words

df_review['cleantext'] = remove_stop_words(df_review['cleantext'])

In [5]:
print(df_review['text'][10])
print(df_review['cleantext'][10])

This story is purely fictional. Any resemblance to actual individuals or events are coincidental.\n本故事纯属虚构，如有雷同，纯属虚构
story purely fictional resemblance actual individuals events coincidental n


## Normalization

    Stemming
    Lemmatization

In [6]:
# Stemming
from nltk.stem.porter import PorterStemmer

def get_stemmed_text(corpus):
    stemmer = PorterStemmer()
    return [' '.join([stemmer.stem(word) for word in review.split()]) for review in corpus]

df_review['stemmedtext'] = get_stemmed_text(df_review['cleantext'])

In [7]:
df_review.head()

Unnamed: 0,text,recommended,cleantext,stemmedtext
0,Chinese people didn't like it 'cuz this game p...,1,chinese people didn t like cuz game practices ...,chines peopl didn t like cuz game practic free...
1,I don't recommend this game. I don't care abou...,0,don t recommend game don t care political hide...,don t recommend game don t care polit hide ins...
2,Deep describing of native Taiwan culture of 19...,1,deep describing native taiwan culture s atomsp...,deep describ nativ taiwan cultur s atomspher g...
3,Well at the risk of this review getting buried...,1,risk review getting buried review bomb controv...,risk review get buri review bomb controversi s...
4,It's not a political satire nor a boring propa...,1,s political satire boring propaganda redcandle...,s polit satir bore propaganda redcandlegam mer...


In [8]:
# Lemmatization
from nltk.stem import WordNetLemmatizer
def get_lemmatized_text(corpus):
    lemmatizer = WordNetLemmatizer()
    return [' '.join([lemmatizer.lemmatize(word) for word in review.split()]) for review in corpus]

df_review['lemmatext'] = get_lemmatized_text(df_review['stemmedtext'])

## N-gram

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

ngram_vectorizer = CountVectorizer(binary=True, ngram_range=(1,2))
ngram_vectorizer.fit(df_review['lemmatext'])
X = ngram_vectorizer.transform(df_review['lemmatext'])
y = df_review['recommended']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print('Accuracy for C=%s: %s' % (c, accuracy_score(y_test, lr.predict(X_test))))
    
final_ngram = LogisticRegression(C=1)
final_ngram.fit(X, y)
print('Final Accuracy: %s' % accuracy_score(y_test, final_ngram.predict(X_test)))

Accuracy for C=0.01: 0.7701863354037267
Accuracy for C=0.05: 0.8074534161490683
Accuracy for C=0.25: 0.8385093167701864
Accuracy for C=0.5: 0.8385093167701864
Accuracy for C=1: 0.8509316770186336
Final Accuracy: 0.9440993788819876




In [10]:
feature_to_coef = {
    word: coef for word, coef in zip(
     ngram_vectorizer.get_feature_names(), final_ngram.coef_[0])
}

print('Positive Words')
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print(best_positive)
    
print('Negative Words')
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print(best_negative)

Positive Words
('review', 0.8264656210212834)
('stori', 0.7886664567798508)
('horror', 0.7805708204233848)
('great', 0.7245683530987677)
('amaz', 0.7241240985033233)
('excel', 0.70819295338569)
('worth', 0.6476257573556538)
('love', 0.6318299719287049)
('horror game', 0.6109478085196917)
('awesom', 0.6047624164808856)
Negative Words
('polit', -1.909847954026398)
('disgust', -1.1885759098177382)
('independ', -0.9020287921591348)
('bad', -0.7596159973042264)
('element', -0.7113186538282983)
('insult', -0.6343253565733883)
('refund', -0.6107995496153661)
('redcandl', -0.6054410207477393)
('lol', -0.5442279839856778)
('polit game', -0.5419982844416432)


## TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,2))
tfidf_vectorizer.fit(df_review['lemmatext'])
X = tfidf_vectorizer.transform(df_review['lemmatext'])
y = df_review['recommended']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print('Accuracy for C=%s: %s' %(c, accuracy_score(y_test, lr.predict(X_test))))
    
final_tfidf = LogisticRegression(C=1)
final_tfidf.fit(X, y)
accuracy_score(y, final_tfidf.predict(X))

Accuracy for C=0.01: 0.7639751552795031
Accuracy for C=0.05: 0.7639751552795031
Accuracy for C=0.25: 0.7639751552795031
Accuracy for C=0.5: 0.7639751552795031
Accuracy for C=1: 0.7639751552795031




0.8214285714285714

In [12]:
feature_to_coef = {
    word: coef for word, coef in zip(
     tfidf_vectorizer.get_feature_names(), final_tfidf.coef_[0])
}

print('Positive Words')
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print(best_positive)
    
print('Negative Words')
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print(best_negative)

Positive Words
('stori', 1.1892518051403431)
('great', 1.1639163578281155)
('horror', 1.1217466143534138)
('review', 0.9653257670823109)
('horror game', 0.8495859060759398)
('atmospher', 0.7613132486208781)
('taiwanes', 0.7351852542327205)
('amaz', 0.7312840145267567)
('play', 0.706856946288395)
('excel', 0.6983593822540749)
Negative Words
('polit', -2.86757270311894)
('disgust', -1.453658529178925)
('metaphor', -1.1799371920547714)
('polit metaphor', -1.0638550712529116)
('game polit', -0.9136958953481064)
('independ', -0.8961171482741773)
('insult', -0.885972911274552)
('bad', -0.7206789385921515)
('polit game', -0.7161830954859586)
('element', -0.7110329739717606)


## Problems:

1. find the stopwords from nltk library also includes the negative words, like "didnt'". The sentence:"I didn't like the game because..." becomes " like game", which is a bad lead to the result. Therefore, I used stopwords in sklearn library instead. It's better to create own stopwords list to ensure better performance.

In [13]:
'dont like' in feature_to_coef

True

In [14]:
feature_to_coef['dont like']

0.04349492386885162

## What if we only consider paired words(2-gram)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tfidf_vectorizer = TfidfVectorizer(ngram_range=(2,2))
tfidf_vectorizer.fit(df_review['lemmatext'])
X = tfidf_vectorizer.transform(df_review['lemmatext'])
y = df_review['recommended']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.75)

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print('Accuracy for C=%s: %s' %(c, accuracy_score(y_test, lr.predict(X_test))))
    
final_tfidf = LogisticRegression(C=1)
final_tfidf.fit(X, y)
accuracy_score(y, final_tfidf.predict(X))

Accuracy for C=0.01: 0.6956521739130435
Accuracy for C=0.05: 0.6956521739130435
Accuracy for C=0.25: 0.6956521739130435
Accuracy for C=0.5: 0.6956521739130435
Accuracy for C=1: 0.6956521739130435




0.7857142857142857

In [16]:
feature_to_coef = {
    word: coef for word, coef in zip(
     tfidf_vectorizer.get_feature_names(), final_tfidf.coef_[0])
}

print('Positive Words')
for best_positive in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1],
    reverse=True)[:10]:
    print(best_positive)
    
print('Negative Words')
for best_negative in sorted(
    feature_to_coef.items(),
    key=lambda x: x[1])[:10]:
    print(best_negative)

Positive Words
('horror game', 1.314331181087135)
('great game', 0.7780971240659414)
('review bomb', 0.7662009030301936)
('game great', 0.5350446763491172)
('highli recommend', 0.5211051189690845)
('winni pooh', 0.492267270586547)
('ve play', 0.4439030471822806)
('neg review', 0.4420815839101605)
('nice game', 0.438237996338948)
('play game', 0.4251074451687537)
Negative Words
('polit metaphor', -1.9481063822950957)
('game polit', -1.4166645708629457)
('polit game', -1.082228225285144)
('independ taiwan', -0.7828908737940444)
('taiwan independ', -0.6516963999808965)
('metaphor game', -0.6512158178412064)
('absolut blind', -0.6355664489344535)
('close detent', -0.6355664489344535)
('develop disgust', -0.6355664489344535)
('hate crap', -0.6355664489344535)


Less accuracy but can reflect more on why people hate the game, which is the political metaphors hidden in the game. I'll talk more about it in the next article.