In [10]:
import ast
import json
import re
from string import punctuation

import nltk
import numpy as np
import pandas as pd
import scipy as sp
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Load data

In [2]:
def parse(path):
    """Lazily yield one parsed record per non-blank line of ``path``.

    Every line is expected to contain a single Python literal (the file is
    not strict JSON, which is why ``json.loads`` is not used here).

    Parameters
    ----------
    path : str
        Path of the UTF-8 text file to read.

    Yields
    ------
    object
        The literal parsed from each line, in file order.

    Raises
    ------
    ValueError, SyntaxError
        If a line is not a plain Python literal.
    """
    # `with` guarantees the handle is closed once the generator is exhausted,
    # closed, or aborted by an exception.
    with open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            text = line.strip()
            if not text:
                continue  # tolerate blank lines instead of failing on them
            # literal_eval only accepts literals, so a crafted line cannot
            # execute arbitrary code the way eval() would.
            yield ast.literal_eval(text)

In [3]:
def remove_non_ascii(word):
    """Lower-case ``word`` and keep only its ASCII letters and digits.

    Every character that is not alphanumeric, or whose code point is 128 or
    above, is dropped; the surviving characters are returned as one string.
    """
    kept = []
    for ch in word.lower():
        if ord(ch) < 128 and ch.isalnum():
            kept.append(ch)
    return ''.join(kept)

In [20]:
# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', punctuation)

# Filled on first use so that defining this cell does not touch the NLTK data.
_CLEANING_CACHE = {}


def _cleaning_resources():
    """Return the (stop-word set, lemmatizer) pair, building it only once."""
    if not _CLEANING_CACHE:
        _CLEANING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEANING_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEANING_CACHE['stop_words'], _CLEANING_CACHE['lemmatizer']


def textcleaned(sentence):
    """Normalise a review into a space-separated string of lemmatized words.

    Steps, in order: lower-case the text, tokenize it with ``word_tokenize``,
    strip punctuation from each token, drop non-ASCII / non-alphanumeric
    characters via ``remove_non_ascii``, discard empty tokens and English
    stop words, then lemmatize what is left.

    Parameters
    ----------
    sentence : str
        Raw review text.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' when nothing survives).

    Notes
    -----
    The stop-word filter runs after punctuation stripping, so contraction
    fragments such as "n't" reach it as "nt" and are kept.
    """
    stop_words, lemmatizer = _cleaning_resources()

    # split into words
    tokens = word_tokenize(sentence.lower())

    # remove punctuation, other languages and special characters
    cleaned = (remove_non_ascii(token.translate(_PUNCT_TABLE)) for token in tokens)

    # Drop tokens that became empty (pure punctuation / non-ASCII) as well as
    # stop words; empty tokens would otherwise show up as stray spaces in the
    # joined result.
    words = [
        lemmatizer.lemmatize(word)
        for word in cleaned
        if word and word not in stop_words
    ]

    return ' '.join(words)

In [21]:
# Flatten the per-user records into one row per review:
# [user_id, item_id, recommend flag, cleaned review text].
reviews = []
counter = 0  # number of user records processed so far
for counter, user in enumerate(parse("australian_user_reviews.json"), start=1):
    if counter % 1000 == 0:
        print(counter)  # coarse progress indicator
    for entry in user['reviews']:
        # Clean the text (stop-word removal, non-ASCII stripping, lemmatizing)
        # while building the row; the parsed record itself is left untouched.
        reviews.append([
            user['user_id'],
            entry['item_id'],
            entry['recommend'],
            textcleaned(entry['review']),
        ])

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000


In [163]:
# Persist the review table to disk as a pickle.
# NOTE(review): this cell carries execution count 163 yet sits above the cell
# that first assigns `df` in the visible notebook, so a fresh top-to-bottom run
# reaches it before `df` exists. Presumably it was executed after the later
# columns were added - confirm, and move it below those cells.
df.to_pickle('review_lemma.pkl')

In [23]:
# Build the review table in one step. Passing `columns=` to the constructor
# also yields an empty, correctly labelled frame when `reviews` is empty,
# whereas assigning `.columns` afterwards raises a length mismatch there.
df = pd.DataFrame(reviews, columns=['user_id', 'item_id', 'isRecommend', 'reviews'])
print(df.shape)
df.head(n=20)

(59305, 4)


Unnamed: 0,user_id,item_id,isRecommend,reviews
0,76561197970982479,1250,True,simple yet great replayability opinion zombi...
1,76561197970982479,22200,True,unique worth playthrough
2,76561197970982479,43110,True,great atmosphere gunplay bit chunky time end ...
3,js41637,251610,True,know think see title barbie dreamhouse party ...
4,js41637,227300,True,simple actually simple truck driving simula...
5,js41637,239030,True,fun little game play bored time passer gud r...
6,evcentric,248820,True,suitably punishing roguelike platformer winni...
7,evcentric,370360,True,run fun hell kind fun
8,evcentric,237930,True,elegant integration gameplay story world dev...
9,evcentric,263360,True,random drop random quest stat point animatio...


# NLP Generator

In [152]:

# https://www.cs.uic.edu/~liub/FBS/sentiment-analysis.html#lexicon
# Use the positive and negative opinion-word lists published at the URL above.
def _load_lexicon(path):
    """Return the words listed in ``path``, one per line.

    Blank lines and lines starting with ';' (the files' comment marker) are
    skipped. Surrounding whitespace is stripped, so a final line without a
    trailing newline keeps its last character.
    """
    # latin-1 maps every byte to a character, so decoding never fails whatever
    # the platform's default encoding is. Entries containing non-ASCII
    # characters could not match the cleaned reviews anyway, because
    # remove_non_ascii strips such characters from every token.
    # NOTE(review): the published negative-words.txt reportedly contains a
    # byte that is not valid UTF-8 - confirm against the downloaded copy.
    with open(path, encoding='latin-1') as f:
        stripped = (line.strip() for line in f)
        return [word for word in stripped if word and not word.startswith(';')]


pos_dict = _load_lexicon('positive-words.txt')
pos_set = set(pos_dict)
pos_set.add('true')  # the "true" recommendation flag counts as one positive hit

neg_dict = _load_lexicon('negative-words.txt')
neg_set = set(neg_dict)
# Make the "false" recommendation flag count as one negative hit explicitly,
# instead of relying on the lexicon file happening to list the word.
neg_set.add('false')

def get_sentiment(review_obj):
    """Score a "<flag> <review words>" string by counting lexicon hits.

    The string is split on single spaces. When its first word is 'true'
    (the item was recommended) the score is the number of words found in
    ``pos_set``; otherwise it is minus the number of words found in
    ``neg_set``. The flag word takes part in the count like any other word.

    Parameters
    ----------
    review_obj : str
        Recommendation flag followed by the cleaned review text.

    Returns
    -------
    int
        Non-negative for recommended reviews, non-positive otherwise.
    """
    tokens = review_obj.split(' ')
    recommended = tokens[0] == 'true'
    lexicon = pos_set if recommended else neg_set
    hits = sum(1 for token in tokens if token in lexicon)
    return hits if recommended else -hits

In [158]:
# Prefix each review with its lower-cased recommendation flag ("true" or
# "false") so that get_sentiment can read the flag as the first word of the
# text; the flag is what gives a review its default sentiment of 1 or -1.
df['Recommend_reviews'] = (
    df['isRecommend'].astype(str).str.lower()
    + ' '
    + df['reviews'].str.lower()
)


In [159]:
# Score every flagged review with get_sentiment and store it as `sentiment`.
df['sentiment'] = df['Recommend_reviews'].map(get_sentiment)

# Data Checking

In [160]:
# Spot check: rows whose recommendation flag is False.
df.loc[df['isRecommend'].eq(False)]

Unnamed: 0,user_id,item_id,isRecommend,reviews,Recommend_reviews,sentiment
48,76561198043472122,33440,False,game nt work,false game nt work,-1
62,76561198066046412,359320,False,charged 80 15 dollar got boring 5 hour,false charged 80 15 dollar got boring 5 hour,-2
68,76561198070565427,570,False,w,false w,-1
84,boydeer,383080,False,,false,-1
122,sandwiches1,417860,False,emily thot,false emily thot,-1
132,iamthekingofbrowntown,344760,False,spent 3 day making base man teleported base ...,false spent 3 day making base man teleported ...,-3
156,76561198010674657,377160,False,begin trusted bethesda time spent fo3 never...,false begin trusted bethesda time spent fo3 ...,-16
183,Nozomikat,437220,False,harsh new player combat system strange still ...,false harsh new player combat system strange ...,-4
186,xfluttersx,202530,False,got worse sonic 06 played many sonic game ki...,false got worse sonic 06 played many sonic g...,-6
193,ii_voltage_ii,253710,False,game actually free play nt think hunt one an...,false game actually free play nt think hunt ...,-2


In [162]:
# Spot check: every review written by one specific user.
df.loc[df['user_id'].eq('76561198070565427')]

Unnamed: 0,user_id,item_id,isRecommend,reviews,Recommend_reviews,sentiment
65,76561198070565427,218620,True,5 steampayday 25payday2 1 5002 3 4 5 ...,true 5 steampayday 25payday2 1 5002 3 4 ...,1
66,76561198070565427,208090,True,well dont,true well dont,2
67,76561198070565427,242700,True,100100,true 100100,1
68,76561198070565427,570,False,w,false w,-1
69,76561198070565427,224260,True,,true,1
