In [1]:
#Major Imports
import pandas as pd
import csv
import re
import os
import nltk
import InputPipeline
import numpy as np

In [2]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import TweetTokenizer, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import confusion_matrix
from skmultilearn.model_selection.measures import get_combination_wise_output_matrix
from skmultilearn.model_selection import iterative_train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sp_tools import *

In [3]:
#Import Files
source_str = "./cleaned"

In [4]:
#						-- Import Files --
complete_frame = []
review_class = 1
for file in os.listdir(source_str):               #file traversal
    file_name = file.split(".")[0]
    print('Working on {}'.format(file))
    if file_name == "Audio":
        audio_data = pd.read_csv('{}/{}'.format(source_str,file),index_col=None)				#Load original dataframe
    elif file_name == "Graphics":
        graphics_data = pd.read_csv('{}/{}'.format(source_str,file),index_col=None)				#Load original dataframe
    elif file_name == "Gameplay":
        gameplay_data = pd.read_csv('{}/{}'.format(source_str,file),index_col=None)				#Load original dataframe
    else:            
        multi_data = pd.read_csv('{}/{}'.format(source_str,file),index_col=None)				#Load original dataframeport_file,ignore_index=True)	

Working on Audio.csv
Working on Gameplay.csv
Working on Graphics.csv
Working on multi - multi(cut).csv


In [5]:
#Drop Invalid Rows
audio_data      = audio_data.drop(audio_data[audio_data['isValid']==0].index)
graphics_data   = graphics_data.drop(graphics_data[graphics_data['isValid']==0].index)
gameplay_data   = gameplay_data.drop(gameplay_data[gameplay_data['isValid']==0].index)
multi_data      = multi_data.drop(multi_data[(multi_data.is_audio== -2) & (multi_data.is_graphics == -2) & (multi_data.is_gameplay == -2)].index)
print(len(audio_data))
print(len(graphics_data))
print(len(gameplay_data))
print(len(multi_data))
print(len(audio_data) + len(graphics_data) + len(gameplay_data) +len(multi_data))

954
454
964
597
2969


In [6]:
#Compiles all data into a single table
input_columns = ['gameId','AccountName','review','classifications','polarity']
input_compilation = pd.DataFrame([],columns=input_columns)
input_compilation = input_compilation.append(pd.DataFrame({"gameId":audio_data['gameId'],
                                        'AccountName':audio_data['AccountName'],
                                        'review':audio_data['review'],
                                        'classifications': [[1,0,0] for i in range(0,audio_data.shape[0])],
                                        'polarity':list([i,-2,-2] for i in audio_data["Polarity"])
                                        }))
print(len(input_compilation))
input_compilation = input_compilation.append(pd.DataFrame({"gameId":graphics_data['gameId'],
                                        'AccountName':graphics_data['AccountName'],
                                        'review':graphics_data['review'],
                                        'classifications': [[0,0,1] for i in range(0,graphics_data.shape[0])],
                                        'polarity':list([-2,-2,i] for i in graphics_data["Polarity"])
                                        }))
print(len(input_compilation))

input_compilation = input_compilation.append(pd.DataFrame({"gameId":gameplay_data['gameId'],
                                        'AccountName':gameplay_data['AccountName'],
                                        'review':gameplay_data['review'],
                                        'classifications': [[0,1,0] for i in range(0,gameplay_data.shape[0])],
                                        'polarity':list([-2,i,-2] for i in gameplay_data["Polarity"])
                                        }))
print(len(input_compilation))


954
1408
2372


In [7]:
md_classification_list = []
md_polarity_list = []

for i in range(0,multi_data.shape[0]):
    md_classification_list.append(classification_list(multi_data.iloc[i]))
    md_polarity_list.append(polarity_list(multi_data.iloc[i]))
# print(md_classification_list)
# print(md_polarity_list)
temp_df = pd.DataFrame({"gameId":multi_data['gameId'],
                                        'AccountName':multi_data['AccountName'],
                                        'review':multi_data['review'],
                                        'classifications': md_classification_list,
                                        'polarity':md_polarity_list
                                        })

print_unique_instances(temp_df['classifications'])
input_compilation = input_compilation.append(temp_df)
input_compilation = input_compilation.reset_index(drop=True)
print()
print_unique_instances(input_compilation['classifications'])
print_len(input_compilation)

('[1, 1, 1]', 159)
('[1, 1, 0]', 142)
('[1, 0, 1]', 122)
('[1, 0, 0]', 85)
('[0, 1, 1]', 53)
('[0, 0, 1]', 21)
('[0, 1, 0]', 15)

('[1, 0, 0]', 1039)
('[0, 1, 0]', 979)
('[0, 0, 1]', 475)
('[1, 1, 1]', 159)
('[1, 1, 0]', 142)
('[1, 0, 1]', 122)
('[0, 1, 1]', 53)
2969


In [8]:
print(len(input_compilation))

2969


In [9]:
input_compilation = input_compilation.reset_index(drop=True)
print(input_compilation.head())

   gameId   AccountName                                             review  \
0  294100           Jak                 good music , when there is music .   
1  219740     Irrapture  however , lumine is staggeringly ugly and has ...   
2  238460  REXPOWERCOLT  the soundtrack is amazing , arguably the best ...   
3  253230      DeathCap  you will get engrossed in the 80s miami atmosp...   
4  219740   Ƥyrefeather  one of the first things you can do is go up to...   

  classifications      polarity  
0       [1, 0, 0]   [1, -2, -2]  
1       [1, 0, 0]  [-1, -2, -2]  
2       [1, 0, 0]   [1, -2, -2]  
3       [1, 0, 0]   [1, -2, -2]  
4       [1, 0, 0]   [0, -2, -2]  


In [10]:
#Convert Output ["Audio","Graphics","Gameplay"] into an array

review_output = list([list(i),j] for i,j in zip(input_compilation['classifications'],input_compilation['polarity']))
# print(review_output)



In [11]:
#Separate the reviews with unique polarity combination
y = pd.DataFrame({
                "review":
                            input_compilation['review'],
                "classifications":
                          [str(i) for i in input_compilation['classifications']],
                "polarity":
                          [str(i) for i in input_compilation['polarity']]
                           })

solos_list = ['[0, -1, -2]','[-1, 0, -2]','[1, 0, 0]','[0, 0, 1]','[-1, 1, 1]','[-1, 0, 1]','[-1, -2, 1]']
solo_reviews = y.loc[y['polarity'].isin(solos_list)]
leftover_reviews = y.loc[~y['polarity'].isin(solos_list)]
trimmed_review_list = leftover_reviews['review'].reset_index(drop=True)
trimmed_output_list = leftover_reviews.drop(columns=['review']).reset_index(drop=True)


# print(trimmed_review_list)
print_len(y,trimmed_review_list,trimmed_output_list,solo_reviews)



2969
2962
2962
7


In [12]:
#Train/Test Split for the reviews with polarity combinations that occurs more than once
temp_x = trimmed_review_list
temp_y = trimmed_output_list 
data_train, data_test, label_train, label_test = train_test_split(temp_x,temp_y,test_size=0.3,stratify=temp_y['polarity'])


print(label_train)

     classifications     polarity
1609       [0, 1, 0]  [-2, 0, -2]
2800       [1, 1, 1]    [1, 1, 1]
846        [1, 0, 0]  [1, -2, -2]
2300       [0, 1, 0]  [-2, 0, -2]
1680       [0, 1, 0]  [-2, 1, -2]
...              ...          ...
2288       [0, 1, 0]  [-2, 0, -2]
1124       [0, 0, 1]  [-2, -2, 1]
1430       [0, 1, 0]  [-2, 0, -2]
1084       [0, 0, 1]  [-2, -2, 1]
137        [1, 0, 0]  [1, -2, -2]

[2073 rows x 2 columns]


In [13]:
print_unique_instances(label_train['polarity'])

('[1, -2, -2]', 623)
('[-2, 1, -2]', 383)
('[-2, -2, 1]', 266)
('[-2, 0, -2]', 246)
('[1, 1, 1]', 84)
('[0, -2, -2]', 80)
('[1, 1, -2]', 66)
('[1, -2, 1]', 66)
('[-2, -1, -2]', 57)
('[-2, -2, 0]', 38)
('[-2, -2, -1]', 28)
('[-1, -2, -2]', 24)
('[1, 0, -2]', 22)
('[-2, 1, 1]', 22)
('[1, 0, 1]', 11)
('[0, -2, 0]', 10)
('[0, 1, -2]', 4)
('[1, 1, 0]', 4)
('[-2, 0, 0]', 4)
('[1, -1, 1]', 4)
('[1, -2, 0]', 4)
('[-1, -2, -1]', 4)
('[-2, -1, -1]', 3)
('[0, 0, 0]', 3)
('[0, 0, -2]', 3)
('[-1, -1, -2]', 2)
('[0, -2, 1]', 2)
('[-2, 0, 1]', 2)
('[-2, -1, 1]', 2)
('[-2, 1, -1]', 2)
('[1, 1, -1]', 1)
('[-2, 0, -1]', 1)
('[-2, 1, 0]', 1)
('[1, -1, -2]', 1)


In [14]:
print_unique_instances(label_test['polarity'])

('[1, -2, -2]', 267)
('[-2, 1, -2]', 164)
('[-2, -2, 1]', 114)
('[-2, 0, -2]', 105)
('[1, 1, 1]', 36)
('[0, -2, -2]', 34)
('[1, -2, 1]', 29)
('[1, 1, -2]', 28)
('[-2, -1, -2]', 24)
('[-2, -2, 0]', 17)
('[-2, -2, -1]', 12)
('[-1, -2, -2]', 11)
('[-2, 1, 1]', 9)
('[1, 0, -2]', 9)
('[1, 0, 1]', 5)
('[0, -2, 0]', 4)
('[1, 1, 0]', 2)
('[0, 0, 0]', 2)
('[0, 0, -2]', 2)
('[-1, -1, -2]', 1)
('[1, -1, 1]', 1)
('[0, 1, -2]', 1)
('[-2, 0, 0]', 1)
('[0, -2, 1]', 1)
('[1, -2, 0]', 1)
('[1, -1, -2]', 1)
('[1, 1, -1]', 1)
('[-2, 1, -1]', 1)
('[-2, 0, -1]', 1)
('[-2, 1, 0]', 1)
('[-1, -2, -1]', 1)
('[-2, 0, 1]', 1)
('[-2, -1, 1]', 1)
('[-2, -1, -1]', 1)


In [15]:
#Split the reviews with unique polarity combination
temp_x = solo_reviews['review']
temp_y = solo_reviews.drop(columns=['review'])
data_train_2, data_test_2, label_train_2, label_test_2 = train_test_split(temp_x,temp_y,test_size=.3)

print_len(data_train_2,label_train_2,data_test_2,label_test_2)


4
4
3
3


In [16]:
#Join the 2 training data
print_len(data_train,data_train_2)
temp_x = data_train.append(data_train_2)
data_train = temp_x
print_len(temp_x)

2073
4
2077


In [17]:
#Join the 2 training data labels
print_len(label_train,label_train_2)
temp_y = label_train.append(label_train_2)
label_train = temp_y
print_len(label_train)

2073
4
2077


In [18]:
#Join the 2 testing data
print_len(data_test,data_test_2)
temp_x = data_test.append(data_test_2)
data_test = temp_x
print_len(temp_x)

889
3
892


In [19]:
#Join the 2 testing data labels
print_len(label_test,label_test_2)
temp_y = label_test.append(label_test_2)
label_test = temp_y
print_len(label_test)

889
3
892


In [20]:
#Convert labels from stringified list back to list

print_len(label_train,label_test)
label_train['classifications'] = [eval(temp_list) for temp_list in label_train['classifications']]
label_train['polarity'] = [eval(temp_list) for temp_list in label_train['polarity']]

print_len(label_train,label_test)
label_test['classifications'] = [eval(temp_list) for temp_list in label_test['classifications']]
label_test['polarity'] = [eval(temp_list) for temp_list in label_test['polarity']]

2077
892
2077
892


In [21]:
#Reset indeces
data_train = data_train.reset_index(drop=True)
data_test  = data_test.reset_index(drop=True)
label_train = label_train.reset_index(drop=True)
label_test  = label_test.reset_index(drop=True)


In [22]:
#Vectorizer
category_tf = TfidfVectorizer(ngram_range=(1,1))
category_tf.fit(data_train)

data_train_tf	= category_tf.transform(data_train)
data_test_tf	= category_tf.transform(data_test)

In [23]:
temp_x = np.array([np.array(i) for i in label_train['classifications']])
temp_y = np.array([np.array(i) for i in label_test['classifications']])
# label_train['classifications'] = temp_x
# label_test['classifications'] = temp_y
# print(temp_x)
# print(temp_y)
# print(label_train)
# print(label_test)
print(type(temp_x))
print(type(label_train['classifications']))
for i in temp_x: print(np.array(i))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
[0 1 0]
[1 1 1]
[1 0 0]
[0 1 0]
[0 1 0]
[1 1 0]
[0 0 1]
[1 0 0]
[1 0 0]
[1 0 0]
[0 0 1]
[1 0 0]
[1 0 0]
[1 0 0]
[1 1 1]
[0 0 1]
[0 1 0]
[1 0 0]
[1 1 1]
[0 1 0]
[1 1 0]
[0 1 0]
[1 0 0]
[1 0 0]
[1 0 0]
[1 1 1]
[0 1 0]
[1 0 0]
[1 1 0]
[0 1 0]
[0 0 1]
[1 1 0]
[1 0 0]
[1 0 0]
[1 0 0]
[1 1 0]
[1 0 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[0 0 1]
[1 0 1]
[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]
[1 0 0]
[1 0 1]
[0 1 0]
[0 0 1]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[1 1 0]
[0 0 1]
[0 0 1]
[0 1 0]
[1 0 1]
[0 1 0]
[1 1 1]
[1 1 0]
[0 1 0]
[0 0 1]
[0 0 1]
[0 1 0]
[1 0 0]
[0 0 1]
[0 1 0]
[0 1 0]
[0 1 0]
[0 1 0]
[1 0 1]
[1 0 0]
[1 0 1]
[1 1 0]
[1 0 0]
[1 0 0]
[0 1 0]
[0 0 1]
[0 0 1]
[0 1 0]
[0 1 0]
[1 1 0]
[1 1 1]
[0 1 0]
[1 0 1]
[0 0 1]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]
[0 1 1]
[0 1 0]
[0 0 1]
[1 0 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 1 1]
[0 1 0]
[1 1 0]
[1 0 0]
[1 0 0]
[1 0 0]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]
[1 0 0]
[1 1 0]
[1 0 0]
[0 1 0]
[1 0

[0 1 0]
[0 1 0]
[1 1 1]
[0 0 1]
[0 1 0]
[0 1 0]
[0 1 0]
[1 1 1]
[0 0 1]
[0 1 0]
[0 0 1]
[0 1 0]
[1 0 1]
[1 0 0]
[1 0 1]
[0 1 0]
[1 0 0]
[1 0 0]
[0 0 1]
[1 0 0]
[1 1 1]
[1 0 0]
[1 0 0]
[1 0 0]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[1 0 0]
[1 0 0]
[1 0 0]
[1 0 0]
[0 1 0]
[1 0 0]
[1 0 0]
[0 1 0]
[0 0 1]
[0 0 1]
[0 0 1]
[0 1 0]
[1 1 1]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[0 0 1]
[1 0 1]
[1 0 0]
[0 1 0]
[1 1 0]
[1 0 1]
[1 1 0]
[0 0 1]
[1 0 0]
[1 0 0]
[1 0 0]
[1 0 0]
[1 0 1]
[1 1 1]
[0 0 1]
[1 0 0]
[1 1 1]
[1 0 0]
[1 0 0]
[0 0 1]
[1 0 1]
[1 0 0]
[0 0 1]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]
[0 1 0]
[0 0 1]
[0 1 0]
[0 1 0]
[0 1 0]
[1 0 0]
[1 0 1]
[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[1 0 0]
[0 0 1]
[0 1 0]
[0 0 1]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[1 0 0]
[1 0 1]
[1 1 1]
[1 0 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 0]
[1 0 0]
[1 0 0]
[1 0 0]
[0 1 0]
[0 1 1]
[0 0 1]
[0 0 1]
[1 0 0]
[0 1 0]
[1 0 0]
[0 1 0]
[0 1 1]
[1 0 0]
[0 1 0]
[0 0 1]
[1 0 0]


In [24]:
# for i in range(0,2): print(data_train_tf[i])

# Category Classifier

In [25]:
#Initialize Clasifier Model
category_lsvc = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()))

In [26]:
print(label_train)

     classifications     polarity
0          [0, 1, 0]  [-2, 0, -2]
1          [1, 1, 1]    [1, 1, 1]
2          [1, 0, 0]  [1, -2, -2]
3          [0, 1, 0]  [-2, 0, -2]
4          [0, 1, 0]  [-2, 1, -2]
...              ...          ...
2072       [1, 0, 0]  [1, -2, -2]
2073       [1, 1, 0]  [0, -1, -2]
2074       [1, 1, 1]    [1, 0, 0]
2075       [1, 1, 0]  [-1, 0, -2]
2076       [1, 1, 1]   [-1, 1, 1]

[2077 rows x 2 columns]


In [27]:
#Train Model
category_lsvc.fit(data_train_tf,temp_x)

OneVsRestClassifier(estimator=CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0,
                                                                              class_weight=None,
                                                                              dual=True,
                                                                              fit_intercept=True,
                                                                              intercept_scaling=1,
                                                                              loss='squared_hinge',
                                                                              max_iter=1000,
                                                                              multi_class='ovr',
                                                                              penalty='l2',
                                                                              random_state=None,
                                                     

In [28]:
#Test Model
category_prediction = category_lsvc.predict(data_test_tf)
category_probability = category_lsvc.predict_proba(data_test_tf)

In [29]:
print_testing(temp_y,category_probability)
# print(label_test)
print_len(data_train,data_test)

Treshold: 	0.3
Accuracy: 	0.5661
Precision:	 0.7315384615384616
Hamming Loss:	 0.1805

Sub Accuracy
Audio    	 Accuracy: 0.8722 	Precision: 0.8247 	 Recall: 0.9409
Gameplay 	 Accuracy: 0.7511 	Precision: 0.6745 	 Recall: 0.86
Graphics 	 Accuracy: 0.8352 	Precision: 0.6701 	 Recall: 0.7878

Treshold: 	0.4
Accuracy: 	0.6166
Precision:	 0.7735682819383259
Hamming Loss:	 0.1734

Sub Accuracy
Audio    	 Accuracy: 0.87 	Precision: 0.8506 	 Recall: 0.8932
Gameplay 	 Accuracy: 0.778 	Precision: 0.7428 	 Recall: 0.7725
Graphics 	 Accuracy: 0.8318 	Precision: 0.6848 	 Recall: 0.7184

Treshold: 	0.5
Accuracy: 	0.6222
Precision:	 0.7966601178781926
Hamming Loss:	 0.1797

Sub Accuracy
Audio    	 Accuracy: 0.8576 	Precision: 0.8565 	 Recall: 0.8545
Gameplay 	 Accuracy: 0.7948 	Precision: 0.7893 	 Recall: 0.74
Graphics 	 Accuracy: 0.8083 	Precision: 0.6814 	 Recall: 0.5673

Treshold: 	0.6
Accuracy: 	0.6043
Precision:	 0.8226544622425629
Hamming Loss:	 0.1947

Sub Accuracy
Audio    	 Accuracy: 0.8341 