In [1]:
#Major Imports
import pandas as pd
import csv
import re
import os
import nltk
import InputPipeline
import numpy as np

In [2]:
from nltk.stem import PorterStemmer 
from nltk.tokenize import TweetTokenizer, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import hamming_loss
from sklearn.metrics import confusion_matrix
from sp_tools import print_unique_instances

In [3]:
#Folder whose files are listed and loaded with pd.read_csv in the next cell
source_str = "./cleaned"

In [4]:
#						-- Import Files --
complete_frame = []
review_class = 1
# The three single-category files are recognised by their base name; any other
# CSV in the folder is treated as the multi-label file. Entries that are not
# CSV files (checkpoints, hidden files, sub-folders) are skipped instead of
# being parsed as the multi-label file.
for file in sorted(os.listdir(source_str)):               #file traversal (sorted for a stable order)
    if not file.lower().endswith(".csv"):
        continue
    file_name = file.split(".")[0]
    print('Working on {}'.format(file))
    file_path = os.path.join(source_str, file)
    if file_name == "Audio":
        audio_data = pd.read_csv(file_path, index_col=None)       #Load original dataframe
    elif file_name == "Graphics":
        graphics_data = pd.read_csv(file_path, index_col=None)    #Load original dataframe
    elif file_name == "Gameplay":
        gameplay_data = pd.read_csv(file_path, index_col=None)    #Load original dataframe
    else:
        multi_data = pd.read_csv(file_path, index_col=None)       #Load original dataframe

Working on Audio.csv
Working on Gameplay.csv
Working on Graphics.csv
Working on multi - multi(cut).csv


In [5]:
#Drop Invalid Rows
# Single-category frames: keep every row whose isValid flag is not 0.
audio_data      = audio_data[audio_data['isValid'] != 0]
graphics_data   = graphics_data[graphics_data['isValid'] != 0]
gameplay_data   = gameplay_data[gameplay_data['isValid'] != 0]
# Multi-label frame: remove rows whose three category flags are all -2.
_all_flags_unset = (
    (multi_data.is_audio == -2)
    & (multi_data.is_graphics == -2)
    & (multi_data.is_gameplay == -2)
)
multi_data      = multi_data[~_all_flags_unset]
# Remaining rows per frame, followed by the grand total.
_remaining_rows = [len(audio_data), len(graphics_data), len(gameplay_data), len(multi_data)]
for _row_count in _remaining_rows:
    print(_row_count)
print(sum(_remaining_rows))

954
454
964
597
2969


In [6]:
#used for the cell below
def classification_list(mrow):
    """Return the category names whose flag equals 1 in ``mrow``.

    ``mrow`` is indexed by 'is_audio', 'is_graphics' and 'is_gameplay'; the
    names come back in the order audio, graphics, gameplay. When no flag is
    set the row is printed and an empty list is returned.
    """
    flag_to_name = (
        ('is_audio', 'audio'),
        ('is_graphics', 'graphics'),
        ('is_gameplay', 'gameplay'),
    )
    selected = [name for flag, name in flag_to_name if mrow[flag] == 1]
    if not selected:
        print(mrow)
    return selected
def polarity_list(mrow):
    """Return the polarities of ``mrow`` as [audio, gameplay, graphics].

    A slot holds the row's '<category>_polarity' value when the matching
    'is_<category>' flag equals 1, and -2 otherwise.
    """
    slots = (
        ('is_audio', 'audio_polarity'),
        ('is_gameplay', 'gameplay_polarity'),
        ('is_graphics', 'graphics_polarity'),
    )
    # The polarity field is only read when its flag is set.
    return [mrow[polarity_key] if mrow[flag] == 1 else -2 for flag, polarity_key in slots]

In [7]:
#Compiles the three single-category frames into a single table
input_columns = ['gameId','AccountName','review','classifications','polarity']


def _single_category_frame(source_frame, label, polarity_slot):
    """Build the compiled-table rows for one single-category frame.

    Args:
        source_frame: frame with 'gameId', 'AccountName', 'review' and 'Polarity' columns.
        label: category name stored (as a one-element list) in 'classifications'.
        polarity_slot: position of this category in the 3-element polarity list
            (0 = audio, 1 = gameplay, 2 = graphics); the other slots are -2.

    Returns:
        DataFrame with the columns in ``input_columns`` and the index of ``source_frame``.
    """
    polarity_vectors = []
    for value in source_frame["Polarity"]:
        vector = [-2, -2, -2]
        vector[polarity_slot] = value
        polarity_vectors.append(vector)
    return pd.DataFrame({"gameId": source_frame['gameId'],
                         'AccountName': source_frame['AccountName'],
                         'review': source_frame['review'],
                         'classifications': [[label] for _ in range(len(source_frame))],
                         'polarity': polarity_vectors})


# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames in
# the same order (audio, graphics, gameplay) and keeps their original indexes.
input_compilation = pd.concat([
    _single_category_frame(audio_data, 'audio', 0),
    _single_category_frame(graphics_data, 'graphics', 2),
    _single_category_frame(gameplay_data, 'gameplay', 1),
])[input_columns]
print(len(input_compilation))

2372


In [8]:
# Derive the category list and polarity list for every multi-label row.
md_classification_list = []
md_polarity_list = []

for _, multi_row in multi_data.iterrows():
    md_classification_list.append(classification_list(multi_row))
    md_polarity_list.append(polarity_list(multi_row))

temp_df = pd.DataFrame({"gameId":multi_data['gameId'],
                        'AccountName':multi_data['AccountName'],
                        'review':multi_data['review'],
                        'classifications': md_classification_list,
                        'polarity':md_polarity_list
                        })

print_unique_instances(temp_df['classifications'])
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
# NOTE: re-running this cell stacks temp_df onto input_compilation again.
input_compilation = pd.concat([input_compilation, temp_df]).reset_index(drop=True)
print()
print_unique_instances(input_compilation['classifications'])
print(len(input_compilation))

("['audio', 'graphics', 'gameplay']", 159)
("['audio', 'gameplay']", 142)
("['audio', 'graphics']", 122)
("['audio']", 85)
("['graphics', 'gameplay']", 53)
("['graphics']", 21)
("['gameplay']", 15)

("['audio']", 1039)
("['gameplay']", 979)
("['graphics']", 475)
("['audio', 'graphics', 'gameplay']", 159)
("['audio', 'gameplay']", 142)
("['audio', 'graphics']", 122)
("['graphics', 'gameplay']", 53)
2969


In [9]:
# Show the compiled table (gameId, AccountName, review, classifications, polarity)
print(input_compilation)

      gameId    AccountName  \
0     294100            Jak   
1     219740      Irrapture   
2     238460   REXPOWERCOLT   
3     253230       DeathCap   
4     219740    Æ¤yrefeather   
...      ...            ...   
2964  238460         Fraise   
2965  238460  frisbeemenace   
2966  253230       genkipro   
2967  219740     ghostspawn   
2968  294100     ghostspawn   

                                                 review       classifications  \
0                    good music , when there is music .               [audio]   
1     however , lumine is staggeringly ugly and has ...               [audio]   
2     the soundtrack is amazing , arguably the best ...               [audio]   
3     you will get engrossed in the 80s miami atmosp...               [audio]   
4     one of the first things you can do is go up to...               [audio]   
...                                                 ...                   ...   
2964             The music and visuals make this game .    

In [10]:
# Renumber the rows 0..n-1 and discard the previous index
input_compilation = input_compilation.reset_index(drop=True)

In [11]:
#Convert Output ["Audio","Graphics","Gameplay"] into an array
multilabel_binarizer = MultiLabelBinarizer().fit(input_compilation.classifications)
category_matrix = multilabel_binarizer.transform(input_compilation.classifications)
print(type(category_matrix[0]))
# Pair every binarized category row with the polarity list of the same review
review_output = [
    [category_row, polarity_row]
    for category_row, polarity_row in zip(category_matrix, input_compilation['polarity'])
]


<class 'numpy.ndarray'>


In [12]:
#Each entry is a pair: left is the binarized category vector, right is the polarity list
for i in range(0,5): print (review_output[i])


[array([1, 0, 0]), [1, -2, -2]]
[array([1, 0, 0]), [-1, -2, -2]]
[array([1, 0, 0]), [1, -2, -2]]
[array([1, 0, 0]), [1, -2, -2]]
[array([1, 0, 0]), [0, -2, -2]]


In [13]:
# Column order of the binarized category vectors
print(multilabel_binarizer.classes_)

['audio' 'gameplay' 'graphics']


In [14]:
# Review texts as a plain Python list, in the row order of input_compilation
review_list = input_compilation['review'].tolist()

In [15]:
# Peek at the first five review texts
for i in range(0,5): print(review_list[i])

good music , when there is music .
however , lumine is staggeringly ugly and has an annoying voice .
the soundtrack is amazing , arguably the best soundtrack of any game i ve played .
you will get engrossed in the 80s miami atmosphere with the game s stellar soundtrack and spectacular art direction .
one of the first things you can do is go up to a tv with a game playing on it , and when you get near it the background music changes to have a dog barking the theme song in a neat little remix .


In [16]:
#Initialize Tokenizer
# NOTE(review): tokenizer_words, stemmer and timer are not referenced by any
# cell visible here — confirm they are unused before removing them.
tokenizer_words = TweetTokenizer()
stemmer = PorterStemmer()
timer = 0
# Holds the processed (lower-cased) review texts
stemmed_review_list = []

In [17]:
#Stemming was removed since it gave lower accuracy; the reviews are only lower-cased.
#The list is rebuilt from scratch so re-running this cell cannot append the
#reviews a second time and desynchronise them from review_output.
stemmed_review_list = [sentence.lower() for sentence in review_list]


In [18]:
# Peek at the first five processed reviews
for i in range(0,5): print(stemmed_review_list[i])

good music , when there is music .
however , lumine is staggeringly ugly and has an annoying voice .
the soundtrack is amazing , arguably the best soundtrack of any game i ve played .
you will get engrossed in the 80s miami atmosphere with the game s stellar soundtrack and spectacular art direction .
one of the first things you can do is go up to a tv with a game playing on it , and when you get near it the background music changes to have a dog barking the theme song in a neat little remix .


In [19]:
#Train/Test Split
# 30% of the reviews are held out for testing; random_state fixes the shuffle.
# Each label is a [category vector, polarity list] pair taken from review_output.
data_train, data_test, label_train, label_test = train_test_split(stemmed_review_list, review_output, test_size = 0.3, random_state = 7)

In [20]:
#Vectorizer
# Unigram TF-IDF; the vectorizer is fitted on the training reviews only and
# then applied to both splits.
category_tf = TfidfVectorizer(ngram_range=(1, 1)).fit(data_train)

data_train_tf, data_test_tf = [category_tf.transform(split_texts) for split_texts in (data_train, data_test)]

In [21]:
# Print the sparse TF-IDF rows of the first two training reviews
for i in range(0,2): print(data_train_tf[i])

  (0, 5036)	0.12832798095728912
  (0, 4967)	0.1300956430272045
  (0, 4588)	0.2473611169483468
  (0, 4490)	0.07477701599048939
  (0, 4320)	0.268434476231889
  (0, 4114)	0.19604960865658355
  (0, 4016)	0.2455535885301031
  (0, 3461)	0.4308655703455767
  (0, 3053)	0.18628301819133586
  (0, 2851)	0.29613186618295445
  (0, 1989)	0.1560253709083546
  (0, 1910)	0.2107049750174494
  (0, 1512)	0.21929736863548027
  (0, 1207)	0.34360050886710847
  (0, 704)	0.3629397208181735
  (0, 654)	0.16065027662843775
  (0, 313)	0.15051763010223707
  (0, 253)	0.07691426560177803
  (0, 4490)	0.22297527577721402
  (0, 4016)	0.36610433296474426
  (0, 2790)	0.32534117902221965
  (0, 1634)	0.5683196937184768
  (0, 749)	0.3024862337311738
  (0, 313)	0.4488238750480173
  (0, 253)	0.11467413721042026
  (0, 229)	0.28517457212335917


# Category Classifier

In [22]:
#Initialize Classifier Model
# One-vs-rest over the categories, each with a LinearSVC wrapped in
# CalibratedClassifierCV; predict_proba is called on this model further down.
category_lsvc = OneVsRestClassifier(CalibratedClassifierCV(LinearSVC()))

In [23]:
#Train Model
# Only the category vector (first element of each label pair) is a target here
category_train_targets = [label_pair[0] for label_pair in label_train]
category_lsvc.fit(data_train_tf, category_train_targets)

OneVsRestClassifier(estimator=CalibratedClassifierCV(base_estimator=LinearSVC(C=1.0,
                                                                              class_weight=None,
                                                                              dual=True,
                                                                              fit_intercept=True,
                                                                              intercept_scaling=1,
                                                                              loss='squared_hinge',
                                                                              max_iter=1000,
                                                                              multi_class='ovr',
                                                                              penalty='l2',
                                                                              random_state=None,
                                                     

In [24]:
#Test Model
# Hard predictions and per-category probabilities for the test split;
# the probabilities are thresholded by the cells below.
category_prediction = category_lsvc.predict(data_test_tf)
category_probability = category_lsvc.predict_proba(data_test_tf)

In [25]:
#Functions for cell below
def is_over_threshold(threshold,input_list):
    """Return a 3-element 0/1 array marking which of the first three values of
    ``input_list`` are strictly greater than ``threshold``."""
    flags = [1 if input_list[position] > threshold else 0 for position in range(3)]
    return np.array(flags)
def compute_sub_accuracy(label,output):
    """Print accuracy, precision and recall for each of the three categories.

    Args:
        label: 2-D array-like of true 0/1 indicators, one column per category;
            the columns are reported as Audio, Gameplay, Graphics.
        output: predicted 0/1 indicators laid out like ``label``.

    A metric whose denominator is zero is reported as nan.
    """
    truth = np.asarray(label)
    predicted = np.asarray(output)
    row_templates = (
        "Audio    \t Accuracy: {} \tPrecision: {} \t Recall: {}",
        "Gameplay \t Accuracy: {} \tPrecision: {} \t Recall: {}",
        "Graphics \t Accuracy: {} \tPrecision: {} \t Recall: {}",
    )
    print()
    print("Sub Accuracy")
    for column, template in enumerate(row_templates):
        # labels=[0, 1] forces a 2x2 matrix even when a column contains a single
        # class; otherwise the four-way unpacking below fails.
        tn, fp, fn, tp = confusion_matrix(truth[:, column], predicted[:, column], labels=[0, 1]).ravel()
        total = tp + tn + fp + fn
        accuracy  = (tp + tn) / total if total else float('nan')
        precision = tp / (tp + fp) if (tp + fp) else float('nan')
        recall    = tp / (tp + fn) if (tp + fn) else float('nan')
        print(template.format(round(accuracy,4),round(precision,4),round(recall,4)))


def print_testing(threshold_list=(.3,.4,.5,.6,.7)):
    """Print the category classifier's test metrics for several probability thresholds.

    Reads the module-level ``label_test`` and ``category_probability``. For each
    threshold the probabilities are turned into 0/1 labels with
    ``is_over_threshold`` and accuracy, micro precision, Hamming loss and the
    per-category breakdown are printed.

    Args:
        threshold_list: thresholds to evaluate; defaults to 0.3 through 0.7.
    """
    category_label_test = np.array(list(i[0] for i in label_test))
    for temp_treshold in threshold_list:
        output_labels = [is_over_threshold(temp_treshold, probabilities) for probabilities in category_probability]

        print("Treshold: \t{}".format(temp_treshold))
        print("Accuracy: \t{}".format(round(accuracy_score(category_label_test,output_labels),4)))
        # round() was missing here: the 4 was handed to format() as an unused
        # extra argument, so the precision was printed unrounded.
        print("Precision:\t {}".format(round(precision_score(category_label_test,output_labels,average="micro"),4)))
        ## micro = global_tp/(global_tp+global_fp)
        ## macro = mean over categories of tp/(tp+fp)
        print("Hamming Loss:\t {}".format(round(hamming_loss(category_label_test,output_labels),4)))
        compute_sub_accuracy(category_label_test,output_labels)
        print()


In [26]:
# Report the metrics for every threshold, then the sizes of the train and test splits
print_testing()
print(len(data_train))
print(len(data_test))

Treshold: 	0.3
Accuracy: 	0.5499
Precision:	 0.714175654853621
Hamming Loss:	 0.1919

Sub Accuracy
Audio    	 Accuracy: 0.8687 	Precision: 0.8182 	 Recall: 0.9431
Gameplay 	 Accuracy: 0.7486 	Precision: 0.6706 	 Recall: 0.8622
Graphics 	 Accuracy: 0.807 	Precision: 0.6057 	 Recall: 0.7316

Treshold: 	0.4
Accuracy: 	0.6128
Precision:	 0.7644483362521891
Hamming Loss:	 0.174

Sub Accuracy
Audio    	 Accuracy: 0.8765 	Precision: 0.8406 	 Recall: 0.9248
Gameplay 	 Accuracy: 0.7912 	Precision: 0.7483 	 Recall: 0.8045
Graphics 	 Accuracy: 0.8103 	Precision: 0.6348 	 Recall: 0.632

Treshold: 	0.5
Accuracy: 	0.6117
Precision:	 0.7870646766169154
Hamming Loss:	 0.1841

Sub Accuracy
Audio    	 Accuracy: 0.8597 	Precision: 0.8536 	 Recall: 0.8633
Gameplay 	 Accuracy: 0.798 	Precision: 0.7874 	 Recall: 0.7519
Graphics 	 Accuracy: 0.7901 	Precision: 0.6222 	 Recall: 0.4848

Treshold: 	0.6
Accuracy: 	0.6061
Precision:	 0.8299881936245572
Hamming Loss:	 0.1908

Sub Accuracy
Audio    	 Accuracy: 0.840

# Polarity Classifier

In [27]:
#Filter reviews that passed the initial classification
var_threshold = .6
# Split the test labels into their category and polarity halves
category_label_test = np.array([label_pair[0] for label_pair in label_test])
pol_label_test = np.array([label_pair[1] for label_pair in label_test])
# 0/1 category predictions at the chosen probability threshold
output_labels = [is_over_threshold(var_threshold, probabilities) for probabilities in category_probability]

In [28]:
#Count reviews with "X" category
# label_pair[0] is the category vector; a truthy flag means the review has that category
t_total_audio    = sum(1 for label_pair in label_test if label_pair[0][0])
t_total_gameplay = sum(1 for label_pair in label_test if label_pair[0][1])
t_total_graphics = sum(1 for label_pair in label_test if label_pair[0][2])

In [29]:
# Per category slot (0 = audio, 1 = gameplay, 2 = graphics): the test reviews
# whose true AND predicted flag are both 1, with their true polarity.
_passed_texts = ([], [], [])
_passed_polarities = ([], [], [])

for review_text, true_flags, predicted_flags, true_polarity in zip(data_test,category_label_test,output_labels,pol_label_test):
    for slot in range(3):
        if true_flags[slot] == predicted_flags[slot] == 1:
            _passed_texts[slot].append(review_text)
            _passed_polarities[slot].append(true_polarity[slot])

test_audio_passed_text, test_gameplay_passed_text, test_graphics_passed_text = _passed_texts
test_audio_passed_pol, test_gameplay_passed_pol, test_graphics_passed_pol = _passed_polarities

print("Passed Audio:\t\t {} out of {} ".format(len(test_audio_passed_text),t_total_audio))
print("Passed Gameplay:\t {} out of {}".format(len(test_gameplay_passed_text),t_total_gameplay))
print("Passed Graphics:\t {} out of {}".format(len(test_graphics_passed_text),t_total_graphics))

Passed Audio:		 352 out of 439 
Passed Gameplay:	 271 out of 399
Passed Graphics:	 80 out of 231


In [30]:
#Initialize Values
# Per-category containers for the training texts and their polarity labels
audio_data_train_text = []
audio_data_train_polarity = []

graphics_data_train_text = []
graphics_data_train_polarity = []

gameplay_data_train_text = []
gameplay_data_train_polarity = []

In [31]:
#Transfer values
# The lists are (re)created here so that re-running this cell cannot append
# the training reviews a second time.
audio_data_train_text, audio_data_train_polarity = [], []
gameplay_data_train_text, gameplay_data_train_polarity = [], []
graphics_data_train_text, graphics_data_train_polarity = [], []

# Each label is [category vector, polarity list]; both use the slot order
# audio, gameplay, graphics. A review goes into every category it belongs to.
for review_text, (temp_classifications, temp_polarity) in zip(data_train,label_train):
    if temp_classifications[0] == 1:
        audio_data_train_text.append(review_text)
        audio_data_train_polarity.append(temp_polarity[0])
    if temp_classifications[1] == 1:
        gameplay_data_train_text.append(review_text)
        gameplay_data_train_polarity.append(temp_polarity[1])
    if temp_classifications[2] == 1:
        graphics_data_train_text.append(review_text)
        graphics_data_train_polarity.append(temp_polarity[2])

In [32]:
#Vectorization
def _vectorize_category(train_texts, test_texts):
    """Fit a unigram TF-IDF vectorizer on ``train_texts`` and return
    (vectorizer, transformed train texts, transformed test texts)."""
    vectorizer = TfidfVectorizer(ngram_range=(1,1))
    vectorizer.fit(train_texts)
    return vectorizer, vectorizer.transform(train_texts), vectorizer.transform(test_texts)

#Audio
pol_audio_tf, audio_data_train_tf, audio_data_test_tf = _vectorize_category(
    audio_data_train_text, test_audio_passed_text)

#Gameplay
pol_gameplay_tf, gameplay_data_train_tf, gameplay_data_test_tf = _vectorize_category(
    gameplay_data_train_text, test_gameplay_passed_text)

#Graphics
pol_graphics_tf, graphics_data_train_tf, graphics_data_test_tf = _vectorize_category(
    graphics_data_train_text, test_graphics_passed_text)



In [33]:
def count_output(temp_list):
    """Print how many entries of ``temp_list`` equal 1 (positive), 0 (neutral)
    and -1 (negative), followed by a blank line. Other values are ignored."""
    polarity_values = (1, 0, -1)
    tallies = [0, 0, 0]   # positive, neutral, negative
    for entry in temp_list:
        for slot, polarity_value in enumerate(polarity_values):
            if entry == polarity_value:
                tallies[slot] += 1
                break

    print("Positive: {}"    .format(tallies[0]))
    print("Neutral: {}"     .format(tallies[1]))
    print("Negative: {}"    .format(tallies[2]))
    print()
def pol_output(pred_output,label):
    """Print how often each "<label>_<prediction>" pair occurs, one
    "<pair> <count>" line per pair, sorted by the pair text."""
    pair_counts = {}
    for actual, predicted in zip(label, pred_output):
        pair_key = "{}_{}".format(actual, predicted)
        pair_counts[pair_key] = pair_counts.get(pair_key, 0) + 1

    for pair_key in sorted(pair_counts):
        print("{} {}".format(pair_key, pair_counts[pair_key]))

In [34]:
def _train_and_predict_polarity(train_tf, train_polarity, test_tf):
    """Fit a calibrated LinearSVC on one category's training data and return
    (fitted model, predictions for ``test_tf``)."""
    model = CalibratedClassifierCV(LinearSVC(multi_class='ovr'))
    model.fit(train_tf, train_polarity)
    return model, model.predict(test_tf)

# One polarity model per category, trained in the order audio, gameplay, graphics
pol_audio_lsvc, pol_audio_output = _train_and_predict_polarity(
    audio_data_train_tf, audio_data_train_polarity, audio_data_test_tf)

pol_gameplay_lsvc, pol_gameplay_output = _train_and_predict_polarity(
    gameplay_data_train_tf, gameplay_data_train_polarity, gameplay_data_test_tf)

pol_graphics_lsvc, pol_graphics_output = _train_and_predict_polarity(
    graphics_data_train_tf, graphics_data_train_polarity, graphics_data_test_tf)

In [35]:
def count_specific_pol(category,pol_list):
    """Print ``category``, then one line with the number of entries of
    ``pol_list`` equal to 1 (POS), 0 (NEU) and -1 (NEG)."""
    print(category)
    polarity_values = (1, 0, -1)
    tallies = [0, 0, 0]   # POS, NEU, NEG
    for entry in pol_list:
        for slot, polarity_value in enumerate(polarity_values):
            if entry == polarity_value:
                tallies[slot] += 1
                break
    print("POS: {} NEU: {} NEG: {}".format(tallies[0],tallies[1],tallies[2]))
# Polarity distribution of every category's training labels and passed test labels
for split_name, polarity_values in (
    ("audio_train", audio_data_train_polarity),
    ("audio_test", test_audio_passed_pol),
    ("gameplay_train", gameplay_data_train_polarity),
    ("gameplay_test", test_gameplay_passed_pol),
    ("graphics_train", graphics_data_train_polarity),
    ("graphics_test", test_graphics_passed_pol),
):
    count_specific_pol(split_name, polarity_values)

audio_train
POS: 886 NEU: 103 NEG: 34
audio_test
POS: 308 NEU: 35 NEG: 9
gameplay_train
POS: 578 NEU: 288 NEG: 68
gameplay_test
POS: 148 NEU: 103 NEG: 20
graphics_train
POS: 465 NEU: 69 NEG: 44
graphics_test
POS: 64 NEU: 8 NEG: 8


In [36]:
# Polarity accuracy per category, measured on the test reviews that passed the category filter
for heading, true_polarity, predicted_polarity in (
    ("Audio", test_audio_passed_pol, pol_audio_output),
    ("Gameplay", test_gameplay_passed_pol, pol_gameplay_output),
    ("Graphics", test_graphics_passed_pol, pol_graphics_output),
):
    print(heading)
    print(accuracy_score(true_polarity, predicted_polarity))

Audio
0.9289772727272727
Gameplay
0.7084870848708487
Graphics
0.85


In [37]:
# Example training label: [binarized category vector, polarity list]
print(label_train[0])

[array([0, 0, 1]), [-2, -2, 1]]
