In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import math
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score,hamming_loss

from skmultilearn.problem_transform import ClassifierChain

import pickle

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the train and test tables from CSV. Each table holds the feature
# columns together with the label columns that the next cell splits off.
# (File names suggest TF-IDF features without stemming/stop-word removal --
# confirm against the preprocessing notebook.)
data_train_nostemstop, data_test_nostemstop = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../Data/Data_train_nostemstop_tfidf.csv",
        "../Data/Data_test_nostemstop_tfidf.csv",
    )
)

In [3]:
# The twelve target columns, declared once so the train/test feature and label
# selections cannot drift apart. Order matters: 'HS' must stay first because
# later cells read column 0 of the predictions as the 'HS' label.
label_columns = [
    'HS', 'Abusive',
    'HS_Individual', 'HS_Group',
    'HS_Religion', 'HS_Race', 'HS_Physical', 'HS_Gender', 'HS_Other',
    'HS_Weak', 'HS_Moderate', 'HS_Strong',
]

# Features are every column that is not a label; targets are the label columns.
X_train_nostemstop = data_train_nostemstop.drop(label_columns, axis=1)
y_train_nostemstop = data_train_nostemstop[label_columns]

X_test_nostemstop = data_test_nostemstop.drop(label_columns, axis=1)
y_test_nostemstop = data_test_nostemstop[label_columns]

In [4]:
# Starting SVC hyperparameters. The search loop further down overwrites each
# of these names with the best candidate it finds for that parameter.
C, kernel, gamma = 10, "rbf", 0.1

In [5]:
# Wrap an SVC configured with the starting hyperparameters in a ClassifierChain.
# Note: the search loop below rebuilds `classifier_CC` before this instance is
# ever fitted.
classifier_CC = ClassifierChain(
    SVC(kernel=kernel, C=C, gamma=gamma)
)

In [12]:
# Candidate values tried for each SVC hyperparameter by the search loop.
# The int/float types are significant: str(value) becomes part of the pickled
# model's file name (e.g. "C_100.sav", "gamma_0.1.sav").
kernel_param = ["linear", "poly", "sigmoid", "rbf"]
C_param = [0.01, 0.1, 10, 100]
gamma_param = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

In [13]:
# Search grid keyed by SVC argument name. Insertion order is the order in which
# the search loop tunes the parameters: kernel first, then C, then gamma.
param = dict(
    kernel=kernel_param,
    C=C_param,
    gamma=gamma_param,
)

In [18]:
def labelSetAccuracy(y_true, y_pred):
    """Mean per-sample Jaccard score between true and predicted label sets.

    For every row, the label set is the set of column indices holding a
    non-zero value. The row score is ``|true & pred| / |true | pred|``; a row
    where both sets are empty scores 1. The result is the mean of the row
    scores.

    Parameters
    ----------
    y_true, y_pred : array-like of shape (n_samples, n_labels)
        Indicator matrices (anything ``np.asarray`` turns into a 2-D array:
        ndarray, nested lists, DataFrame). Sparse matrices must be densified
        by the caller, e.g. with ``.toarray()``.

    Returns
    -------
    numpy.float64
        Mean score over the rows (``nan`` when there are no rows).

    Raises
    ------
    ValueError
        If the inputs are not 2-D or their shapes differ.
    """
    true_mask = np.asarray(y_true) != 0
    pred_mask = np.asarray(y_pred) != 0

    if true_mask.ndim != 2 or pred_mask.ndim != 2:
        raise ValueError(
            "y_true and y_pred must be 2-D (n_samples, n_labels); got shapes "
            "{0} and {1}".format(true_mask.shape, pred_mask.shape)
        )
    if true_mask.shape != pred_mask.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape; got {0} and {1}".format(
                true_mask.shape, pred_mask.shape
            )
        )

    intersection = (true_mask & pred_mask).sum(axis=1)
    union = (true_mask | pred_mask).sum(axis=1)

    # Rows with an empty union (no true and no predicted labels) count as a
    # perfect match; only the remaining rows are divided, so there is never a
    # division by zero.
    scores = np.ones(union.shape[0], dtype=float)
    has_labels = union > 0
    scores[has_labels] = intersection[has_labels] / union[has_labels]
    return np.mean(scores)

In [19]:
# Greedy one-parameter-at-a-time search over the grids in `param`.
#
# Parameters are tuned in the dict's insertion order (kernel, C, gamma). While
# one parameter is swept, the others stay at their current value: the starting
# value from the config cell, or the best value chosen in an earlier sweep.
# Every fitted model is pickled under ../svm_model/ so later cells can reload
# it, and each candidate is scored with labelSetAccuracy on the test set.
#
# `param` is expected to contain exactly the keys "kernel", "C" and "gamma".
#
# NOTE(review): candidates are compared on the *test* set, so the reported
# test scores are optimistically biased. Consider a validation split or
# cross-validation for model selection.
current = {"kernel": kernel, "C": C, "gamma": gamma}

# File-name prefix for the pickled models of each sweep (kernel models have none).
model_prefix = {"kernel": "", "C": "C_", "gamma": "gamma_"}

accuracies = {}
for name, candidates in param.items():
    accuracies[name] = []
    for value in candidates:
        # Current settings with only the swept parameter replaced.
        trial = dict(current)
        trial[name] = value

        classifier_CC = ClassifierChain(SVC(**trial))
        classifier_CC.fit(X_train_nostemstop, y_train_nostemstop)

        # `with` guarantees the file is flushed and closed even if pickling fails.
        model_path = "../svm_model/" + model_prefix[name] + str(value) + ".sav"
        with open(model_path, 'wb') as model_file:
            pickle.dump(classifier_CC, model_file)

        pred = classifier_CC.predict(X_test_nostemstop)
        accuracies[name].append(
            labelSetAccuracy(y_test_nostemstop.values, pred.toarray())
        )

    print("acc " + name + " = ", accuracies[name])
    # argmax returns the first maximum, so ties go to the earlier candidate.
    current[name] = candidates[np.array(accuracies[name]).argmax()]
    print("best " + name + " = ", current[name])

# Publish the results under the notebook-level names used elsewhere.
kernel, C, gamma = current["kernel"], current["C"], current["gamma"]
acc_kernel, acc_C, acc_gamma = (
    accuracies["kernel"], accuracies["C"], accuracies["gamma"]
)

acc kernel =  [0.713036037784679, 0.39945652173913043, 0.7487580874741201, 0.7486073369565217]
best kernel =  sigmoid
acc C =  [0.39945652173913043, 0.4009239130434783, 0.7487580874741201, 0.6902910412353348]
best C =  10
acc gamma =  [0.39945652173913043, 0.4009239130434783, 0.6677140484817117, 0.7487580874741201, 0.47247064001819433, 0.39001520445134574, 0.3172634144237405]
best gamma =  0.1


In [26]:
# Reload the model that the search cell pickled for C=100 and predict on the
# training set. The file is opened in a `with` block so the handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook.
with open("../svm_model/C_100.sav", 'rb') as model_file:
    C_100 = pickle.load(model_file)
pred_C100 = C_100.predict(X_train_nostemstop)

In [29]:
# Label-set accuracy of the C=100 model on the training data.
labelSetAccuracy(
    y_train_nostemstop.values,
    pred_C100.toarray(),
)

0.8962068178222663

In [37]:
# Predict on the test set with the C=100 model and show its label-set accuracy.
pred_C100_test = C_100.predict(X_test_nostemstop)
labelSetAccuracy(
    y_test_nostemstop.values,
    pred_C100_test.toarray(),
)

0.6902910412353348

In [42]:
# Accuracy on the 'HS' label alone for the C=100 model. 'HS' is the first label
# column, so it is column 0 of the dense prediction matrix.
print(
    "acc HS C 100 (train) = ",
    accuracy_score(y_train_nostemstop['HS'], pred_C100.toarray()[:, 0]),
)
print(
    "acc HS C 100 (test) = ",
    accuracy_score(y_test_nostemstop['HS'], pred_C100_test.toarray()[:, 0]),
)

acc HS C 100 (train) =  0.9254756871035941
acc HS C 100 (test) =  0.79375


In [30]:
# Reload the model that the search cell pickled for gamma=100, predict on the
# training set and show the label-set accuracy. The file is opened in a `with`
# block so the handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook.
with open("../svm_model/gamma_100.sav", 'rb') as model_file:
    gamma_100 = pickle.load(model_file)
pred_gamma100 = gamma_100.predict(X_train_nostemstop)
labelSetAccuracy(y_train_nostemstop.values, pred_gamma100.toarray())

0.31856396187791536

In [38]:
# Predict on the test set with the gamma=100 model and show its label-set accuracy.
pred_gamma100_test = gamma_100.predict(X_test_nostemstop)
labelSetAccuracy(
    y_test_nostemstop.values,
    pred_gamma100_test.toarray(),
)

0.3172634144237405

In [43]:
# Accuracy on the 'HS' label alone for the gamma=100 model. 'HS' is the first
# label column, so it is column 0 of the dense prediction matrix.
print(
    "acc HS gamma 100 (train) = ",
    accuracy_score(y_train_nostemstop['HS'], pred_gamma100.toarray()[:, 0]),
)
print(
    "acc HS gamma 100 (test) = ",
    accuracy_score(y_test_nostemstop['HS'], pred_gamma100_test.toarray()[:, 0]),
)

acc HS gamma 100 (train) =  0.5530051344004833
acc HS gamma 100 (test) =  0.5483695652173913


In [36]:
# Reload the model that the search cell pickled for gamma=0.1, predict on the
# training set and show the label-set accuracy. The file is opened in a `with`
# block so the handle is closed.
# NOTE: pickle.load can execute arbitrary code -- only load files written by
# this notebook.
with open("../svm_model/gamma_0.1.sav", 'rb') as model_file:
    gamma_01 = pickle.load(model_file)
pred_gamma01 = gamma_01.predict(X_train_nostemstop)
labelSetAccuracy(y_train_nostemstop.values, pred_gamma01.toarray())

0.8294475965157029

In [39]:
# Predict on the test set with the gamma=0.1 model and show its label-set accuracy.
pred_gamma01_test = gamma_01.predict(X_test_nostemstop)
labelSetAccuracy(
    y_test_nostemstop.values,
    pred_gamma01_test.toarray(),
)

0.7487580874741201

In [44]:
# Accuracy on the 'HS' label alone for the gamma=0.1 model. 'HS' is the first
# label column, so it is column 0 of the dense prediction matrix.
print(
    "acc HS gamma 0.1 (train) = ",
    accuracy_score(y_train_nostemstop['HS'], pred_gamma01.toarray()[:, 0]),
)
print(
    "acc HS gamma 0.1 (test) = ",
    accuracy_score(y_test_nostemstop['HS'], pred_gamma01_test.toarray()[:, 0]),
)

acc HS gamma 0.1 (train) =  0.8752642706131079
acc HS gamma 0.1 (test) =  0.8230978260869565
