In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import requests
import os

# Building the Soft-SVM

In [2]:
def gradient_comp(X,y,C,w0):
    summands_of_X = X[X.dot(w0) * y <= 1]
    summands_of_y = y[summands_of_X.index]
#     Y = []
#     for i in range(summands_of_X.shape[1]):
#         Y.append(list(summands_of_y))
#     summands = -1*np.multiply(pd.DataFrame(Y).values.T,summands_of_X)
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*summands.sum()
    return gradient

In [3]:
def stochastic_gradient_comp(X,y,C,w0,batch):
    Xsamp = X.sample(batch)
    ysamp = y[Xsamp.index]
    summands_of_X = Xsamp[Xsamp.dot(w0) * ysamp <= 1]
    summands_of_y = ysamp[summands_of_X.index]
    #Y = []
    #for i in range(summands_of_X.shape[1]):
        #Y.append(list(summands_of_y))
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    #summands = -1*np.multiply(pd.DataFrame(Y).values.T,summands_of_X)
    gradient = w0 + C*(1/batch)*summands.sum()
    return gradient

In [4]:
def soft_SVM_training(X,y,C,w0,eps,lr,n,batch):
    # X is training data
    # y is labels
    # C is penalty of how hard to be
    # w is initial weights vector (np.array)
    # eps is convergence criterion
    # lr is learning rate
    
    # Computing gradient of L
    i = 0
    gradw = stochastic_gradient_comp(X,y,C,w0,batch)
    w1 = w0 - lr*gradw
    while(np.linalg.norm(w0-w1) > eps):
        w0 = w1
        w1 = w0 - lr*gradient_comp(X,y,C,w0)
        i = i + 1
        if(i%10==0):
            print(i)
        if i == n:
            break
    print(np.linalg.norm(w0-w1))
    return w1

In [5]:
def Testing_soft_SVM(X,y,w):
    # X is the X_test set (in the form of the training set). There should
    # be a column of ones at the same spot as there is in the training.
    # y is the y_test actual values
    # w is the out put weights from the soft_SVM_training
    vals = X.values.dot(w)
    predictions = pd.Series((vals/abs(vals)).astype(int))
    TP = sum(y[(predictions[(predictions > 0)].index)] > 0)
    TN = sum(y[predictions[(predictions < 0)].index] < 0)
    FP = sum(y[predictions[(predictions > 0)].index] < 0)
    FN = sum(y[predictions[(predictions < 0)].index] > 0)
    print("TP",TP)
    print("TN",TN)
    print("FP",FP)
    print("FN",FN)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    precision = TP/(TP+FP)
    recall = TP/(TP+FN)
    f1 = 2*(precision*recall)/(precision+recall)
    return {"accuracy":accuracy,"precision":precision,"recall":recall, "f1":f1}
    #return {"accuracy":accuracy,"recall":recall}
    

# Train

In [6]:
os.chdir("..")

In [7]:
training = pd.read_csv("tf_idf_train2.csv")

In [8]:
training = training.drop(["Unnamed: 0"],axis=1)
training.head()

Unnamed: 0,life,comedy,whole,reality,cartoon,good,also,stories,behaved,cross,...,amazons-dr,now-over,creature-basically,mud-wrestling,multi-sexual,barabarian,gamorrean,d-grade,highly-entertaining,Label
0,1.734095,2.38033,2.250752,3.455865,4.257334,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,2.38033,0.0,0.0,0.0,2.900542,2.707422,3.329807,7.082109,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.675593,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,1.353711,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [None]:
testing = pd.read_csv("tf_idf_test2.csv").drop(["Unnamed: 0"],axis=1)
testing.head()

In [21]:
intersection = list(set(training.columns).intersection(set(testing.columns)))

In [23]:
X_train = training[intersection].drop("Label",axis=1)
y_train = training["Label"]

In [24]:
X_test = testing[intersection].drop("Label",axis=1)
y_test = testing["Label"]

In [25]:
X_train.head()

Unnamed: 0,commission,b-actor,communicative,humblest,paycheck,hyena,brits,louvre,imitators,gonna,...,hospitals,dishonored,cutaway,haaga,sagging,spurned,amazement,typed,assassin,shutter
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
X_test.head()

Unnamed: 0,commission,b-actor,communicative,humblest,paycheck,hyena,brits,louvre,imitators,gonna,...,hospitals,dishonored,cutaway,haaga,sagging,spurned,amazement,typed,assassin,shutter
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
w = soft_SVM_training(X_train,y_train,30,X_train.mean().T,10**-3,0.05,200,128)

10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
30142.37849611161


# Test

In [32]:
Testing_soft_SVM(X_test,y_test,w)

TP 11813
TN 3763
FP 8737
FN 687


  import sys


{'accuracy': 0.62304,
 'precision': 0.5748418491484185,
 'recall': 0.94504,
 'f1': 0.7148562783661119}