In [1]:
import pandas as pd
import numpy as np
import os
import timeit


# Building the Soft-SVM

In [47]:
def gradient_comp(X,y,C,w0):
    summands_of_X = X[X.dot(w0) * y <= 1]
    summands_of_y = y[summands_of_X.index]
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*summands.sum()
    return gradient

In [11]:
def stochastic_gradient_comp(X,y,C,w0,batch):
    Xsamp = X.sample(batch)
    ysamp = y[Xsamp.index]
    summands_of_X = Xsamp[Xsamp.dot(w0) * ysamp <= 1]
    summands_of_y = ysamp[summands_of_X.index]
    
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*(1/batch)*summands.sum()
    return gradient

In [96]:
def soft_SVM_training(X,y,C,w0,eps,lr,n,batch):
    # X is training data
    # y is labels
    # C is penalty of how hard to be
    # w is initial weights vector (np.array)
    # eps is convergence criterion
    # lr is learning rate
    
    # Computing gradient of L
    i = 0
    gradw = None
    if batch == None:
        gradw = gradient_comp(X,y,C,w0)
    else:
        gradw = stochastic_gradient_comp(X,y,C,w0,batch)
    w1 = w0 - lr*gradw
    while(np.linalg.norm(w0-w1) > eps):
        w0 = w1
        gradw = None
        if batch == None:
            gradw = gradient_comp(X,y,C,w0)
        else:
            gradw = stochastic_gradient_comp(X,y,C,w0,batch)
        w1 = w0 - lr*gradw
        i = i + 1
        if(i%10==0 and n<=100):
            print(i)
        elif(i%1000==0 and n>=1000):
            print(i)
        if i == n:
            break
    print(np.linalg.norm(w0-w1))
    return w1

In [87]:
def Testing_soft_SVM(X,y,w):
    # X is the X_test set (in the form of the training set). There should
    # be a column of ones at the same spot as there is in the training.
    # y is the y_test actual values
    # w is the out put weights from the soft_SVM_training
    vals = X.values.dot(w)
    predictions = pd.Series((vals/abs(vals)).astype(int))
    TP = sum(y[(predictions[(predictions > 0)].index)] > 0)
    TN = sum(y[predictions[(predictions < 0)].index] < 0)
    FP = sum(y[predictions[(predictions > 0)].index] < 0)
    FN = sum(y[predictions[(predictions < 0)].index] > 0)
#     print("TP",TP)
#     print("TN",TN)
#     print("FP",FP)
#     print("FN",FN)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    #precision = TP/(TP+FP)
    #recall = TP/(TP+FN)
    #f1 = 2*(precision*recall)/(precision+recall)
    #return {"accuracy":accuracy,"precision":precision,"recall":recall, "f1":f1}
    #return {"accuracy":accuracy,"recall":recall}
    return accuracy
    

In [6]:
def produce_confusion_matrix(y_pred, y_test):
    res = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

    res_positives = res[res["Actual"] == 1]
    res_negatives = res[res["Actual"] == -1]

    positives_dict = dict(res_positives["Prediction"].value_counts(normalize=True))
    TPs = positives_dict[1]
    FNs = positives_dict[-1]

    negatives_dict = dict(res_negatives["Prediction"].value_counts(normalize=True))
    TNs = negatives_dict[-1]
    FPs = negatives_dict[1]

    positives = pd.Series([TPs, FNs])
    negatives = pd.Series([FPs, TNs])

    confusion_matrix = pd.DataFrame({"Actually Positive": positives, "Actually Negative": negatives})
    confusion_matrix.rename({0: "Prediced Positive", 1: "Predicted Negative"}, inplace=True)
    return confusion_matrix

# Train and Test (Feature Set #1)

In [35]:
df_train = pd.read_csv("train1.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test1.csv").drop("Unnamed: 0",axis=1)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

In [38]:
#X_train_normalized = X_train.divide(np.sqrt((X_train**2).sum()), axis=1)
#X_test_normalized = X_test.divide(np.sqrt((X_test**2).sum()), axis=1)
X_train_sd = (X_train - X_train.mean())/(X_train.std())
X_test_sd = (X_test - X_train.mean())/(X_train.std())

In [52]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  25,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.0007001081792641864
TP 8865
TN 9378
FP 3122
FN 3635
Accuracy: 0.72972
Time (minutes) elapsed for this cell: 0.43486447876663686


In [53]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  150,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.0036440813969985026
TP 8867
TN 9374
FP 3126
FN 3633
Accuracy: 0.72964
Time (minutes) elapsed for this cell: 0.43294754581681144


In [54]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  1,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
5.200000983334727e-07
TP 8739
TN 9441
FP 3059
FN 3761
Accuracy: 0.7272
Time (minutes) elapsed for this cell: 0.12406399741666974


In [55]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  1000,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.006075658060507137
TP 8990
TN 9281
FP 3219
FN 3510
Accuracy: 0.73084
Time (minutes) elapsed for this cell: 0.43654932163335614


In [58]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  10000,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.0001,
                  20000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
0.019695184945126284
TP 8900
TN 9347
FP 3153
FN 3600
Accuracy: 0.72988
Time (minutes) elapsed for this cell: 0.8655917999999171


## Adding Interactions

In [60]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()
y_train2 = y_train.copy()
y_test2 = y_test.copy()

In [66]:
X_train2["Interaction_posc_negc"] = X_train2.Positive_counts*X_train2.Negative_counts
X_test2["Interaction_posc_negc"] = X_test2.Positive_counts*X_test2.Negative_counts
X_train_sd2 = (X_train2 - X_train2.mean())/(X_train2.std())
X_test_sd2 = (X_test2 - X_train2.mean())/(X_train2.std())

In [74]:

start_time = timeit.default_timer()

weight_sd2 = soft_SVM_training(X_train_sd2,
                  y_train,
                  150,
                  np.array(X_train_sd2.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd2,y_test,weight_sd2)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.0006113793043249433
TP 9140
TN 9239
FP 3261
FN 3360
Accuracy: 0.73516
Time (minutes) elapsed for this cell: 0.4422957272666584


# Squaring Terms

In [75]:
X_train3 = X_train2.copy()
X_test3 = X_test2.copy()
y_train3 = y_train2.copy()
y_test3 = y_test2.copy()

In [78]:
X_train3["Positive_counts2"] = X_train3.Positive_counts**2
X_train3["Negative_counts2"] = X_train3.Negative_counts**2
X_test3["Positive_counts2"] = X_test3.Positive_counts**2
X_test3["Negative_counts2"] = X_test3.Negative_counts**2
X_train_sd3 = (X_train3 - X_train3.mean())/(X_train3.std())
X_test_sd3 = (X_test3 - X_train3.mean())/(X_train3.std())

In [81]:
start_time = timeit.default_timer()

weight_sd3 = soft_SVM_training(X_train_sd3,
                  y_train,
                  150,
                  np.array(X_train_sd3.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.0002945298976926046
TP 9104
TN 9275
FP 3225
FN 3396
Accuracy: 0.73516
Time (minutes) elapsed for this cell: 0.44680415056670125


# Cubing Terms

In [82]:
X_train4 = X_train3.copy()
X_test4 = X_test3.copy()
y_train4 = y_train3.copy()
y_test4 = y_test3.copy()

In [83]:
X_train4["Positive_counts3"] = X_train4.Positive_counts**3
X_train4["Negative_counts3"] = X_train4.Negative_counts**3
X_test4["Positive_counts3"] = X_test4.Positive_counts**3
X_test4["Negative_counts3"] = X_test4.Negative_counts**3
X_train_sd4 = (X_train4 - X_train4.mean())/(X_train4.std())
X_test_sd4 = (X_test4 - X_train4.mean())/(X_train4.std())

In [85]:
start_time = timeit.default_timer()

weight_sd4 = soft_SVM_training(X_train_sd4,
                  y_train,
                  150,
                  np.array(X_train_sd4.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd4,y_test,weight_sd4)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.0006261287110735965
TP 9136
TN 9229
FP 3271
FN 3364
Accuracy: 0.7346
Time (minutes) elapsed for this cell: 0.4537553986000906


**Hardly any improvement adding polynomial terms**

### Trying different values of C (using quadratic, because it has the best accuracy)

In [88]:
start_time = timeit.default_timer()
Cs = [1,5,10,25,50,150,350,500,1000,1500,1750]
accs = []
times = []
for c in Cs:
    weight_sd3 = soft_SVM_training(X_train_sd3,
                      y_train,
                      c,
                      np.array(X_train_sd3.mean()),
                      10**-6,
                      0.0001,
                      20000,
                      1000)
    acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
    accs.append(acc)
    times.append(timeit.default_timer() - start_time)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
5.738556309505917e-07
10000
20000
2.3242373066185544e-05
10000
20000
2.548224789254681e-05
10000
20000
0.0001656425371538648
10000
20000
0.0002773862394569442
10000
20000
0.0014086715312946248
10000
20000
0.000810887183178492
10000
20000
0.002435974741337127
10000
20000
0.004290685626044838
10000
20000
0.004586156074943308
10000
20000
0.0050250634135746266
Time (minutes) elapsed for this cell: 9.858914132850138


In [89]:
pd.DataFrame({"C":Cs,"Accuracy":accs,"Time":times})

Unnamed: 0,C,Accuracy,Time
0,1,0.72452,50.142342
1,5,0.72972,103.904405
2,10,0.73016,158.245718
3,25,0.7316,212.474885
4,50,0.73504,266.753621
5,150,0.7352,320.914402
6,350,0.73472,375.090945
7,500,0.7348,429.015493
8,1000,0.73484,483.153378
9,1500,0.7346,537.326285


In [98]:
# No batch
start_time = timeit.default_timer()

weight_sd4 = soft_SVM_training(X_train_sd3,
                  y_train,
                  1000,
                  np.array(X_train_sd3.mean()),
                  10**-6,
                  0.000001,
                  150000,
                  None)
acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
11.933529654333968
Accuracy: 0.73424
Time (minutes) elapse

Had the same accuracy even when the difference in error between two sequential iterations was more than 100

# Training and Test (Feature Test 2)

In [7]:
start_time = timeit.default_timer()

df_train = pd.read_csv("train2.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test2.csv").drop("Unnamed: 0",axis=1)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

Time (minutes) elapsed for this cell: 0.5163965353000094


In [8]:
# remove columns that are too sparse, allows LDA to perform matrix algebra
cols_to_drop = []
df = df_train.drop("Label", axis=1)
for col in df:
    num_vals = len(df) - df[col].value_counts()[0]
    if num_vals < 3:
        cols_to_drop.append(col)
len(cols_to_drop)

1039

In [9]:
df_train.drop(cols_to_drop, axis=1, inplace=True)
df_test.drop(cols_to_drop, axis=1, inplace=True)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

### SVM Classifier results

In [20]:
start_time = timeit.default_timer()

weights = soft_SVM_training(X_train,
                  y_train,
                  25,
                  np.array(X_train.mean()),
                  10^-3,
                  0.001,
                  5000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
0.0038380259567057187
TP 10550
TN 10165
FP 2335
FN 1950
Accuracy: 0.8286
Time (minutes) elapsed for this cell: 5.8893358464166035


  import sys


In [21]:
vals = X_test.values.dot(weights)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

  


Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.844,0.1868
Predicted Negative,0.15568,0.8132


In [27]:
start_time = timeit.default_timer()

weights2 = soft_SVM_training(X_train,
                  y_train,
                  1500,
                  np.array(X_train.mean()),
                  10**-3,
                  0.001,
                  5000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights2)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
0.213008348268269
TP 11449
TN 10628
FP 1872
FN 1051
Accuracy: 0.88308
Time (minutes) elapsed for this cell: 5.59931705813336


  import sys


In [28]:
vals = X_test.values.dot(weights2)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

  


Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.91592,0.14976
Predicted Negative,0.08376,0.85024


In [29]:
start_time = timeit.default_timer()

weights3 = soft_SVM_training(X_train,
                  y_train,
                  1500,
                  np.array(X_train.mean()),
                  10**-3,
                  0.0001,
                  5000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
0.0160724125137486
TP 11070
TN 10903
FP 1597
FN 1430
Accuracy: 0.87892
Time (minutes) elapsed for this cell: 5.591740460850027


  import sys


In [31]:
start_time = timeit.default_timer()

weights3 = soft_SVM_training(X_train,
                  y_train,
                  1500,
                  np.array(X_train.mean()),
                  10**-3,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.020859643567383616
TP 11098
TN 10989
FP 1511
FN 1402
Accuracy: 0.88348
Time (minutes) elapsed for this cell: 11.058828259466827


  import sys


In [33]:
vals = X_test.values.dot(weights3)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

  


Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.88784,0.12088
Predicted Negative,0.11184,0.87912


# -------------------------------------------------------------------------------------------