In [1]:
import pandas as pd
import numpy as np
import os
import timeit


# Building the Soft-SVM

In [2]:
def gradient_comp(X,y,C,w0):
    summands_of_X = X[X.dot(w0) * y <= 1]
    summands_of_y = y[summands_of_X.index]
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*summands.sum()
    return gradient

In [3]:
def stochastic_gradient_comp(X,y,C,w0,batch):
    Xsamp = X.sample(batch)
    ysamp = y[Xsamp.index]
    summands_of_X = Xsamp[Xsamp.dot(w0) * ysamp <= 1]
    summands_of_y = ysamp[summands_of_X.index]
    
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*(1/batch)*summands.sum()
    return gradient

In [14]:
def soft_SVM_training(X,y,C,w0,eps,lr,n,batch):
    # X is training data
    # y is labels
    # C is penalty of how hard to be
    # w is initial weights vector (np.array)
    # eps is convergence criterion
    # lr is learning rate
    
    # Computing gradient of L
    i = 0
    gradw = None
    if batch == None:
        gradw = gradient_comp(X,y,C,w0)
    else:
        gradw = stochastic_gradient_comp(X,y,C,w0,batch)
    w1 = w0 - lr*gradw
    while(np.linalg.norm(w0-w1) > eps):
        w0 = w1
        gradw = None
        if batch == None:
            gradw = gradient_comp(X,y,C,w0)
        else:
            gradw = stochastic_gradient_comp(X,y,C,w0,batch)
        w1 = w0 - lr*gradw
        i = i + 1
        if(i%100==0 and n==100):
            print(i)
        elif(i%1000==0 and n>1000):
            print(i)
        if i == n:
            break
    print(np.linalg.norm(w0-w1))
    return w1

In [5]:
def Testing_soft_SVM(X,y,w):
    # X is the X_test set (in the form of the training set). There should
    # be a column of ones at the same spot as there is in the training.
    # y is the y_test actual values
    # w is the out put weights from the soft_SVM_training
    vals = X.values.dot(w)
    predictions = pd.Series((vals/abs(vals)).astype(int))
    TP = sum(y[(predictions[(predictions > 0)].index)] > 0)
    TN = sum(y[predictions[(predictions < 0)].index] < 0)
    FP = sum(y[predictions[(predictions > 0)].index] < 0)
    FN = sum(y[predictions[(predictions < 0)].index] > 0)
#     print("TP",TP)
#     print("TN",TN)
#     print("FP",FP)
#     print("FN",FN)
    accuracy = (TP+TN)/(TP+TN+FP+FN)
    #precision = TP/(TP+FP)
    #recall = TP/(TP+FN)
    #f1 = 2*(precision*recall)/(precision+recall)
    #return {"accuracy":accuracy,"precision":precision,"recall":recall, "f1":f1}
    #return {"accuracy":accuracy,"recall":recall}
    X.drop("ones",axis=1)
    return accuracy
    

In [16]:
def produce_confusion_matrix(y_pred, y_test):
    res = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

    res_positives = res[res["Actual"] == 1]
    res_negatives = res[res["Actual"] == -1]

    positives_dict = dict(res_positives["Prediction"].value_counts())
    TPs = positives_dict[1]
    FNs = positives_dict[-1]

    negatives_dict = dict(res_negatives["Prediction"].value_counts())
    TNs = negatives_dict[-1]
    FPs = negatives_dict[1]

    positives = pd.Series([TPs, FNs])
    negatives = pd.Series([FPs, TNs])

    confusion_matrix = pd.DataFrame({"Actually Positive": positives, "Actually Negative": negatives})
    confusion_matrix.rename({0: "Prediced Positive", 1: "Predicted Negative"}, inplace=True)
    return confusion_matrix

# Train and Test (Feature Set #1)

In [19]:
df_train = pd.read_csv("train1.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test1.csv").drop("Unnamed: 0",axis=1)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

In [20]:
#X_train_normalized = X_train.divide(np.sqrt((X_train**2).sum()), axis=1)
#X_test_normalized = X_test.divide(np.sqrt((X_test**2).sum()), axis=1)
X_train_sd = (X_train - X_train.mean())/(X_train.std())
X_test_sd = (X_test - X_train.mean())/(X_train.std())

X_train_sd["ones"] = [1]*len(y_train)
X_test_sd["ones"] = [1]*len(y_test)

In [9]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  25,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
0.0003953761107159405
Accuracy: 0.72972
Time (minutes) elapsed for this cell: 0.562403336816836


In [10]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  150,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
0.0039596215016009404
Accuracy: 0.73068
Time (minutes) elapsed for this cell: 0.5633023579003444


In [11]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  1,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
2.24416771227937e-05
Accuracy: 0.72744
Time (minutes) elapsed for this cell: 0.5637090736000876


In [12]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  1000,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
0.024428667489035594
Accuracy: 0.72976
Time (minutes) elapsed for this cell: 0.5649455571333722


In [13]:
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  10000,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.0001,
                  20000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
20000
0.04848468198627459
Accuracy: 0.73268
Time (minutes) elapsed for this cell: 1.1262075298165048


## Adding Interactions

In [21]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()
y_train2 = y_train.copy()
y_test2 = y_test.copy()

In [22]:
X_train2["Interaction_posc_negc"] = X_train2.Positive_counts*X_train2.Negative_counts
X_test2["Interaction_posc_negc"] = X_test2.Positive_counts*X_test2.Negative_counts
X_train_sd2 = (X_train2 - X_train2.mean())/(X_train2.std())
X_test_sd2 = (X_test2 - X_train2.mean())/(X_train2.std())

X_train_sd2["ones"] = [1]*len(y_train)
X_test_sd2["ones"] = [1]*len(y_test)

In [16]:

start_time = timeit.default_timer()

weight_sd2 = soft_SVM_training(X_train_sd2,
                  y_train,
                  150,
                  np.array(X_train_sd2.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd2,y_test,weight_sd2)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
0.0005271397520144507
Accuracy: 0.73452
Time (minutes) elapsed for this cell: 0.5795223824165684


# Squaring Terms

In [23]:
X_train3 = X_train2.copy()
X_test3 = X_test2.copy()
y_train3 = y_train2.copy()
y_test3 = y_test2.copy()

In [24]:
X_train3["Positive_counts2"] = X_train3.Positive_counts**2
X_train3["Negative_counts2"] = X_train3.Negative_counts**2
X_test3["Positive_counts2"] = X_test3.Positive_counts**2
X_test3["Negative_counts2"] = X_test3.Negative_counts**2
X_train_sd3 = (X_train3 - X_train3.mean())/(X_train3.std())
X_test_sd3 = (X_test3 - X_train3.mean())/(X_train3.std())

X_train_sd3["ones"] = [1]*len(y_train)
X_test_sd3["ones"] = [1]*len(y_test)

In [26]:
start_time = timeit.default_timer()

weight_sd3 = soft_SVM_training(X_train_sd3,
                  y_train,
                  150,
                  np.array(X_train_sd3.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.0002614761574683129
Accuracy: 0.73516
Time (minutes) elapsed for this cell: 1.0143996904832118


In [28]:
vals = X_test_sd3.values.dot(weight_sd3)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9106,3227
Predicted Negative,3394,9273


# Cubing Terms

In [20]:
X_train4 = X_train3.copy()
X_test4 = X_test3.copy()
y_train4 = y_train3.copy()
y_test4 = y_test3.copy()

In [21]:
X_train4["Positive_counts3"] = X_train4.Positive_counts**3
X_train4["Negative_counts3"] = X_train4.Negative_counts**3
X_test4["Positive_counts3"] = X_test4.Positive_counts**3
X_test4["Negative_counts3"] = X_test4.Negative_counts**3
X_train_sd4 = (X_train4 - X_train4.mean())/(X_train4.std())
X_test_sd4 = (X_test4 - X_train4.mean())/(X_train4.std())

X_train_sd4["ones"] = [1]*len(y_train)
X_test_sd4["ones"] = [1]*len(y_test)

In [22]:
start_time = timeit.default_timer()

weight_sd4 = soft_SVM_training(X_train_sd4,
                  y_train,
                  150,
                  np.array(X_train_sd4.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd4,y_test,weight_sd4)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
0.0006851612798893962
Accuracy: 0.73476
Time (minutes) elapsed for this cell: 0.6413198845499816


**Hardly any improvement adding polynomial terms**

### Trying different values of C (using quadratic, because it has the best accuracy)

In [23]:
start_time = timeit.default_timer()
Cs = [1,5,10,25,50,150,350,500,1000,1500,1750]
accs = []
times = []
for c in Cs:
    weight_sd3 = soft_SVM_training(X_train_sd3,
                      y_train,
                      c,
                      np.array(X_train_sd3.mean()),
                      10**-6,
                      0.0001,
                      20000,
                      1000)
    acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
    accs.append(acc)
    times.append(timeit.default_timer() - start_time)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

10000
20000
1.3833856746813569e-05
10000
20000
1.7326273052926234e-05
10000
20000
4.54845185444014e-05
10000
20000
7.102209554068752e-05
10000
20000
0.0002847981853030444
10000
20000
0.0013214326228951478
10000
20000
0.0022622196084515806
10000
20000
0.0011513591720359545
10000
20000
0.010812663750522386
10000
20000
0.0026494517265343294
10000
20000
0.00842969403700761
Time (minutes) elapsed for this cell: 16.006369200866416


In [24]:
pd.DataFrame({"C":Cs,"Accuracy":accs,"Time":times})

Unnamed: 0,C,Accuracy,Time
0,1,0.66716,74.162477
1,5,0.72984,149.137039
2,10,0.72976,223.663704
3,25,0.7306,298.410578
4,50,0.7328,372.33734
5,150,0.73516,446.681412
6,350,0.73468,535.802368
7,500,0.73472,634.018696
8,1000,0.73472,734.267346
9,1500,0.7348,860.694729


In [98]:
# No batch
start_time = timeit.default_timer()

weight_sd4 = soft_SVM_training(X_train_sd3,
                  y_train,
                  1000,
                  np.array(X_train_sd3.mean()),
                  10**-6,
                  0.000001,
                  150000,
                  None)
acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
51000
52000
53000
54000
55000
56000
57000
58000
59000
60000
61000
62000
63000
64000
65000
66000
67000
68000
69000
70000
71000
72000
73000
74000
75000
76000
77000
78000
79000
80000
81000
82000
83000
84000
85000
86000
87000
88000
89000
90000
91000
92000
93000
94000
95000
96000
97000
98000
99000
100000
101000
102000
103000
104000
105000
106000
107000
108000
109000
110000
111000
112000
113000
114000
115000
116000
117000
118000
119000
120000
121000
122000
123000
124000
125000
126000
127000
128000
129000
130000
131000
132000
133000
134000
135000
136000
137000
138000
139000
140000
141000
142000
143000
144000
145000
146000
147000
148000
149000
150000
11.933529654333968
Accuracy: 0.73424
Time (minutes) elapse

Had the same accuracy even when the difference in error between two sequential iterations was more than 100

# Training and Test (Feature Test 2)

In [7]:
start_time = timeit.default_timer()

df_train = pd.read_csv("train2.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test2.csv").drop("Unnamed: 0",axis=1)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

Time (minutes) elapsed for this cell: 0.7005592003998269


In [8]:
# remove columns that are too sparse, allows LDA to perform matrix algebra
cols_to_drop = []
df = df_train.drop("Label", axis=1)
for col in df:
    num_vals = len(df) - df[col].value_counts()[0]
    if num_vals < 3:
        cols_to_drop.append(col)
len(cols_to_drop)

1039

In [9]:
df_train.drop(cols_to_drop, axis=1, inplace=True)
df_test.drop(cols_to_drop, axis=1, inplace=True)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

In [10]:
X_train["ones"] = [1]*len(y_train)
X_test["ones"] = [1]*len(y_test)

### SVM Classifier results

In [29]:
start_time = timeit.default_timer()

weights = soft_SVM_training(X_train,
                  y_train,
                  25,
                  np.array(X_train.mean()),
                  10^-3,
                  0.001,
                  5000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

0.004091286810136984
Accuracy: 0.8276
Time (minutes) elapsed for this cell: 10.32927629704985


In [30]:
vals = X_test.values.dot(weights)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.86096,0.20576
Predicted Negative,0.13904,0.79424


In [11]:
start_time = timeit.default_timer()

weights2 = soft_SVM_training(X_train,
                  y_train,
                  1500,
                  np.array(X_train.mean()),
                  10**-3,
                  0.001,
                  5000,
                  500)
acc = Testing_soft_SVM(X_test,y_test,weights2)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
0.3295595874177372
Accuracy: 0.88412
Time (minutes) elapsed for this cell: 7.464184439599921


In [17]:
vals = X_test.values.dot(weights2)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,10968,1365
Predicted Negative,1532,11135


In [15]:
start_time = timeit.default_timer()

weights3 = soft_SVM_training(X_train,
                  y_train,
                  1500,
                  np.array(X_train.mean()),
                  10**-3,
                  0.0001,
                  5000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
0.015890762658256286
Accuracy: 0.8786
Time (minutes) elapsed for this cell: 8.620412674133453


In [31]:
start_time = timeit.default_timer()

weights3 = soft_SVM_training(X_train,
                  y_train,
                  1500,
                  np.array(X_train.mean()),
                  10**-3,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
0.020859643567383616
TP 11098
TN 10989
FP 1511
FN 1402
Accuracy: 0.88348
Time (minutes) elapsed for this cell: 11.058828259466827


  import sys


In [33]:
vals = X_test.values.dot(weights3)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

  


Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,0.88784,0.12088
Predicted Negative,0.11184,0.87912


# -------------------------------------------------------------------------------------------

On Top Adjectives

In [103]:
top5000 = np.load("top5000.npy")

In [109]:
topAdj = (list(set(X_train.columns).intersection(set(top5000))))

In [114]:
X_train = X_train[topAdj]
X_test = X_test[topAdj]

In [119]:
start_time = timeit.default_timer()

weights1 = soft_SVM_training(X_train,
                  y_train,
                  1250,
                  np.array(X_train.mean()),
                  10**-5,
                  0.0001,
                  50000,
                  1000)
acc = Testing_soft_SVM(X_test,y_test,weights1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
50000
0.013876290366606755
Accuracy: 0.87648
Time (minutes) elapsed for this cell: 30.938692168133276


  import sys
