In [1]:
import pandas as pd
import numpy as np
import os
import timeit


# Building the Soft-SVM

In [2]:
def gradient_comp(X,y,C,w0):
    summands_of_X = X[X.dot(w0) * y <= 1]
    summands_of_y = y[summands_of_X.index]
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*summands.sum()
    return gradient

In [3]:
def stochastic_gradient_comp(X,y,C,w0,batch):
    Xsamp = X.sample(batch)
    ysamp = y[Xsamp.index]
    summands_of_X = Xsamp[Xsamp.dot(w0) * ysamp <= 1]
    summands_of_y = ysamp[summands_of_X.index]
    
    summands = -1*summands_of_X.multiply(summands_of_y,axis=0)
    gradient = w0 + C*(1/batch)*summands.sum()
    return gradient

In [4]:
def soft_SVM_training(X,y,C,w0,eps,lr,n,batch):
    # X is training data
    # y is labels
    # C is penalty of how hard to be
    # w is initial weights vector (np.array)
    # eps is convergence criterion
    # lr is learning rate
    # n is the number of iterations
    # batch is the subset size of X, if None then the entire data set will be used
    
    # Computing gradient of L
    i = 0
    gradw = None
    if batch == None:
        gradw = gradient_comp(X,y,C,w0)
    else:
        gradw = stochastic_gradient_comp(X,y,C,w0,batch)
    w1 = w0 - lr*gradw
    while(np.linalg.norm(w0-w1) > eps):
        w0 = w1
        gradw = None
        if batch == None:
            gradw = gradient_comp(X,y,C,w0)
        else:
            gradw = stochastic_gradient_comp(X,y,C,w0,batch)
        w1 = w0 - lr*gradw
        i = i + 1
        if i == n:
            break

    print("w_n - w_(n-1):", np.linalg.norm(w0-w1))
    return w1

In [5]:
def Testing_soft_SVM(X,y,w):
    # X is the X_test set (in the form of the training set). There should
    # be a column of ones at the same spot as there is in the training.
    # y is the y_test actual values
    # w is the out put weights from the soft_SVM_training
    vals = X.values.dot(w)
    predictions = pd.Series((vals/abs(vals)).astype(int))
    TP = sum(y[(predictions[(predictions > 0)].index)] > 0)
    TN = sum(y[predictions[(predictions < 0)].index] < 0)
    FP = sum(y[predictions[(predictions > 0)].index] < 0)
    FN = sum(y[predictions[(predictions < 0)].index] > 0)

    accuracy = (TP+TN)/(TP+TN+FP+FN)

    return accuracy
    

In [6]:
def produce_confusion_matrix(y_pred, y_test):
    res = pd.DataFrame({"Prediction": y_pred, "Actual": y_test})

    res_positives = res[res["Actual"] == 1]
    res_negatives = res[res["Actual"] == -1]

    positives_dict = dict(res_positives["Prediction"].value_counts())
    TPs = positives_dict[1]
    FNs = positives_dict[-1]

    negatives_dict = dict(res_negatives["Prediction"].value_counts())
    TNs = negatives_dict[-1]
    FPs = negatives_dict[1]

    positives = pd.Series([TPs, FNs])
    negatives = pd.Series([FPs, TNs])

    confusion_matrix = pd.DataFrame({"Actually Positive": positives, "Actually Negative": negatives})
    confusion_matrix.rename({0: "Prediced Positive", 1: "Predicted Negative"}, inplace=True)
    return confusion_matrix

# Stanford Given Features

In [7]:
training = pd.read_csv("standford_train.csv").drop(["Unnamed: 0"],axis=1)
testing = pd.read_csv("stanford_test.csv").drop(["Unnamed: 0"],axis=1)

In [8]:
X_train = training.drop("Label",axis=1)
X_train["ones"] = [1]*len(X_train)
y_train = training["Label"]

In [9]:
X_test = testing.drop("Label",axis=1)
X_test["ones"] = [1]*len(X_test)
y_test = testing["Label"]

In [10]:
start_time = timeit.default_timer()
Cs = [1,30,100]
accs = []
times = []
weights = []
for c in Cs:
    w = soft_SVM_training(X_train,
                      y_train,
                      c,
                      np.array(X_train.mean()),
                      10**-6,
                      0.001,
                      200,
                      128)
    acc = Testing_soft_SVM(X_test,y_test,w)
    accs.append(acc)
    times.append(timeit.default_timer() - start_time)
    weights.append(w)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.0032790716895529284
w_n - w_(n-1): 0.03629289947973378
w_n - w_(n-1): 0.11150377725292984
Time (minutes) elapsed for this cell: 3.494750034916797


In [11]:
pd.DataFrame({"C":Cs,"Accuracy":accs,"Time":times})

Unnamed: 0,C,Accuracy,Time
0,1,0.5,69.781867
1,30,0.70908,139.660258
2,100,0.70792,209.684883


In [12]:
maximum = accs.index(max(accs))
vals = X_test.values.dot(weights[maximum])
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9795,4568
Predicted Negative,2705,7932


# Train and Test (Feature Set #1)

In [7]:
df_train = pd.read_csv("train1.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test1.csv").drop("Unnamed: 0",axis=1)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

In [8]:
X_train_sd = (X_train - X_train.mean())/(X_train.std())
X_test_sd = (X_test - X_train.mean())/(X_train.std())

X_train_sd["ones"] = [1]*len(y_train)
X_test_sd["ones"] = [1]*len(y_test)

In [15]:
# C = 25
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  25,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.0011563902399523143
Accuracy: 0.72972
Time (minutes) elapsed for this cell: 0.5540998834330821


In [17]:
vals = X_test_sd.values.dot(weight_sd1)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,8865,3122
Predicted Negative,3635,9378


In [18]:
# C = 150
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  150,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.008085770550217664
Accuracy: 0.73064
Time (minutes) elapsed for this cell: 0.5540432112500032


In [20]:
vals = X_test_sd.values.dot(weight_sd1)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,8982,3216
Predicted Negative,3518,9284


In [21]:
# C = 1
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  1,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 7.051699602911189e-05
Accuracy: 0.7274
Time (minutes) elapsed for this cell: 0.5563155916834754


In [22]:
vals = X_test_sd.values.dot(weight_sd1)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,8730,3045
Predicted Negative,3770,9455


In [23]:
# C = 1000
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  1000,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.047609054645475345
Accuracy: 0.73264
Time (minutes) elapsed for this cell: 0.5543792904834846


In [24]:
vals = X_test_sd.values.dot(weight_sd1)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9086,3270
Predicted Negative,3414,9230


In [25]:
# C = 10000
start_time = timeit.default_timer()

weight_sd1 = soft_SVM_training(X_train_sd,
                  y_train,
                  10000,
                  np.array(X_train_sd.mean()),
                  10**-6,
                  0.0001,
                  20000,
                  1000)
acc = Testing_soft_SVM(X_test_sd,y_test,weight_sd1)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.012709766177085106
Accuracy: 0.7298
Time (minutes) elapsed for this cell: 1.108603431083126


In [26]:
vals = X_test_sd.values.dot(weight_sd1)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,8844,3099
Predicted Negative,3656,9401


## Adding Interactions

In [9]:
X_train2 = X_train.copy()
X_test2 = X_test.copy()
y_train2 = y_train.copy()
y_test2 = y_test.copy()

In [10]:
X_train2["Interaction_posc_negc"] = X_train2.Positive_counts*X_train2.Negative_counts
X_test2["Interaction_posc_negc"] = X_test2.Positive_counts*X_test2.Negative_counts
X_train_sd2 = (X_train2 - X_train2.mean())/(X_train2.std())
X_test_sd2 = (X_test2 - X_train2.mean())/(X_train2.std())

X_train_sd2["ones"] = [1]*len(y_train)
X_test_sd2["ones"] = [1]*len(y_test)

In [29]:

start_time = timeit.default_timer()

weight_sd2 = soft_SVM_training(X_train_sd2,
                  y_train,
                  150,
                  np.array(X_train_sd2.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd2,y_test,weight_sd2)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.0006156863923824839
Accuracy: 0.73452
Time (minutes) elapsed for this cell: 0.5665323058667127


In [31]:
vals = X_test_sd2.values.dot(weight_sd2)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9171,3308
Predicted Negative,3329,9192


# Squaring Terms

In [11]:
X_train3 = X_train2.copy()
X_test3 = X_test2.copy()
y_train3 = y_train2.copy()
y_test3 = y_test2.copy()

In [12]:
X_train3["Positive_counts2"] = X_train3.Positive_counts**2
X_train3["Negative_counts2"] = X_train3.Negative_counts**2
X_test3["Positive_counts2"] = X_test3.Positive_counts**2
X_test3["Negative_counts2"] = X_test3.Negative_counts**2
X_train_sd3 = (X_train3 - X_train3.mean())/(X_train3.std())
X_test_sd3 = (X_test3 - X_train3.mean())/(X_train3.std())

X_train_sd3["ones"] = [1]*len(y_train)
X_test_sd3["ones"] = [1]*len(y_test)

In [34]:
start_time = timeit.default_timer()

weight_sd3 = soft_SVM_training(X_train_sd3,
                  y_train,
                  150,
                  np.array(X_train_sd3.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.00043466258031655023
Accuracy: 0.7352
Time (minutes) elapsed for this cell: 0.5987629729667484


In [35]:
vals = X_test_sd3.values.dot(weight_sd3)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9114,3234
Predicted Negative,3386,9266


# Cubing Terms

In [36]:
X_train4 = X_train3.copy()
X_test4 = X_test3.copy()
y_train4 = y_train3.copy()
y_test4 = y_test3.copy()

In [37]:
X_train4["Positive_counts3"] = X_train4.Positive_counts**3
X_train4["Negative_counts3"] = X_train4.Negative_counts**3
X_test4["Positive_counts3"] = X_test4.Positive_counts**3
X_test4["Negative_counts3"] = X_test4.Negative_counts**3
X_train_sd4 = (X_train4 - X_train4.mean())/(X_train4.std())
X_test_sd4 = (X_test4 - X_train4.mean())/(X_train4.std())

X_train_sd4["ones"] = [1]*len(y_train)
X_test_sd4["ones"] = [1]*len(y_test)

In [38]:
start_time = timeit.default_timer()

weight_sd4 = soft_SVM_training(X_train_sd4,
                  y_train,
                  150,
                  np.array(X_train_sd4.mean()),
                  10**-6,
                  0.0001,
                  10000,
                  1000)
acc = Testing_soft_SVM(X_test_sd4,y_test,weight_sd4)
print("Accuracy:",acc)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.0010921768503837303
Accuracy: 0.73484
Time (minutes) elapsed for this cell: 0.6281452981163359


In [40]:
vals = X_test_sd4.values.dot(weight_sd4)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9110,3239
Predicted Negative,3390,9261


**Hardly any improvement adding polynomial terms**

### Trying different values of C (using quadratic, because it has the best accuracy)

In [49]:
start_time = timeit.default_timer()
Cs = [1,5,10,25,50,150,350,500,1000,1500,1750]
accs = []
times = []
weights = []
for c in Cs:
    weight_sd3 = soft_SVM_training(X_train_sd3,
                      y_train,
                      c,
                      np.array(X_train_sd3.mean()),
                      10**-6,
                      0.0001,
                      20000,
                      1000)
    acc = Testing_soft_SVM(X_test_sd3,y_test,weight_sd3)
    accs.append(acc)
    times.append(timeit.default_timer() - start_time)
    weights.append(weight_sd3)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 1.6725249419809327e-05
w_n - w_(n-1): 1.3351537340869115e-05
w_n - w_(n-1): 6.216604409153545e-05
w_n - w_(n-1): 6.682480799257601e-05
w_n - w_(n-1): 0.00021149042920293366
w_n - w_(n-1): 0.0003785497423617887
w_n - w_(n-1): 0.001958289691359827
w_n - w_(n-1): 0.002825558128312623
w_n - w_(n-1): 0.006589458081403636
w_n - w_(n-1): 0.015574972627845803
w_n - w_(n-1): 0.0027753611496224434
Time (minutes) elapsed for this cell: 13.035402092133397


In [50]:
pd.DataFrame({"C":Cs,"Accuracy":accs,"Time (minutes)":times})

Unnamed: 0,C,Accuracy,Time (minutes)
0,1,0.6672,72.127495
1,5,0.72984,144.054233
2,10,0.72976,215.80079
3,25,0.7306,287.084302
4,50,0.7328,357.944615
5,150,0.7352,428.55633
6,350,0.73468,499.126529
7,500,0.73472,569.774117
8,1000,0.7344,640.396868
9,1500,0.73476,711.363786


In [51]:
maximum = accs.index(max(accs))
vals = X_test_sd3.values.dot(weights[maximum])
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,9108,3228
Predicted Negative,3392,9272


Had the same accuracy even when the difference in error between two sequential iterations was more than 100

# Training and Test (Feature Test 2)

In [7]:
start_time = timeit.default_timer()

df_train = pd.read_csv("train2.csv").drop("Unnamed: 0",axis=1)
df_test = pd.read_csv("test2.csv").drop("Unnamed: 0",axis=1)

elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

Time (minutes) elapsed for this cell: 0.4243882813830472


In [8]:
# remove columns that are too sparse, allows LDA to perform matrix algebra
cols_to_drop = []
df = df_train.drop("Label", axis=1)
for col in df:
    num_vals = len(df) - df[col].value_counts()[0]
    if num_vals < 3:
        cols_to_drop.append(col)
len(cols_to_drop)

1039

In [9]:
df_train.drop(cols_to_drop, axis=1, inplace=True)
df_test.drop(cols_to_drop, axis=1, inplace=True)

X_train = df_train.drop("Label", axis=1)
y_train = df_train["Label"]
X_test = df_test.drop("Label", axis=1)
y_test = df_test["Label"]

In [10]:
X_train["ones"] = [1]*len(y_train)
X_test["ones"] = [1]*len(y_test)

### SVM Classifier results

In [16]:
start_time = timeit.default_timer()

w = soft_SVM_training(X_train,
                      y_train,
                      25,
                      np.array(X_train.mean()),
                      10**-6,
                      0.001,
                      5000,
                      500)
acc = Testing_soft_SVM(X_test,y_test,w)
elapsed = timeit.default_timer() - start_time
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.0057922527807800495
Time (minutes) elapsed for this cell: 3.630121353699845


In [18]:
print("accuracy:", acc)

accuracy: 0.82748


In [17]:
vals = X_test.values.dot(w)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,10781,2594
Predicted Negative,1719,9906


In [19]:
start_time = timeit.default_timer()

w = soft_SVM_training(X_train,
                      y_train,
                      150,
                      np.array(X_train.mean()),
                      10**-6,
                      0.001,
                      5000,
                      500)
acc = Testing_soft_SVM(X_test,y_test,w)
elapsed = timeit.default_timer() - start_time
print("accuracy:", acc)
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.031873037480976765
accuracy: 0.85988
Time (minutes) elapsed for this cell: 3.5533360773998237


In [20]:
vals = X_test.values.dot(w)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,11059,2062
Predicted Negative,1441,10438


In [21]:
start_time = timeit.default_timer()

w = soft_SVM_training(X_train,
                      y_train,
                      500,
                      np.array(X_train.mean()),
                      10**-6,
                      0.001,
                      5000,
                      500)
acc = Testing_soft_SVM(X_test,y_test,w)
elapsed = timeit.default_timer() - start_time
print("accuracy:", acc)
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.08510358954530906
accuracy: 0.87396
Time (minutes) elapsed for this cell: 3.5169638187498395


In [22]:
vals = X_test.values.dot(w)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,11058,1709
Predicted Negative,1442,10791


In [27]:
start_time = timeit.default_timer()

w = soft_SVM_training(X_train,
                      y_train,
                      1500,
                      np.array(X_train.mean()),
                      10**-6,
                      0.0001,
                      10000,
                      500)
acc = Testing_soft_SVM(X_test,y_test,w)
elapsed = timeit.default_timer() - start_time
print("accuracy:", acc)
print("Time (minutes) elapsed for this cell:", elapsed/60)

w_n - w_(n-1): 0.022950703553421728
accuracy: 0.88276
Time (minutes) elapsed for this cell: 6.974603124033698


In [28]:
vals = X_test.values.dot(w)
predictions = pd.Series((vals/abs(vals)).astype(int))
confusion_matrix = produce_confusion_matrix(predictions, y_test)
confusion_matrix

Unnamed: 0,Actually Positive,Actually Negative
Prediced Positive,11136,1567
Predicted Negative,1364,10933
