In [1]:
import os
import numpy as np
import pandas as pd
from utils import *
from models import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.preprocessing import StandardScaler

# global variables
DATA_FOLDER = "machine-learning-with-kernel-methods-2021"
TRAIN_FILE = "Xtr{}.csv"
LABEL_FILE = "Ytr{}.csv"
TEST_FILE = "Xte{}.csv"
N = 3  # number of datasets

HIST_FILE = "results-history.txt"

baselines = {
    "ridge": kernelRidge,
    "svm": kernelSVM,
    "logistic": KernelLogistic,
}

In [39]:
def run(args):

    feature_type = args['features'].lower()
    fmaker = feature_extractor(
        feature_type=feature_type,
        k=args['k'],
        m=args['m'],
        lmbda=args['lmbda'],
        order_of_fourier_kmers=args['order_of_fourier_kmers'],
        nb_of_fourier_coeffs=args['order_of_fourier_kmers'],
    )

    kernel_type = args['kernel'].lower() #if feature_type == "bow" else "linear"
    
    clf = Baseline(
        baseline_type=args['baseline'].lower(), 
        kernel_type=kernel_type, 
        d=args['d'], 
        sigma=args['sigma'], 
        c=args['c'])
    
    index = []
    pred = []
    accs = []
    pred_val = []
    true_val = []
    for i in range(N):
        xtrain, ytrain, xte, ids = fmaker(i)

        scaler = StandardScaler()
        scaler.fit(xtrain)
        xtrain = scaler.transform(xtrain)
        xte = scaler.transform(xte)
        xtr, xval, ytr, yval = train_test_split(
            xtrain, ytrain, test_size=0.3, random_state=42
        )

        _ = clf.fit(xtr, ytr)
        predval = clf.predict(xtr, xval)
        print(confusion_matrix(yval, predval))
        
        acc = accuracy_score(yval, predval)
        print("accuracy is " + str(acc))
        ytest = clf.predict(xtr, xte)

        index.extend(ids)
        pred.extend(ytest)
        accs.append(acc)
        pred_val.append(predval)
        true_val.extend(yval)
        
    name = args['features'].lower() + "_k_" + str(args['k']) + clf.type
    save_file(index, pred, name=name)
    save_results(name, accs, sum(accs)/N)
    
    return pred, pred_val, true_val, index

In [17]:
params = dict({
    ("features", "spectrum"),
    ("baseline", "svm"),
    ("kernel", "gaussian"),
    ("k" , 6),
    ("m", 1),
    ("sigma", 0.1),
    ("lmbda", 2),
    ("d", 2),
    ("nb_of_fourier_coeffs",5),
    ("order_of_fourier_kmers",1),
    ("c", 0.1),
}
)

pred, pred_val, index = run(params)

     pcost       dcost       gap    pres   dres
 0: -4.1840e+02 -3.2477e+02  5e+03  2e+01  3e-16
 1: -1.3979e+02 -2.5402e+02  2e+02  3e-01  5e-16
 2: -1.2640e+02 -1.3753e+02  1e+01  3e-15  3e-16
 3: -1.2734e+02 -1.2760e+02  3e-01  1e-14  8e-17
 4: -1.2739e+02 -1.2740e+02  3e-03  9e-15  2e-16
 5: -1.2739e+02 -1.2739e+02  3e-05  2e-15  6e-17
Optimal solution found.
[[307   0]
 [293   0]]
accuracy is 0.5116666666666667
     pcost       dcost       gap    pres   dres
 0: -4.1888e+02 -3.2413e+02  5e+03  2e+01  8e-17
 1: -1.3959e+02 -2.5336e+02  2e+02  3e-01  2e-16
 2: -1.2797e+02 -1.3744e+02  9e+00  2e-15  5e-16
 3: -1.2935e+02 -1.2953e+02  2e-01  2e-14  1e-16
 4: -1.2939e+02 -1.2939e+02  2e-03  9e-15  2e-16
 5: -1.2939e+02 -1.2939e+02  2e-05  2e-15  4e-16
Optimal solution found.
[[279   0]
 [321   0]]
accuracy is 0.465
     pcost       dcost       gap    pres   dres
 0: -4.1788e+02 -3.2369e+02  5e+03  2e+01  2e-16
 1: -1.4256e+02 -2.5316e+02  2e+02  4e-01  2e-16
 2: -1.2614e+02 -1.4414e+02

In [18]:
params = dict({
    ("features", "spectrum"),
    ("baseline", "svm"),
    ("kernel", "linear"),
    ("k" , 5),
    ("m", 1),
    ("sigma", 0.01),
    ("lmbda", 2),
    ("d", 2),
    ("nb_of_fourier_coeffs",5),
    ("order_of_fourier_kmers",1),
    ("c", 0.01),
}
)

#spectrum_k_5_svm_c_0.01_linear 
pred, pred_val, index = run(params)

     pcost       dcost       gap    pres   dres
 0: -1.7585e+02 -3.7003e+01  9e+03  9e+01  3e-13
 1: -2.4151e+01 -3.0048e+01  6e+02  6e+00  3e-13
 2: -6.0799e+00 -2.6103e+01  7e+01  5e-01  4e-14
 3: -3.9772e+00 -1.7979e+01  2e+01  1e-01  1e-14
 4: -3.9434e+00 -6.2851e+00  2e+00  4e-03  6e-15
 5: -4.3437e+00 -5.1385e+00  8e-01  1e-03  6e-15
 6: -4.4954e+00 -4.7268e+00  2e-01  3e-04  6e-15
 7: -4.5430e+00 -4.5959e+00  5e-02  4e-05  6e-15
 8: -4.5544e+00 -4.5659e+00  1e-02  8e-06  7e-15
 9: -4.5572e+00 -4.5586e+00  1e-03  8e-07  7e-15
10: -4.5576e+00 -4.5576e+00  6e-05  3e-08  7e-15
11: -4.5576e+00 -4.5576e+00  2e-06  8e-10  8e-15
Optimal solution found.
[[191 116]
 [114 179]]
accuracy is 0.6166666666666667
     pcost       dcost       gap    pres   dres
 0: -1.7982e+02 -3.5600e+01  9e+03  1e+02  3e-13
 1: -2.0640e+01 -2.9957e+01  5e+02  5e+00  3e-13
 2: -5.7149e+00 -2.5627e+01  6e+01  5e-01  3e-14
 3: -3.6874e+00 -1.6688e+01  2e+01  9e-02  9e-15
 4: -3.6175e+00 -5.9332e+00  3e+00  7e-03 

In [19]:
params['k'] = 4

#spectrum_k_4_svm_c_0.01_linear 
pred1, pred_val1, index = run(params)

     pcost       dcost       gap    pres   dres
 0: -5.1919e+02 -3.6909e+01  1e+04  1e+02  3e-13
 1: -2.2126e+01 -3.6014e+01  3e+02  3e+00  3e-13
 2: -8.7470e+00 -3.0875e+01  4e+01  2e-01  3e-14
 3: -7.8954e+00 -1.3442e+01  6e+00  1e-16  5e-15
 4: -8.8783e+00 -1.0512e+01  2e+00  2e-16  4e-15
 5: -9.2431e+00 -9.8034e+00  6e-01  7e-17  5e-15
 6: -9.3599e+00 -9.5898e+00  2e-01  7e-17  4e-15
 7: -9.4184e+00 -9.4893e+00  7e-02  1e-16  5e-15
 8: -9.4380e+00 -9.4591e+00  2e-02  2e-16  4e-15
 9: -9.4448e+00 -9.4486e+00  4e-03  9e-17  5e-15
10: -9.4462e+00 -9.4465e+00  3e-04  1e-16  5e-15
11: -9.4464e+00 -9.4464e+00  1e-05  8e-17  5e-15
12: -9.4464e+00 -9.4464e+00  4e-07  1e-16  5e-15
Optimal solution found.
[[191 116]
 [108 185]]
accuracy is 0.6266666666666667
     pcost       dcost       gap    pres   dres
 0: -4.9919e+02 -3.5871e+01  1e+04  1e+02  3e-13
 1: -2.5493e+01 -3.4914e+01  4e+02  4e+00  3e-13
 2: -8.8542e+00 -3.0634e+01  5e+01  3e-01  2e-14
 3: -7.5408e+00 -1.5067e+01  8e+00  1e-03 

In [20]:
params['k'] = 4
params['features'] = "mismatch"

#spectrum_k_4_svm_c_0.01_linear 
pred2, pred_val2, index = run(params)

     pcost       dcost       gap    pres   dres
 0: -5.2579e+02 -3.7611e+01  1e+04  1e+02  9e-13
 1: -2.1703e+01 -3.6451e+01  3e+02  3e+00  8e-13
 2: -9.0274e+00 -3.1103e+01  4e+01  2e-01  6e-14
 3: -8.8713e+00 -1.2913e+01  4e+00  7e-03  1e-14
 4: -9.9305e+00 -1.0882e+01  1e+00  1e-03  1e-14
 5: -1.0149e+01 -1.0615e+01  5e-01  3e-04  1e-14
 6: -1.0298e+01 -1.0398e+01  1e-01  5e-05  1e-14
 7: -1.0330e+01 -1.0354e+01  2e-02  9e-06  2e-14
 8: -1.0339e+01 -1.0343e+01  4e-03  1e-06  2e-14
 9: -1.0340e+01 -1.0341e+01  3e-04  8e-08  2e-14
10: -1.0341e+01 -1.0341e+01  8e-06  2e-09  2e-14
Optimal solution found.
[[190 117]
 [104 189]]
accuracy is 0.6316666666666667
     pcost       dcost       gap    pres   dres
 0: -5.0738e+02 -3.6389e+01  1e+04  1e+02  8e-13
 1: -2.2645e+01 -3.4928e+01  3e+02  3e+00  8e-13
 2: -1.0199e+01 -3.0582e+01  6e+01  4e-01  1e-13
 3: -8.3167e+00 -1.9801e+01  1e+01  4e-02  2e-14
 4: -9.2488e+00 -1.1407e+01  2e+00  7e-03  1e-14
 5: -9.8958e+00 -1.0508e+01  7e-01  2e-03 

In [41]:
params['k'] = 3
params['features'] = "spectrum"

#spectrum_k_4_svm_c_0.01_linear 
pred3, pred_val3, true_val, index = run(params)

     pcost       dcost       gap    pres   dres
 0: -6.4666e+02 -3.3164e+01  9e+03  1e+02  2e-13
 1: -2.2439e+01 -3.2870e+01  2e+02  2e+00  2e-13
 2: -1.0195e+01 -2.7647e+01  2e+01  8e-02  9e-15
 3: -1.0723e+01 -1.3512e+01  3e+00  9e-03  3e-15
 4: -1.1531e+01 -1.2351e+01  9e-01  2e-03  2e-15
 5: -1.1774e+01 -1.2066e+01  3e-01  8e-04  3e-15
 6: -1.1864e+01 -1.1963e+01  1e-01  2e-04  3e-15
 7: -1.1889e+01 -1.1935e+01  5e-02  7e-05  3e-15
 8: -1.1903e+01 -1.1918e+01  1e-02  2e-16  3e-15
 9: -1.1909e+01 -1.1912e+01  3e-03  8e-17  3e-15
10: -1.1910e+01 -1.1910e+01  5e-04  8e-17  3e-15
11: -1.1910e+01 -1.1910e+01  1e-05  2e-16  3e-15
12: -1.1910e+01 -1.1910e+01  9e-07  2e-16  3e-15
Optimal solution found.
[[189 118]
 [118 175]]
accuracy is 0.6066666666666667
     pcost       dcost       gap    pres   dres
 0: -6.3352e+02 -3.4502e+01  1e+04  1e+02  2e-13
 1: -2.1282e+01 -3.4227e+01  2e+02  2e+00  2e-13
 2: -1.0398e+01 -2.9057e+01  3e+01  2e-01  2e-14
 3: -1.0090e+01 -1.3858e+01  4e+00  1e-16 

In [35]:
def vote(u, k=3):
    a = sum(u)
    a = (a> (k/2))*1
    return a

In [43]:
concat = np.concatenate((pred_val, pred_val1, pred_val2), axis=1)
concat.shape

v = vote(concat)
print(confusion_matrix(true_val, v))
acc = accuracy_score(true_val, v)
print(acc)

[[500 404]
 [386 510]]
0.5611111111111111


In [None]:
concat2 = np.concatenate((pred, pred1, pred2), axis=1)
concat2.shape

In [55]:
import pandas as pd
df = pd.read_csv(r'results\submission_spectrum_k_5_svm_c_0.0001_linear.csv')

In [56]:
df

Unnamed: 0.1,Unnamed: 0,Id,Bound
0,0,0,1
1,1,1,1
2,2,2,1
3,3,3,1
4,4,4,0
...,...,...,...
2995,2995,2995,1
2996,2996,2996,0
2997,2997,2997,0
2998,2998,2998,0


In [57]:
df = df[['Id', 'Bound']].copy()

In [59]:
df = df.set_index('Id')
df

Unnamed: 0_level_0,Bound
Id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,0
...,...
2995,1
2996,0
2997,0
2998,0


In [60]:
df.to_csv('submission2.csv')

In [61]:
df2 = pd.read_csv('submission2.csv',index_col=0)
df2

Unnamed: 0_level_0,Bound
Id,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,0
...,...
2995,1
2996,0
2997,0
2998,0
