In [23]:
import numpy as np
import pickle
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from scipy.stats import uniform
from scipy.stats import norm
 
from sklearn.model_selection import RandomizedSearchCV
from sklearn import metrics

In [24]:
# Load the pickled feature file; element 0 is used as the feature set and
# element 1 as the labels by the cells below.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('MD_freqFeats.tup', 'rb') as f:
    data = pickle.load(f)
x_all, y_all = data[0], data[1]

In [14]:
# Randomized hyperparameter search for an RBF-kernel SVM, scored by F1.

# Designate distributions to sample hyperparameters from.
# Seed NumPy's global generator so the candidate draws below are repeatable.
np.random.seed(123)
g_range = np.random.uniform(0.0, 0.3, 5).astype(float)
C_range = np.random.normal(1, 0.1, 5).astype(float)

# Check that gamma>0 and C>0: replace any non-positive draw (including an
# exact zero) with a small positive value. The uniform draw is taken from
# [0.0, 0.3), so gamma needs the same guard as C.
g_range[g_range <= 0] = 0.0001
C_range[C_range <= 0] = 0.0001

hyperparameters = {'gamma': list(g_range),
                   'C': list(C_range)}

# Run randomized search: 10 sampled (gamma, C) candidates, 3-fold CV each.
randomCV = RandomizedSearchCV(
    SVC(kernel='rbf'),
    param_distributions=hyperparameters,
    n_iter=10,
    cv=3,
    scoring='f1',
    verbose=True,
)
randomCV.fit(x_all, y_all)

# Identify optimal hyperparameter values
best_gamma  = randomCV.best_params_['gamma']
best_C      = randomCV.best_params_['C']

print("The best performing gamma value is: {:5.2f}".format(best_gamma))
print("The best performing C value is: {:5.2f}".format(best_C))

Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:  5.2min finished


The best performing gamma value is:  0.09
The best performing C value is:  0.99


In [15]:
# Show the score that the randomized search recorded for its best candidate.
best_search_score = randomCV.best_score_
print(best_search_score)

0.4247093023255814


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

In [26]:
# Sweep the regularisation strength lam for logistic regression; the model is
# built with C = 1/lam and scored with 5-fold cross-validated F1.
# NOTE(review): the recorded output shows the same score for every lam --
# worth confirming the classifier is not predicting a single class.
lams = [1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 15]
acc = []  # mean F1 per lam (holds F1 values, despite the name)
for lam in lams:
    classifier = LogisticRegression(solver='lbfgs', C=1 / lam)
    fold_scores = cross_val_score(classifier, x_all, y_all, scoring='f1', cv=5)
    acc.append(np.mean(fold_scores))
    print(lam)  # progress marker: this lam is done
print(acc)

1e-06
1e-05
0.0001
0.001
0.01
0.1
1
5
10
15
[0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814, 0.7782463928967814]


In [27]:
from sklearn.model_selection import train_test_split
# Logistic regression with lam = 0.01 (C is the inverse of lam).
classifier = LogisticRegression(C = 1/0.01, solver = 'lbfgs')
# Hold out 30% of the samples for evaluation. random_state is pinned so the
# split -- and the metrics computed from it -- are the same on every run,
# regardless of the order in which the cells were executed.
x_train, x_test, y_train, y_test = train_test_split(
    x_all, y_all, test_size=0.3, random_state=123
)

In [28]:
# Fit on the training split, then predict labels for the held-out split.
# fit() returns the estimator itself, so the two calls can be chained.
y_pred = classifier.fit(x_train, y_train).predict(x_test)

In [30]:
from sklearn.metrics import f1_score, precision_score, recall_score
# Score the held-out predictions against the true test labels.
precision = precision_score(y_true=y_test, y_pred=y_pred)
recall = recall_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred)
report = "precision = {}, recall = {}, f1 = {}".format(precision, recall, f1)
print(report)

precision = 0.6402616279069767, recall = 1.0, f1 = 0.7806823216659281


[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
