In [79]:
from csv import DictReader
import numpy as np

from sklearn import svm
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import GridSearchCV

In [80]:
train_csv = 'layer_features/adc/features_train_combined.csv'
test_csv = 'layer_features/adc/features_test_conbined.csv'
n_estimators = 400

def to1hot(zone):
    zones = ['AS', 'PZ', 'SV', 'TZ']
    
    result = [float(zone == e) for e in zones]
    return result
    

train_data = []
train_labels = []
train_proxIds = []

with open(train_csv) as csvfile:
    reader = DictReader(csvfile)
    fields = list(reader.fieldnames)
    fields.remove('proxid')
    fields.remove('clinsig')
    fields.remove('Age') # because convert to float.
    fields.remove('Zone') # categorical data -> 1-hot
    
    for row in reader:
        train_proxIds.append(row.pop('proxid'))
        train_labels.append(row.pop('clinsig'))
        data_item= []
        for field in fields:
            data_item.append(row[field])
        data_item.append(float(row['Age'][:-1]) / 10)
        data_item.extend(to1hot(row['Zone']))
        train_data.append(data_item)
        

test_data = []
test_proxIds = []

with open(test_csv) as csvfile:
    reader = DictReader(csvfile)
    fields = list(reader.fieldnames)
    fields.remove('proxid')
    fields.remove('Age') # because convert to float.
    fields.remove('Zone') # categorical data -> 1-hot
    
    for row in reader:
        test_proxIds.append(row.pop('proxid'))
        data_item= []
        for field in fields:
            data_item.append(row[field])
        data_item.append(float(row['Age'][:-1]) /10)
        data_item.extend(to1hot(row['Zone']))
        test_data.append(data_item)

In [81]:
C_range = np.logspace(-30, 10, 60)
gamma_range = np.logspace(-30, 3, 60)
param_grid = dict(gamma=gamma_range, C=C_range)
cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=42)
grid = GridSearchCV(svm.SVC(), param_grid=param_grid, cv=cv)
grid.fit(train_data, train_labels)

print("The best parameters are %s with a score of %0.2f"
      % (grid.best_params_, grid.best_score_))

The best parameters are {'gamma': 0.0025514065200312819, 'C': 15.361749466718233} with a score of 0.81


In [82]:
clf = svm.SVC(C=15.361749466718233, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.0025514065200312819, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
clf.fit(train_data, train_labels)

SVC(C=15.361749466718233, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=0.002551406520031282,
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [83]:
test_result = clf.predict_proba(test_data)

In [84]:
for item in test_result:
    print(item[1])

0.0472838316313
0.686111105926
0.473850391351
0.315857051573
0.293350257313
0.193653706784
0.197896988001
0.0829525371187
0.170920314184
0.819208105615
0.31026051972
0.552360280576
0.709872253483
0.474453934374
0.135731461496
0.228956716304
0.106783962354
0.207747272972
0.141715787483
0.373314203772
0.399101333891
0.210459434322
0.515958154716
0.24961359885
0.590902764745
0.0568319291234
0.13998813396
0.0612279022516
0.138228424388
0.805681848187
0.194449293767
0.224453548269
0.115210474502
0.143762588261
0.142230021831
0.20502662687
0.879932584288
0.894386792042
0.494423852399
0.0545571022866
0.093493842585
0.187935124898
0.274152490912
0.409428455482
0.291369589023
0.0994192600996
0.095067065525
0.0576115091341
0.134577370326
0.0381664053319
0.0777738347547
0.184242610114
0.0404269404408
0.32980444966
0.180963211474
0.106486295786
0.21232214184
0.117517348991
0.133149548318
0.143726925251
0.1906223659
0.110399419524
0.0757882834354
0.115428560184
0.245155430087
0.201597345681
0.08289