In [1]:
import random
import math
import csv
import time
import os
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score,top_k_accuracy_score
from sklearn.calibration import CalibratedClassifierCV

In [2]:
numexpr=10000
noise=0.001
csv_path=f"./data/train_data{numexpr}_noise{noise}.csv"
df = pd.read_csv(csv_path,index_col=0)

In [3]:
#Concatenate mean diameter and s column into a single label column
df['distribution'] = 'mean'+df['mean'].map(str) +'sd'+ df['sd'].map(str) 
df=df.drop(['mean', 'sd'], axis=1)

In [4]:
df.tail()

Unnamed: 0,kpoint-127.0,kpoint-126.0,kpoint-125.0,kpoint-124.0,kpoint-123.0,kpoint-122.0,kpoint-121.0,kpoint-120.0,kpoint-119.0,kpoint-118.0,...,kpoint-8.0,kpoint-7.0,kpoint-6.0,kpoint-5.0,kpoint-4.0,kpoint-3.0,kpoint-2.0,kpoint-1.0,kpoint0.0,distribution
17994,0.951431,1.486857,2.093037,0.286452,3.15233,1.020971,2.434849,3.225991,0.855807,2.042384,...,22.560598,19.833429,14.886528,25.802806,39.572794,26.891573,17.691009,68.548991,38.274064,mean10.0sd2.0
17995,1.507925,1.795566,2.544967,0.650658,2.363569,1.089463,1.978059,2.409336,1.968731,3.118284,...,34.868134,47.609525,15.939395,8.790397,11.026781,23.102719,15.421562,26.304429,32.457574,mean10.0sd2.0
17996,3.876576,1.167043,0.809928,1.363216,3.255415,2.998341,1.35259,2.126336,1.640195,2.590011,...,37.797992,42.910913,29.228842,35.875462,11.859695,54.81282,14.787574,43.03028,44.739941,mean10.0sd2.0
17997,1.319944,0.459081,1.416416,1.41484,3.819048,1.565074,1.113632,1.261733,1.791418,4.252774,...,32.460489,30.481559,8.638896,33.955203,20.359816,24.507833,35.966531,29.612428,25.880349,mean10.0sd2.0
17998,2.161387,0.449817,1.25715,3.934188,2.8234,2.861539,2.461454,0.600201,3.103224,3.805818,...,12.973458,31.611631,43.006722,25.355662,82.323493,62.033148,68.416039,46.474658,16.018643,mean10.0sd2.0


In [5]:
df['distribution'].nunique()

180

In [6]:
df['distribution'].unique()

array(['mean0.5sd0.0', 'mean1.0sd0.0', 'mean1.5sd0.0', 'mean2.0sd0.0',
       'mean2.5sd0.0', 'mean3.0sd0.0', 'mean3.5sd0.0', 'mean4.0sd0.0',
       'mean4.5sd0.0', 'mean5.0sd0.0', 'mean5.5sd0.0', 'mean6.0sd0.0',
       'mean6.5sd0.0', 'mean7.0sd0.0', 'mean7.5sd0.0', 'mean8.0sd0.0',
       'mean8.5sd0.0', 'mean9.0sd0.0', 'mean9.5sd0.0', 'mean10.0sd0.0',
       'mean0.5sd0.013', 'mean1.0sd0.025', 'mean1.5sd0.038',
       'mean2.0sd0.05', 'mean2.5sd0.062', 'mean3.0sd0.075',
       'mean3.5sd0.088', 'mean4.0sd0.1', 'mean4.5sd0.113',
       'mean5.0sd0.125', 'mean5.5sd0.138', 'mean6.0sd0.15',
       'mean6.5sd0.163', 'mean7.0sd0.175', 'mean7.5sd0.188',
       'mean8.0sd0.2', 'mean8.5sd0.213', 'mean9.0sd0.225',
       'mean9.5sd0.238', 'mean10.0sd0.25', 'mean0.5sd0.025',
       'mean1.0sd0.05', 'mean1.5sd0.075', 'mean2.0sd0.1',
       'mean2.5sd0.125', 'mean3.0sd0.15', 'mean3.5sd0.175',
       'mean4.0sd0.2', 'mean4.5sd0.225', 'mean5.0sd0.25',
       'mean5.5sd0.275', 'mean6.0sd0.3', 'mean6

In [7]:
#Split label from features
X = df.drop('distribution',axis=1)
y = df['distribution']

In [8]:
#Split train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=101)

In [9]:
#Scale data
scaler = MinMaxScaler()
scaled_X_train = scaler.fit_transform(X_train)
scaled_X_test = scaler.transform(X_test)

In [None]:
#Define svc
svc = SVC()

In [None]:
#Define grid search parameters
param_grid = {'kernel':['linear', 'poly', 'rbf'],'C':[0.01,0.05,0.1,0.25,0.5,0.75,1,2.5,5],'gamma':['scale','auto']}
grid = GridSearchCV(svc,param_grid,n_jobs=6,cv=5)

In [None]:
#Run Grid search
grid.fit(scaled_X_train,y_train)

In [None]:
#Display best parameter combination
grid.best_params_

In [None]:
#Calcualte predictions based on test data
grid_pred = grid.predict(scaled_X_test)

In [None]:
#Display accuracy
accuracy_score(y_test, grid_pred)

In [None]:
scaled_X_test[30]

In [22]:
y_test.iloc[30]


'mean6.5sd0.975'

In [None]:
y_pred=grid.predict(scaled_X_test[30].reshape(1, -1))
y_pred

In [10]:
#Define final SVC
svc_fin=SVC(C= 5, gamma= 'scale', kernel= 'rbf',probability=True)

In [11]:
#Fit final svc
svc_fin.fit(scaled_X_train,y_train)

SVC(C=5, probability=True)

In [12]:
prob_predict=svc_fin.predict_proba(scaled_X_test[30].reshape(1, -1))
prob_predict

array([[0.00026111, 0.0002625 , 0.00026367, 0.00026798, 0.00026945,
        0.00027514, 0.00027641, 0.00028213, 0.00028862, 0.00031029,
        0.00030847, 0.00031126, 0.0002922 , 0.00031356, 0.00031339,
        0.00031   , 0.00031282, 0.00030761, 0.00029833, 0.00029668,
        0.00030148, 0.00030348, 0.00029457, 0.00029141, 0.00030161,
        0.00029699, 0.00032608, 0.000399  , 0.00049026, 0.00084275,
        0.00103364, 0.0008093 , 0.00071603, 0.00126826, 0.00070912,
        0.00056621, 0.00026443, 0.00026399, 0.00027676, 0.00030387,
        0.00033551, 0.00033302, 0.00036132, 0.00035472, 0.00044631,
        0.00040344, 0.00034892, 0.00040431, 0.00039782, 0.00039891,
        0.0004375 , 0.0003767 , 0.00044044, 0.00047902, 0.00036084,
        0.00034402, 0.0003342 , 0.00037163, 0.00039547, 0.00044264,
        0.00047071, 0.00046227, 0.00058362, 0.00036218, 0.00034972,
        0.00034735, 0.00033813, 0.00040276, 0.00047885, 0.0006583 ,
        0.00083922, 0.00106001, 0.00029414, 0.00

In [13]:
svc_fin.predict(scaled_X_test[30].reshape(1, -1))

array(['mean6.5sd0.975'], dtype=object)

In [23]:
classes = svc_fin.classes_
classes

array(['mean0.5sd0.0', 'mean0.5sd0.013', 'mean0.5sd0.025',
       'mean0.5sd0.037', 'mean0.5sd0.05', 'mean0.5sd0.062',
       'mean0.5sd0.075', 'mean0.5sd0.087', 'mean0.5sd0.1', 'mean1.0sd0.0',
       'mean1.0sd0.025', 'mean1.0sd0.05', 'mean1.0sd0.075',
       'mean1.0sd0.1', 'mean1.0sd0.125', 'mean1.0sd0.15',
       'mean1.0sd0.175', 'mean1.0sd0.2', 'mean1.5sd0.0', 'mean1.5sd0.038',
       'mean1.5sd0.075', 'mean1.5sd0.112', 'mean1.5sd0.15',
       'mean1.5sd0.188', 'mean1.5sd0.225', 'mean1.5sd0.262',
       'mean1.5sd0.3', 'mean10.0sd0.0', 'mean10.0sd0.25', 'mean10.0sd0.5',
       'mean10.0sd0.75', 'mean10.0sd1.0', 'mean10.0sd1.25',
       'mean10.0sd1.5', 'mean10.0sd1.75', 'mean10.0sd2.0', 'mean2.0sd0.0',
       'mean2.0sd0.05', 'mean2.0sd0.1', 'mean2.0sd0.15', 'mean2.0sd0.2',
       'mean2.0sd0.25', 'mean2.0sd0.3', 'mean2.0sd0.35', 'mean2.0sd0.4',
       'mean2.5sd0.0', 'mean2.5sd0.062', 'mean2.5sd0.125',
       'mean2.5sd0.188', 'mean2.5sd0.25', 'mean2.5sd0.312',
       'mean2.5sd

In [14]:
pred = svc_fin.predict_proba(scaled_X_test[30].reshape(1, -1))
label = np.argmax(pred, axis=1)[0]
classes = svc_fin.classes_

print(classes[label])

mean7.5sd0.562


In [15]:
np.argmax(pred, axis=1)[0]

138

In [16]:
svc_fin.score(scaled_X_test, y_test)

0.16111111111111112

In [17]:
prob_predict=svc_fin.predict_proba(scaled_X_test)

In [21]:
top_k_accuracy_score(y_test, prob_predict, k=9)

0.74