In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [2]:
df = pd.read_csv('../../Visualization/OASIS/oasis_3.csv')
df = df.dropna(axis=1, how='all') # Drop any empty columns
df = df.dropna(axis=0, how='any') # Drop any rows with empty values 
df = df.rename(columns={'id':'Freesurfer ID', 'dx1':'Diagnosis', 
                        'TOTAL_HIPPOCAMPUS_VOLUME':'TotalHippocampusVol'}) # Rename columns
df = df.drop_duplicates(subset='Subject', keep='first') # Keep only the first visit; this is possible because
                                                        # df is sorted by age
df = df.reset_index(drop=True) # Reset the index
df = df.set_index('Subject')
cols = df.columns.tolist()
cols[2], cols[4] = cols[4], cols[2]
df = df[cols]
df.loc[df['cdr'] < 0.5, 'Diagnosis'] = 'control'
df.loc[~(df['cdr'] < 0.5), 'Diagnosis'] = 'dementia'
df.loc[df['Diagnosis'] == 'control', 'Diagnosis'] = -1
df.loc[df['Diagnosis'] == 'dementia', 'Diagnosis'] = 1
df = df.drop(['MR ID', 'Freesurfer ID', 'M/F', 'cdr'], axis=1) # Drop categorical and redundant columns
df = df.drop(['lhCortexVol', 'rhCortexVol', 'lhCorticalWhiteMatterVol', 'rhCorticalWhiteMatterVol', 'L.SurfArea', 'R.SurfArea'], axis=1) # Test drop to reduce dimensionality

In [3]:
X = df.drop(['Diagnosis'], axis=1)
y = df['Diagnosis']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [4]:
# standard z score scaling
def scale(X):
    u = np.mean(X)
    s = np.std(X)
    X_scaled = (X-u)/s
    return X_scaled

In [5]:
X_train = scale(X_train)
X_test = scale(X_test)

In [6]:
for i, col in enumerate(X_train):
    print(col, i)

Age 0
mmse 1
apoe 2
TotalHippocampusVol 3
IntraCranialVol 4
CortexVol 5
SubCortGrayVol 6
TotalGrayVol 7
SupraTentorialVol 8
CorticalWhiteMatterVol 9


In [7]:
class SVM(object):
    
    def fit(self, X, y):
        #train with data
        self.X = X
        self.y = y
        # { |\w\|:{w,b}}
        opt_dict = {}
                    
        self.max_feature_value = max(X.max(axis=0))         
        self.min_feature_value = min(X.min(axis=0))  
        
        #with smaller steps our margins and db will be more precise
        step_sizes = [self.max_feature_value * 0.1,
                      self.max_feature_value * 0.01,
                      #point of expense
                      self.max_feature_value * 0.001,]
        
        #extremly expensise
        b_range_multiple = 5
        #we dont need to take as small step as w
        b_multiple = 5
        
        latest_optimum = self.max_feature_value*10
        
        """
        objective is to satisfy yi(x.w)+b>=1 for all training dataset such that ||w|| is minimum
        for this we will start with random w, and try to satisfy it with making b bigger and bigger
        """
        #making step smaller and smaller to get precise value
        for step in step_sizes:
            w = np.full(shape=X.shape[1], fill_value=latest_optimum)
            
            #we can do this because convex
            optimized = False
            while not optimized:
                for b in np.arange(-1*self.max_feature_value*b_range_multiple,
                                   self.max_feature_value*b_range_multiple,
                                   step*b_multiple):
                    #w_t = w*transformation
                    found_option = True
                    w_t = w

                    #weakest link in SVM fundamentally
                    #SMO attempts to fix this a bit
                    # ti(xi.w+b) >=1
                    for index, row in X.iterrows():
                        yi = y[index]
                        if not abs(yi*(np.dot(w_t,row)+b))>=1:
                            #print(yi*(np.dot(w_t,row)+b))
                            found_option=False
                    if found_option:
                        """
                        all points in dataset satisfy y(w.x)+b>=1 for this current w_t, b
                        then put w,b in dict with ||w|| as key
                        """
                        opt_dict[np.linalg.norm(w_t)]=[w_t,b]
                        #print([w_t,b])
                
                #after w[0] or w[1]<0 then values of w starts repeating itself because of transformation
                #Think about it, it is easy
                #print(w,len(opt_dict)) Try printing to understand
                if w[0]<0:
                    optimized=True
                    #print("optimized a step")
                else:
                    w = w-step
                    
            # sorting ||w|| to put the smallest ||w|| at poition 0 
            norms = sorted([n for n in opt_dict])
            #optimal values of w,b
            opt_choice = opt_dict[norms[0]]

            self.w=opt_choice[0]
            self.b=opt_choice[1]
            
            #start with new latest_optimum (initial values for w)
            latest_optimum = opt_choice[0][0]+step*2
    
    def predict(self,features):
        #sign(x.w+b)
        classification = np.sign(np.dot(np.array(features),self.w)+self.b)
        return (classification,np.dot(np.array(features),self.w)+self.b)

In [8]:
model = SVM()
model.fit(X_train, y_train)

In [19]:
predictions = model.predict(X_test)[0]
print(np.sum(predictions != np.array(y_test)))
print(y_test.shape)

365
(511,)
