# CS M145 Project

## Loading Essentials and Helper Functions 

In [1]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt 

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

import os
import random


# One seed for every random number generator the notebook can touch.
# Seeding only the stdlib `random` module leaves NumPy's global generator unseeded.
SEED = 148
random.seed(SEED)
np.random.seed(SEED)

## Load Datasets


In [2]:
# Read the training table and the test table from the working directory.
df, test_features_raw = map(pd.read_csv, ('train.csv', 'test.csv'))

## Process and Split Data

In [3]:
# Separate the target column ('Disease') from the remaining symptom columns.
label_column = 'Disease'
train_features_raw = df.drop(columns=[label_column])
train_labels = df[label_column]

In [4]:
# Collect the vocabulary of distinct symptom names across every training column.
train_symptoms_set = set()
for c in train_features_raw.columns:
    # Missing cells are dropped up front; names are trimmed of surrounding whitespace.
    train_symptoms_set.update(str(s).strip() for s in train_features_raw[c].dropna().unique())
# discard (not remove): a 'nan' placeholder is excluded if present, without
# raising KeyError when the data happens to contain none.
train_symptoms_set.discard('nan')
# Sort before numbering: iteration order of a set of strings can change between
# kernel sessions, which would give each symptom a different id on every run.
symptom_to_id = {val: ind for ind, val in enumerate(sorted(train_symptoms_set))}
len(train_symptoms_set)

131

In [5]:
# Check whether every symptom that occurs in the test data also occurs in the
# training data (symptoms missing from the training vocabulary are ignored when
# the features are encoded).
test_symptoms_set = set()
for c in test_features_raw.columns:
    # Missing cells are dropped up front; names are trimmed of surrounding whitespace.
    test_symptoms_set.update(str(s).strip() for s in test_features_raw[c].dropna().unique())
# discard (not remove): no KeyError when there is no 'nan' placeholder to exclude.
test_symptoms_set.discard('nan')
# The test vocabulary must be contained in the training vocabulary, not the reverse.
test_symptoms_set.issubset(train_symptoms_set)

True

In [6]:
def transform_feature(dataframe, symptom_ids=None):
    '''
    Multi-hot encode the symptom cells of a dataframe.

    dataframe: symptoms only (no label), one row per entry. Each cell is
        converted to a string and stripped of surrounding whitespace before
        it is looked up.
    symptom_ids: optional dict mapping a stripped symptom name to its
        position in the output vector. Defaults to the notebook-level
        `symptom_to_id`.

    Returns the result of a row-wise `dataframe.apply`: for every row, a
    float numpy array of length len(symptom_ids) holding 1 at the id of each
    symptom found in the mapping and 0 elsewhere. Cells that are not in the
    mapping (missing values, unknown symptoms) are ignored.
    '''
    vocab = symptom_to_id if symptom_ids is None else symptom_ids

    def subfunction(row):
        '''
        maps every cell of the row to its id (skipping cells without one) and
        returns a numpy array of length len(vocab) with the elements at those
        ids set to 1 and 0 elsewhere
        '''
        names = (str(cell).strip() for cell in row)
        # Explicit integer dtype so a row with no known symptom still indexes cleanly.
        ids = np.array([vocab[name] for name in names if name in vocab], dtype=int)
        ans = np.zeros(len(vocab))
        ans[ids] = 1
        return ans

    transformed_df = dataframe.apply(subfunction, axis=1)
    return transformed_df

In [7]:
# Encode both tables with the same symptom vocabulary and stack the per-row
# vectors into one 2-D array per table.
train_features_transformed, test_features_transformed = (
    np.stack(transform_feature(raw_table).to_numpy())
    for raw_table in (train_features_raw, test_features_raw)
)

`train_features_transformed` contains the transformed training features (used for training), while `test_features_transformed` contains the transformed test features (used for testing).

## kNN Cross Validation

In [22]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

# Score k-nearest-neighbour classifiers for k = 1..17 with 5-fold
# cross-validation and print the per-fold scores for each k.
for i in range(1, 18):
    clf = KNeighborsClassifier(n_neighbors=i)
    scores = cross_val_score(clf, train_features_transformed, train_labels, cv=5)
    # k is an integer: format it with 'd' (the old 'f' spec printed "1.000000").
    print("{:2d}:{}".format(i, scores))



1.000000:[1. 1. 1. 1. 1.]
2.000000:[1. 1. 1. 1. 1.]
3.000000:[1. 1. 1. 1. 1.]
4.000000:[1. 1. 1. 1. 1.]
5.000000:[1. 1. 1. 1. 1.]
6.000000:[1.         0.98734177 0.98734177 0.98734177 0.98717949]




7.000000:[1.         0.98734177 0.98734177 0.98734177 0.98717949]
8.000000:[1.         0.97468354 0.97468354 0.98734177 0.98717949]
9.000000:[0.97468354 0.94936709 0.97468354 0.98734177 0.97435897]
10.000000:[0.94936709 0.91139241 0.97468354 0.98734177 0.94871795]
11.000000:[0.93670886 0.89873418 0.93670886 0.97468354 0.92307692]
12.000000:[0.91139241 0.86075949 0.89873418 0.96202532 0.91025641]




13.000000:[0.89873418 0.86075949 0.88607595 0.96202532 0.91025641]
14.000000:[0.89873418 0.86075949 0.88607595 0.93670886 0.88461538]
15.000000:[0.89873418 0.83544304 0.88607595 0.91139241 0.85897436]
16.000000:[0.87341772 0.82278481 0.88607595 0.88607595 0.79487179]
17.000000:[0.84810127 0.82278481 0.83544304 0.87341772 0.75641026]




## Cosine Similarity

In [52]:
# Build one unit-length vector per disease: the sum of the encoded symptom
# vectors of all its training rows, scaled to norm 1.
diseases = train_labels.unique().astype(str)
disease_to_avg_vector = {}
for disease in diseases:
    # Drop the label column so a disease name is never looked up as a symptom.
    subset = df.loc[df['Disease'] == disease].drop(columns=['Disease'])
    subset_transformed = np.stack(transform_feature(subset).to_numpy()).sum(axis=0)
    norm = np.linalg.norm(subset_transformed)
    # Leave an all-zero vector untouched: 0/0 would produce NaN, and a NaN
    # similarity is selected by np.argmax ahead of every real score.
    if norm > 0:
        subset_transformed /= norm
    disease_to_avg_vector[disease] = subset_transformed

In [55]:
def cos_clf(entry, classes=None, class_vectors=None):
    '''
    Predict the class whose reference vector has the highest dot product
    with the normalized entry.

    entry: 1-D numeric numpy array (one encoded row). It is not modified.
    classes: optional indexable sequence of class names. Defaults to the
        notebook-level `diseases`.
    class_vectors: optional dict mapping each class name to its reference
        vector. Defaults to the notebook-level `disease_to_avg_vector`.

    Returns the element of `classes` with the highest score; ties (including
    an all-zero entry, which scores 0 everywhere) resolve to the first one.
    '''
    if classes is None:
        classes = diseases
    if class_vectors is None:
        class_vectors = disease_to_avg_vector
    norm = np.linalg.norm(entry)
    # Divide into a new array instead of `entry /= norm`: the in-place form
    # rewrote the caller's feature matrix row by row, and 0/0 produced NaN.
    unit = entry / norm if norm > 0 else np.asarray(entry, dtype=float)
    scores = [np.dot(unit, class_vectors[name]) for name in classes]
    return classes[np.argmax(scores)]
# Classify every encoded test row, then save the predictions to "attempt.csv"
# with a 1-based index written under the column name 'ID'.
preds = list(map(cos_clf, test_features_transformed))
out = pd.DataFrame(preds, columns=['Disease'])
out.index = out.index + 1
out.to_csv("attempt.csv", index_label='ID')