In [None]:
import pandas as pd
import random,time,csv
import numpy as np
import math,copy,os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
from statistics import mode

# Neighbourhood sizes for the nearest-neighbour search. Only k_even is used in
# the cells visible here (k_odd is presumably used elsewhere -- TODO confirm).
k_odd = 11
k_even = 6
# Search structure and distance metric handed to sklearn's NearestNeighbors.
algorithm = 'kd_tree'
metric = 'euclidean'

# Data preprocessing

In [None]:
## Load dataset
from sklearn import preprocessing
# ' ?' (with the leading space) is parsed as NaN so those rows can be dropped below.
df_orig = pd.read_csv('dataset/adult.data.csv',na_values = ' ?')

## Drop NULL values
df_orig = df_orig.dropna(how='any',axis=0)

## Work on a copy: df_orig keeps the raw, human-readable values
df = copy.deepcopy(df_orig)

## One-hot encode the categorical columns (dummies are appended in this order)
categorical_columns = ['workclass', 'education', 'marital-status', 'occupation', 'relationship']
for column in categorical_columns:
    df = pd.concat([df, pd.get_dummies(df[column], prefix=column)], axis=1)

## Drop the raw categorical columns; native-country is dropped without being encoded
df = df.drop(categorical_columns + ['native-country'], axis=1)


## Change symbolics to numerics
df['sex'] = np.where(df['sex'] == ' Male', 1, 0)
df['race'] = np.where(df['race'] != ' White', 0, 1)
df['Probability'] = np.where(df['Probability'] == ' <=50K', 0, 1)


## Discretize age into decade buckets: <10 -> 0, 10-19 -> 10, ..., 60-69 -> 60, >=70 -> 70.
## The previous np.where chain tested `(age >= 10) & (age < 10)`, which is never
## true, so ages 10-19 were left un-bucketed.
df['age'] = (df['age'] // 10 * 10).clip(lower=0, upper=70)


from sklearn.preprocessing import MinMaxScaler

## Scale every column to [0, 1].
## NOTE(review): the scaler is fit on the full frame (label column included) before
## the train/test split, so test-set statistics leak into the scaling -- confirm
## this is acceptable for the analysis.
## The rebuilt frame gets a fresh 0..n-1 index, i.e. its labels are row positions in df_orig.
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df),columns = df.columns)
df.shape

# Train/test split and logistic-regression baseline

In [None]:
# Fixed seed so that Restart Kernel -> Run All reproduces the same split, the same
# model and therefore the same downstream explanation files.
df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Separate the features from the 'Probability' label for both partitions.
feature_columns = df_train.columns != 'Probability'
X_train, y_train = df_train.loc[:, feature_columns], df_train['Probability']
X_test, y_test = df_test.loc[:, feature_columns], df_test['Probability']

# Baseline classifier with scikit-learn's default hyper-parameters.
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Feature importance
# import matplotlib.pyplot as plt
# y = np.arange(len(df.columns)-1)
# plt.barh(y,clf.coef_[0])
# plt.yticks(y,df.columns)
# plt.show()

# print(clf.coef_[0])

# Evaluate on the held-out partition.
y_pred = clf.predict(X_test)
cnf_matrix = confusion_matrix(y_test, y_pred)

print(cnf_matrix)
print(classification_report(y_test, y_pred))

# Train unsupervised KDTree on training data

In [None]:
from sklearn.neighbors import KDTree
from sklearn.neighbors import NearestNeighbors
import numpy as np


# Build the neighbour index over the training features; queries against it
# return row positions within X_train.
neighbor_search = NearestNeighbors(
    n_neighbors=k_even,
    algorithm=algorithm,
    metric=metric,
)
nbrs = neighbor_search.fit(X_train)


# Options for `algorithm`: 'auto', 'ball_tree', 'kd_tree', 'brute'
# Options for `metric`: 'euclidean', 'manhattan', 'chebyshev', 'minkowski',
#                       'wminkowski', 'seuclidean', 'mahalanobis'

In [None]:
# Keep only the feature columns of the test partition. Rebinding (instead of an
# in-place drop on the split result) avoids SettingWithCopyWarning, and
# errors='ignore' makes the cell safe to re-run.
df_test = df_test.drop(columns=['Probability'], errors='ignore')

def knnPred(df_orig,index_list,y_pred):
    """Compare a classifier prediction with a majority vote over selected rows.

    Parameters
    ----------
    df_orig : pandas.DataFrame
        Frame with a 'Probability' column holding the raw labels; rows equal to
        ' >50K' count as class 1, everything else as class 0.
    index_list : sequence of int
        Positional (``iloc``) row indices of the rows that take part in the vote.
    y_pred : sequence
        Classifier output; only ``y_pred[0]`` is compared with the vote.

    Returns
    -------
    pandas.DataFrame or str
        The selected rows (with 'Probability' recoded to 0/1) when the majority
        vote disagrees with ``y_pred[0]``; the string "OK" when they agree.
    """
    # Explicit copy: the recoding below must not write into a view of df_orig
    # (and must not raise SettingWithCopyWarning).
    df_explain = df_orig.iloc[index_list, :].copy()
    df_explain['Probability'] = np.where(df_explain['Probability'] == ' >50K', 1, 0)
    probability_list = df_explain['Probability']
    knn_pred = mode(probability_list)  ## Majority voting
    if y_pred[0] != knn_pred:
        return df_explain
    return "OK"
        
           
# For every test row the classifier labels 0, take a majority vote over the raw
# labels of its k_even nearest training rows plus the test row itself, and export
# the rows to CSV whenever the vote disagrees with the classifier.
output_dir = "Adult"
os.makedirs(output_dir, exist_ok=True)  # to_csv does not create missing directories

count = 0

# Predict the whole test partition once instead of once per row.
test_predictions = clf.predict(df_test) if len(df_test) else np.array([])

for position, (index, predicted) in enumerate(zip(df_test.index, test_predictions)):
    if predicted != 0:
        continue
    y_pred = test_predictions[position:position + 1]
    # One-row DataFrame keeps the feature names the estimators were fitted with.
    row_ = df_test.iloc[[position]]
    distances, knn_list = nbrs.kneighbors(row_)
    # kneighbors returns row positions *within X_train*; map them through
    # X_train.index to get the labels of the scaled frame, which (its index being
    # 0..n-1) are the row positions knnPred needs for df_orig.iloc.
    neighbor_rows = X_train.index[knn_list[0]].to_numpy()
    index_list = np.append(neighbor_rows, index)
    result = knnPred(df_orig, index_list, y_pred)
    # knnPred returns a DataFrame on disagreement and the string "OK" otherwise.
    if isinstance(result, pd.DataFrame):
        csv_name = os.path.join(output_dir, metric + "_Adult" + str(count) + ".csv")
        result.to_csv(csv_name)
        count += 1

print(count)