In [1]:
pip install kneed

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting kneed
  Downloading kneed-0.8.1-py2.py3-none-any.whl (10 kB)
Installing collected packages: kneed
Successfully installed kneed-0.8.1


In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from kneed import KneeLocator
from sklearn.cluster import KMeans
from pandas.io.parsers.readers import read_fwf
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics  import roc_auc_score,accuracy_score
import csv
from sklearn.preprocessing import OneHotEncoder

In [2]:
train = pd.read_csv('train_final.csv')
test = pd.read_csv('test_final.csv')
test = test.iloc[:,1:]
print(train.columns)
print(test.columns)

Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country',
       'income>50K'],
      dtype='object')
Index(['age', 'workclass', 'fnlwgt', 'education', 'education.num',
       'marital.status', 'occupation', 'relationship', 'race', 'sex',
       'capital.gain', 'capital.loss', 'hours.per.week', 'native.country'],
      dtype='object')


In [3]:
train = train.replace('?',np.NaN)
print(train.isna().sum())
test = test.replace('?',np.NaN)
print(test.isna().sum())

age                  0
workclass         1437
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1442
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     427
income>50K           0
dtype: int64
age                  0
workclass         1362
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1367
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     430
dtype: int64


In [4]:
train['workclass'] = train['workclass'].fillna(train['workclass'].mode()[0])
train['occupation'] = train['occupation'].fillna(train['occupation'].mode()[0])
train['native.country'] = train['native.country'].fillna(train['native.country'].mode()[0])

test['workclass'] = test['workclass'].fillna(test['workclass'].mode()[0])
test['occupation'] = test['occupation'].fillna(test['occupation'].mode()[0])
test['native.country'] = test['native.country'].fillna(test['native.country'].mode()[0])

In [5]:
train.drop(columns = ['education'],inplace=True)
test.drop(columns = ['education'],inplace=True)

In [6]:
train_categorical = train.select_dtypes(include=['object']).copy()
print("train: ", train_categorical.columns)

test_categorical = test.select_dtypes(include=['object']).copy()
print("test: ",test_categorical.columns)

train:  Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')
test:  Index(['workclass', 'marital.status', 'occupation', 'relationship', 'race',
       'sex', 'native.country'],
      dtype='object')


In [7]:
# labelencoder = LabelEncoder()
# encoded_data = train_categorical
# for category in train_categorical.columns:
#     encoded_data[category] = labelencoder.fit_transform(encoded_data[category])
# train_categorical = encoded_data
# print(train_categorical.head())

# encoded_test = test_categorical
# for category in test_categorical.columns:
#     encoded_test[category] = labelencoder.fit_transform(encoded_test[category])
# test_categorical = encoded_test
# print(test_categorical.head())

encoder = OneHotEncoder(handle_unknown = 'ignore')
encoder.fit(train_categorical)
train_categorical = pd.DataFrame(encoder.transform(train_categorical).toarray(), columns = encoder.get_feature_names_out())
print(train_categorical.head())

test_categorical = pd.DataFrame(encoder.transform(test_categorical).toarray(), columns = encoder.get_feature_names_out())
print(test_categorical.head())

   workclass_Federal-gov  workclass_Local-gov  workclass_Never-worked  \
0                    0.0                  0.0                     0.0   
1                    0.0                  0.0                     0.0   
2                    0.0                  0.0                     0.0   
3                    0.0                  0.0                     0.0   
4                    0.0                  0.0                     0.0   

   workclass_Private  workclass_Self-emp-inc  workclass_Self-emp-not-inc  \
0                0.0                     0.0                         1.0   
1                0.0                     0.0                         1.0   
2                1.0                     0.0                         0.0   
3                1.0                     0.0                         0.0   
4                1.0                     0.0                         0.0   

   workclass_State-gov  workclass_Without-pay  marital.status_Divorced  \
0                  0.0        

In [8]:
train_numerical = train.select_dtypes(include=['int64']).copy()
train_numerical.head()

test_numerical = test.select_dtypes(include=['int64']).copy()
test_numerical.head()

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,33,222162,9,0,0,40
1,68,29240,9,0,0,12
2,34,103596,9,0,0,40
3,57,103403,3,0,0,40
4,48,152915,10,0,0,40


In [9]:
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train_numerical.drop(columns=['income>50K'],axis=1))
train_scaled

test_scaled = scaler.fit_transform(test_numerical)
test_scaled

array([[-0.40809249,  0.29971762, -0.4218404 , -0.14246164, -0.2167449 ,
        -0.02523259],
       [ 2.1464691 , -1.53104332, -0.4218404 , -0.14246164, -0.2167449 ,
        -2.28930117],
       [-0.33510502, -0.82543141, -0.4218404 , -0.14246164, -0.2167449 ,
        -0.02523259],
       ...,
       [ 2.07348163, -0.07589195, -0.03198387,  2.58844601, -0.2167449 ,
        -1.64242443],
       [ 0.54074467,  0.7981801 ,  0.74772919, -0.14246164, -0.2167449 ,
        -0.02523259],
       [ 2.00049415, -1.35910996, -1.59140999,  0.33028189, -0.2167449 ,
        -0.02523259]])

In [10]:
train_scaled = pd.DataFrame(data = train_scaled,columns=train_numerical.columns[:-1])
train_scaled['income>50K'] = train_numerical['income>50K']
train_scaled

test_scaled = pd.DataFrame(data = test_scaled,columns=test_numerical.columns)
test_scaled

Unnamed: 0,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week
0,-0.408092,0.299718,-0.421840,-0.142462,-0.216745,-0.025233
1,2.146469,-1.531043,-0.421840,-0.142462,-0.216745,-2.289301
2,-0.335105,-0.825431,-0.421840,-0.142462,-0.216745,-0.025233
3,1.343607,-0.827263,-2.760980,-0.142462,-0.216745,-0.025233
4,0.686720,-0.357412,-0.031984,-0.142462,-0.216745,-0.025233
...,...,...,...,...,...,...
23837,-0.919005,-1.396594,1.137586,-0.142462,-0.216745,-0.025233
23838,-0.846017,-0.704192,1.137586,-0.142462,-0.216745,-0.025233
23839,2.073482,-0.075892,-0.031984,2.588446,-0.216745,-1.642424
23840,0.540745,0.798180,0.747729,-0.142462,-0.216745,-0.025233


In [11]:
final_train = pd.concat([train_categorical,train_scaled],axis=1)
print(final_train.shape)

final_test = pd.concat([test_categorical,test_scaled],axis=1)
print(final_test.shape)

(25000, 89)
(23842, 88)


In [12]:
x = final_train.drop('income>50K',axis = 1)
y = final_train['income>50K']

In [13]:
ros = RandomOverSampler()
x_sampled,y_sampled = ros.fit_resample(x,y)

In [14]:
def optimal_clusters(data):
    wcss_dist=[]
    for i in range (1,8):
        kmeans=KMeans(n_clusters=i,init='k-means++',random_state=10)
        kmeans.fit(data)
        wcss_dist.append(kmeans.inertia_)
    kn = KneeLocator(range(1, 8), wcss_dist, curve='convex', direction='decreasing')
    return kn.knee

no_of_clusters = optimal_clusters(x_sampled)
no_of_clusters

4

In [15]:
kmeans = None
def create_clusters(data,no_of_clusters):
    global kmeans
    kmeans = KMeans(n_clusters=no_of_clusters,init='k-means++',random_state=10)
    cluster_nos = kmeans.fit_predict(data)
    data['cluster_no'] = cluster_nos

create_clusters(x_sampled,no_of_clusters)
print(kmeans)


KMeans(n_clusters=4, random_state=10)


In [16]:
x_sampled['Labels'] = y_sampled
x_sampled.head()

Unnamed: 0,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,marital.status_Divorced,marital.status_Married-AF-spouse,...,native.country_Vietnam,native.country_Yugoslavia,age,fnlwgt,education.num,capital.gain,capital.loss,hours.per.week,cluster_no,Labels
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.042815,-0.901083,1.911731,-0.147005,-0.217501,-0.042502,3,1
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-0.415003,-0.617791,1.135501,-0.147005,-0.217501,-0.042502,3,1
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.60547,-0.415352,-0.41696,-0.147005,-0.217501,-0.042502,3,0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.095233,-0.701375,-0.41696,-0.147005,-0.217501,-0.042502,1,0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.022343,-0.691565,-0.41696,-0.147005,-0.217501,0.763076,3,0


In [20]:
def parameters_naive_bayes(x,y):
    print("Naive Bayes")
    naivebayes_parameters = {"var_smoothing": [1e-8,1e-9,1e-10,1e-11]}
    grid = GridSearchCV(estimator=GaussianNB(),param_grid=naivebayes_parameters,verbose=3)
    grid.fit(x,y)
    gnb = GaussianNB(var_smoothing=grid.best_params_['var_smoothing'])
    gnb.fit(x, y)
    return gnb
    
def parameters_neural_network(x,y):
    print("Neural Network")
    nn_parameters = {
        "activation":['tanh','relu'],
        "solver":['adam','sgd'],
        "hidden_layer_sizes":[(10,10),(15,15)],
        "max_iter":[600,700],
        "tol":[0.0001]
    }
    grid= GridSearchCV(MLPClassifier(),nn_parameters, verbose=3)
    grid.fit(x,y)
    nn = MLPClassifier(activation=grid.best_params_['activation'], solver= grid.best_params_['solver'],hidden_layer_sizes=grid.best_params_['hidden_layer_sizes'])
    nn.fit(x,y)
    return nn

def parameters_random_forest(x,y):
    print("Random Forest")
    rf_parameters = {"criterion": ['gini', 'entropy'],"max_depth": range(5, 10, 1),"n_estimators": [100,130,150]}
    grid= GridSearchCV(RandomForestClassifier(),rf_parameters, verbose=3)
    grid.fit(x,y)
    rf = RandomForestClassifier(criterion=grid.best_params_['criterion'], n_estimators= grid.best_params_['n_estimators'],max_depth=grid.best_params_['max_depth'])
    rf.fit(x,y)    
    return rf

def parameters_xgboost(x,y):
    print("XGBoost")
    xgboost_parameters = {"max_depth": range(5, 10, 1),"n_estimators": [100, 130,150]}
    grid= GridSearchCV(XGBClassifier(objective='binary:logistic'),xgboost_parameters,verbose=3)
    grid.fit(x, y)
    xgb = XGBClassifier(max_depth=grid.best_params_['max_depth'],n_estimators= grid.best_params_['n_estimators'])
    xgb.fit(x,y)
    return xgb

def best_model_selection(x_train,y_train,x_test,y_test):
    model_accuracy = {}
    random_forest=parameters_random_forest(x_train,y_train)
    pred_rf=random_forest.predict(x_test)
    if len(y_test.unique()) > 1: 
        rf_score = roc_auc_score(y_test, pred_rf)
    else:
        rf_score = accuracy_score(y_test, pred_rf)
    model_accuracy['random_forest'] = [rf_score,random_forest]
    
    neural_net = parameters_neural_network(x_train,y_train)
    pred_nn=neural_net.predict(x_test)
    if len(y_test.unique()) > 1: 
        nn_score = roc_auc_score(y_test, pred_nn)
    else:
        nn_score = accuracy_score(y_test, pred_nn) 
    model_accuracy['neural_net'] = [nn_score,neural_net]

    xgboost= parameters_xgboost(x_train,y_train)
    pred_xgb = xgboost.predict(x_test)
    if len(y_test.unique()) > 1: 
        xgbscore = roc_auc_score(y_test, pred_xgb)
    else:
        xgbscore = accuracy_score(y_test, pred_xgb)
    model_accuracy['xgboost'] = [xgbscore,xgboost]

    naive_bayes=parameters_naive_bayes(x_train,y_train)
    pred_nb=naive_bayes.predict(x_test) 
    if len(y_test.unique()) > 1: 
        nb_score = roc_auc_score(y_test, pred_nb)
    else:
        nb_score = accuracy_score(y_test, pred_nb)
    model_accuracy['naive_bayes'] = [nb_score,naive_bayes]

    best_model = None
    best_score = model_accuracy['xgboost'][0]
    for model_name,values in model_accuracy.items():
      if best_score <= values[0]:
        best_score = values[0]
        best_model = values[1]      
    return best_model

In [34]:
list_of_clusters=x_sampled['cluster_no'].unique()
list_of_clusters

array([3, 1, 0, 2], dtype=int32)

In [22]:
cluster_model = {}
for i in list_of_clusters:
    data=x_sampled[x_sampled['cluster_no']==i]
    attributes=data.drop(['Labels','cluster_no'],axis=1)
    label= data['Labels']
    x_train, x_test, y_train, y_test = train_test_split(attributes, label, test_size=1 / 3, random_state=65)
    best_model = best_model_selection(x_train,y_train,x_test,y_test)
    cluster_model[i] = best_model

Random Forest
Fitting 5 folds for each of 30 candidates, totalling 150 fits
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.717 total time=   0.5s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.714 total time=   0.5s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.719 total time=   0.5s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.716 total time=   0.5s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=100;, score=0.714 total time=   0.5s
[CV 1/5] END criterion=gini, max_depth=5, n_estimators=130;, score=0.715 total time=   0.6s
[CV 2/5] END criterion=gini, max_depth=5, n_estimators=130;, score=0.714 total time=   0.6s
[CV 3/5] END criterion=gini, max_depth=5, n_estimators=130;, score=0.715 total time=   0.6s
[CV 4/5] END criterion=gini, max_depth=5, n_estimators=130;, score=0.716 total time=   0.6s
[CV 5/5] END criterion=gini, max_depth=5, n_estimators=130;, score=0.709 total time=   0.6s
[CV 

In [35]:
cluster_model

{3: XGBClassifier(criterion='gini', max_depth=9, n_estimators=150),
 1: XGBClassifier(criterion='gini', max_depth=9, n_estimators=150),
 0: XGBClassifier(criterion='gini', max_depth=5, n_estimators=130),
 2: GaussianNB(var_smoothing=1e-08)}

In [41]:
for i in list_of_clusters:
  data=x_sampled[x_sampled['cluster_no']==i]
  labels = data['Labels']
  model = cluster_model[i]
  predicted = model.predict(data.drop(['cluster_no','Labels'],axis=1))
  accuracy = np.sum(labels == predicted)/len(predicted)
  print("cluster:",i," model:",model," training data:",len(predicted)," accuracy:",accuracy)

cluster: 3  model: XGBClassifier(criterion='gini', max_depth=9, n_estimators=150)  training data: 20116  accuracy: 0.8818850666136409
cluster: 1  model: XGBClassifier(criterion='gini', max_depth=9, n_estimators=150)  training data: 14969  accuracy: 0.9734785222793774
cluster: 0  model: XGBClassifier(criterion='gini', max_depth=5, n_estimators=130)  training data: 2496  accuracy: 0.9935897435897436
cluster: 2  model: GaussianNB(var_smoothing=1e-08)  training data: 387  accuracy: 1.0


In [24]:
test_clusters_groups = kmeans.predict(final_test)
test_clusters_groups

array([3, 1, 3, ..., 3, 3, 3], dtype=int32)

In [25]:
final_test['cluster_no'] = test_clusters_groups
test_clusters = final_test['cluster_no'].unique()
test_clusters

array([3, 1, 0, 2], dtype=int32)

In [26]:
def write_output(output=[],predictions = [],filename='output.csv'):
    fields = ['ID','Prediction']
    with open(filename, 'w',newline = '') as csvfile: 
        csvwriter = csv.writer(csvfile,delimiter=',') 
        csvwriter.writerow(fields) 
        for i,j in output:
            csvwriter.writerow([i,j])

In [33]:
output = []
for i in test_clusters:
    cluster_data= final_test[final_test['cluster_no']==i]
    cluster_data = cluster_data.drop(['cluster_no'],axis=1)
    model = cluster_model[i]
    result_prob = model.predict_proba(cluster_data)
    if result_prob.shape[1] == 1:
      final_result = [0.99999 for i in range(result_prob.shape[0])]
    else:  
      final_result = result_prob[:,1]
    print(final_result)
    indexes = cluster_data.index+1
    output+=list(zip(indexes,final_result))
print(output)
write_output(output)


[0.27483582 0.28953782 0.16329373 ... 0.96194214 0.64453185 0.00449514]
[0.01276089 0.05133997 0.03682694 ... 0.00036543 0.01083566 0.01344752]
[0.0185744  0.00366298 0.00577161 ... 0.09878014 0.9821034  0.00581302]
[0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 0.99999, 