In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
 ## "skipinitialspace = True"  is used to trim the white spaces from all cells in the dataframe

column_names = ['age', 'workclass', 'fnlwgt' , 'education', 'education_num' , 'marital_status', 'occupation', 'relationship', 'race', 'sex','capital_gain' ,'capital_loss' , 'hours_per_week', 'native_country', 'income']
data_df = pd.read_csv('adult.csv', names=column_names, skipinitialspace = True)
data_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [None]:
# data_df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)
# data_df.head()

#### Check count of unique values in each column.

In [5]:
data_df.nunique().sort_values()

sex                   2
income                2
race                  5
relationship          6
marital_status        7
workclass             9
occupation           15
education            16
education_num        16
native_country       42
age                  73
capital_loss         92
hours_per_week       94
capital_gain        119
fnlwgt            21648
dtype: int64

In [6]:
cat_columns = data_df.select_dtypes(include=['object']).columns
num_columns = data_df.select_dtypes(exclude=['object']).columns

print(f"Categorical columns : {cat_columns}")
print(f"Integer columns : {num_columns}")

Categorical columns : Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')
Integer columns : Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')


In [7]:
for col in cat_columns:
    print(f"\nNo of unique values in column \'{col.upper()}\' : {data_df[col].nunique()} and their values :\n{'####'*30} \n {data_df[col].unique()}")


No of unique values in column 'WORKCLASS' : 9 and their values :
######################################################################################################################## 
 ['State-gov' 'Self-emp-not-inc' 'Private' 'Federal-gov' 'Local-gov' '?'
 'Self-emp-inc' 'Without-pay' 'Never-worked']

No of unique values in column 'EDUCATION' : 16 and their values :
######################################################################################################################## 
 ['Bachelors' 'HS-grad' '11th' 'Masters' '9th' 'Some-college' 'Assoc-acdm'
 'Assoc-voc' '7th-8th' 'Doctorate' 'Prof-school' '5th-6th' '10th'
 '1st-4th' 'Preschool' '12th']

No of unique values in column 'MARITAL_STATUS' : 7 and their values :
######################################################################################################################## 
 ['Never-married' 'Married-civ-spouse' 'Divorced' 'Married-spouse-absent'
 'Separated' 'Married-AF-spouse' 'Widowed']

No of unique values 

## From the above output, columns 'WORKCLASS', 'OCCUPATION' and 'NATIVE_COUNTRY' Looking for Null values in the dataset

In [8]:
data_df.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
income            0
dtype: int64

In [9]:
for col in data_df.columns:
    if data_df[col].dtype=='O':
        print(f'{col} -- ', data_df[col].dtype, data_df[col].dtype=='O') 
    
print("\n\n", data_df.select_dtypes(include=['object']).columns)

workclass --  object True
education --  object True
marital_status --  object True
occupation --  object True
relationship --  object True
race --  object True
sex --  object True
native_country --  object True
income --  object True


 Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country', 'income'],
      dtype='object')


In [10]:
no_rows, no_cols = data_df.shape
no_rows

32561

### Finding count of unknown/null values in columns represented as '?'

In [11]:
cols_with_qm = ['workclass', 'occupation', 'native_country']

for col in cols_with_qm:
    print(f"Count of values with cell value as '?' for column {col.upper()} : {(data_df.loc[:, col]=='?').sum()} i.e {((data_df.loc[:, col]=='?').sum()/no_rows)*100} percentage of total values")


Count of values with cell value as '?' for column WORKCLASS : 1836 i.e 5.638647461687294 percentage of total values
Count of values with cell value as '?' for column OCCUPATION : 1843 i.e 5.660145572924664 percentage of total values
Count of values with cell value as '?' for column NATIVE_COUNTRY : 583 i.e 1.7904855501980899 percentage of total values


## Now replacing these  question marks with np.Nan values

In [12]:
for col in cols_with_qm:
    data_df[col].replace('?', np.NaN, inplace=True)

data_df[cols_with_qm].isnull().sum()

workclass         1836
occupation        1843
native_country     583
dtype: int64

## Filling missing values using Imputation techniques

In [13]:
data_df.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education_num        0
marital_status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital_gain         0
capital_loss         0
hours_per_week       0
native_country     583
income               0
dtype: int64

## Encoding labels

In [14]:
from sklearn import preprocessing
label_encoder = preprocessing.LabelEncoder()
data_df['income']= label_encoder.fit_transform(data_df['income'])
data_df['income'].unique()

array([0, 1])

In [15]:
data_df['income'].value_counts()

0    24720
1     7841
Name: income, dtype: int64

In [16]:
labels = data_df['income']
data_df.drop(['income'], inplace=True, axis=1)
data_df.head(2)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States


## Train Test Split data

In [17]:
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(data_df, labels, test_size=0.2, random_state=567, stratify=labels)

In [18]:
train_x.shape

(26048, 14)

In [19]:
cat_cols = train_x.select_dtypes(include=['object']).columns
int_cols = train_x.select_dtypes(exclude=['object']).columns

print(cat_cols)
print(int_cols)

Index(['workclass', 'education', 'marital_status', 'occupation',
       'relationship', 'race', 'sex', 'native_country'],
      dtype='object')
Index(['age', 'fnlwgt', 'education_num', 'capital_gain', 'capital_loss',
       'hours_per_week'],
      dtype='object')


## Imputation techniques

### MICE (Multi-Variate Imputation by Chained Equations) method

In [20]:
train_x[int_cols].head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
23429,36,215392,11,0,0,40
26543,41,193882,13,0,0,40
14032,48,155372,9,0,0,36
13229,53,83434,13,0,0,21
4897,36,398931,13,0,1485,50


In [21]:
from impyute.imputation.cs import mice

In [22]:
train_x[int_cols] = train_x.loc[:, int_cols].astype('float64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [23]:
train_x[int_cols].isnull().sum()

age               0
fnlwgt            0
education_num     0
capital_gain      0
capital_loss      0
hours_per_week    0
dtype: int64

In [24]:
## Since from the above output there are no null values in the numerical columns, we ignore this step

# train_x_int_mice_imputed = mice(train_x[int_cols].values)

### Using Simple Imputer for categorical columns with null values

In [25]:
from sklearn.impute import SimpleImputer

simp_imp = SimpleImputer(strategy='constant', fill_value='missing')
train_x = pd.DataFrame(data=simp_imp.fit_transform(train_x), columns=train_x.columns)


In [26]:
test_x = pd.DataFrame(data=simp_imp.fit_transform(test_x), columns=test_x.columns)

In [27]:
train_x.workclass.value_counts()

Private             18168
Self-emp-not-inc     2027
Local-gov            1686
missing              1450
State-gov            1033
Self-emp-inc          898
Federal-gov           768
Without-pay            12
Never-worked            6
Name: workclass, dtype: int64

### KNN Method

In [28]:
# from fancyimpute import KNN

# imputer = KNN()
# train_x_imputed = pd.DataFrame(np.round(imputer.fit_transform(train_x)),columns = train_x.columns)
# train_x_imputed.head()

### SKlearn KNN Impute method

In [29]:
# from sklearn.impute import KNNImputer
# from sklearn.preprocessing import LabelEncoder
                                     
# knn = KNNImputer(n_neighbors=10, add_indicator=True)
# # knn = KNNImputer(missing_values=np.nan, n_neighbors=5, weights='uniform', metric='nan_euclidean')

# knn.fit(train_x)
# train_x[cat_cols] = pd.DataFrame(data=knn.fit_transform(train_x[cat_cols]), columns=train_x.columns)

# train_x.head()

In [30]:
# from sklearn.impute import SimpleImputer
# strategies = ['mean', 'median', 'most_frequent', 'constant']

# sim_imp = SimpleImputer(strategy='most_frequent')

# for col in cols_with_qm:
#     data_df[col]=sim_imp.fit_transform(data_df[col].values.reshape(-1,1))[:, 0]

# data_df[cols_with_qm].head()

In [31]:
train_x.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education_num     0
marital_status    0
occupation        0
relationship      0
race              0
sex               0
capital_gain      0
capital_loss      0
hours_per_week    0
native_country    0
dtype: int64

## Encoding Categorical data

In [32]:
import category_encoders as ce

In [33]:
cat_enc = ce.OneHotEncoder(cols=cat_cols, return_df=True)

train_x = cat_enc.fit_transform(train_x)
test_x = cat_enc.transform(test_x)

In [34]:
train_x.head(5)

Unnamed: 0,age,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,workclass_8,workclass_9,...,native_country_33,native_country_34,native_country_35,native_country_36,native_country_37,native_country_38,native_country_39,native_country_40,native_country_41,native_country_42
0,36,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,41,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,48,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,53,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,36,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Scaling Features

In [35]:
from sklearn.preprocessing import RobustScaler, StandardScaler, MinMaxScaler

columns = train_x.columns

# scaler = RobustScaler()
# train_x  = pd.DataFrame( scaler.fit_transform(train_x), columns=train_x.columns)
# test_x = pd.DataFrame( scaler.transform(test_x), columns=test_x.columns)

scaler = MinMaxScaler()
train_x  = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

In [36]:
train_x.head(3)

Unnamed: 0,age,workclass_1,workclass_2,workclass_3,workclass_4,workclass_5,workclass_6,workclass_7,workclass_8,workclass_9,...,native_country_33,native_country_34,native_country_35,native_country_36,native_country_37,native_country_38,native_country_39,native_country_40,native_country_41,native_country_42
0,-0.05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.55,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
from lazypredict.Supervised import LazyClassifier, LazyRegressor

lzc = LazyClassifier(verbose=1,
    ignore_warnings=True,
    predictions=True,
    random_state=42,
    classifiers='all')

models, preds = lzc.fit(train_x, test_x, train_y, test_y)

  3%|██▊                                                                                | 1/29 [00:02<01:01,  2.20s/it]

{'Model': 'AdaBoostClassifier', 'Accuracy': 0.8610471364962383, 'Balanced Accuracy': 0.7769631765749778, 'ROC AUC': 0.7769631765749778, 'F1 Score': 0.8556816469942363, 'Time taken': 2.203733205795288}


 10%|████████▌                                                                          | 3/29 [00:04<00:33,  1.30s/it]

{'Model': 'BaggingClassifier', 'Accuracy': 0.8433901427913404, 'Balanced Accuracy': 0.7559713609913126, 'ROC AUC': 0.7559713609913127, 'F1 Score': 0.8380043802235441, 'Time taken': 2.246957540512085}
{'Model': 'BernoulliNB', 'Accuracy': 0.7798249654537079, 'Balanced Accuracy': 0.7831425011865212, 'ROC AUC': 0.7831425011865211, 'F1 Score': 0.7922701725528635, 'Time taken': 0.1882338523864746}


 17%|██████████████▎                                                                    | 5/29 [00:36<03:28,  8.67s/it]

{'Model': 'CalibratedClassifierCV', 'Accuracy': 0.8547520343927529, 'Balanced Accuracy': 0.7684622686283815, 'ROC AUC': 0.7684622686283815, 'F1 Score': 0.8491649204535106, 'Time taken': 32.164515018463135}


 24%|████████████████████                                                               | 7/29 [00:37<01:28,  4.00s/it]

{'Model': 'DecisionTreeClassifier', 'Accuracy': 0.8139106402579457, 'Balanced Accuracy': 0.7520191494191206, 'ROC AUC': 0.7520191494191205, 'F1 Score': 0.8150894887032895, 'Time taken': 0.46400022506713867}
{'Model': 'DummyClassifier', 'Accuracy': 0.6361123906034085, 'Balanced Accuracy': 0.5007877468479809, 'ROC AUC': 0.5007877468479809, 'F1 Score': 0.635553689456495, 'Time taken': 0.14003324508666992}


 28%|██████████████████████▉                                                            | 8/29 [00:37<00:58,  2.79s/it]

{'Model': 'ExtraTreeClassifier', 'Accuracy': 0.7910333179794258, 'Balanced Accuracy': 0.716265721404841, 'ROC AUC': 0.716265721404841, 'F1 Score': 0.7914614234960989, 'Time taken': 0.19596338272094727}


 31%|█████████████████████████▊                                                         | 9/29 [00:42<01:10,  3.53s/it]

{'Model': 'ExtraTreesClassifier', 'Accuracy': 0.8283433133732535, 'Balanced Accuracy': 0.7515064949134356, 'ROC AUC': 0.7515064949134356, 'F1 Score': 0.8258302085924, 'Time taken': 5.1493682861328125}


 34%|████████████████████████████▎                                                     | 10/29 [00:43<00:47,  2.50s/it]

{'Model': 'GaussianNB', 'Accuracy': 0.4375863657300783, 'Balanced Accuracy': 0.6211330373908917, 'ROC AUC': 0.6211330373908917, 'F1 Score': 0.4276896898053425, 'Time taken': 0.20917248725891113}


 38%|███████████████████████████████                                                   | 11/29 [01:07<02:46,  9.26s/it]

{'Model': 'KNeighborsClassifier', 'Accuracy': 0.8205128205128205, 'Balanced Accuracy': 0.7365503317100348, 'ROC AUC': 0.7365503317100348, 'F1 Score': 0.8170014132156714, 'Time taken': 24.57962656021118}


 41%|█████████████████████████████████▉                                                | 12/29 [02:06<06:56, 24.47s/it]

{'Model': 'LabelPropagation', 'Accuracy': 0.7959465684016582, 'Balanced Accuracy': 0.7129683534182125, 'ROC AUC': 0.7129683534182125, 'F1 Score': 0.7942625426402606, 'Time taken': 59.26423454284668}


 45%|████████████████████████████████████▊                                             | 13/29 [03:12<09:49, 36.87s/it]

{'Model': 'LabelSpreading', 'Accuracy': 0.7984031936127745, 'Balanced Accuracy': 0.7152394451208188, 'ROC AUC': 0.7152394451208188, 'F1 Score': 0.7965017422733167, 'Time taken': 65.39831256866455}


 48%|███████████████████████████████████████▌                                          | 14/29 [03:13<06:29, 25.94s/it]

{'Model': 'LinearDiscriminantAnalysis', 'Accuracy': 0.8427759864885613, 'Balanced Accuracy': 0.7490339525597904, 'ROC AUC': 0.7490339525597904, 'F1 Score': 0.836069163536881, 'Time taken': 0.684241533279419}


 52%|██████████████████████████████████████████▍                                       | 15/29 [03:21<04:48, 20.61s/it]

{'Model': 'LinearSVC', 'Accuracy': 0.8526024873330262, 'Balanced Accuracy': 0.7666111667113762, 'ROC AUC': 0.7666111667113762, 'F1 Score': 0.8471492043684686, 'Time taken': 8.239470481872559}


 59%|████████████████████████████████████████████████                                  | 17/29 [03:21<02:02, 10.22s/it]

{'Model': 'LogisticRegression', 'Accuracy': 0.8512206356517734, 'Balanced Accuracy': 0.7667899831823528, 'ROC AUC': 0.7667899831823527, 'F1 Score': 0.8461255095531539, 'Time taken': 0.4914374351501465}
{'Model': 'NearestCentroid', 'Accuracy': 0.7594042683863043, 'Balanced Accuracy': 0.7912533403148924, 'ROC AUC': 0.7912533403148924, 'F1 Score': 0.7756167122400222, 'Time taken': 0.1556553840637207}


 69%|████████████████████████████████████████████████████████▌                         | 20/29 [03:22<00:32,  3.64s/it]

{'Model': 'PassiveAggressiveClassifier', 'Accuracy': 0.7778289574696761, 'Balanced Accuracy': 0.7012548750541673, 'ROC AUC': 0.7012548750541673, 'F1 Score': 0.7789388103683671, 'Time taken': 0.255997896194458}
{'Model': 'Perceptron', 'Accuracy': 0.7888837709196991, 'Balanced Accuracy': 0.6697727296176307, 'ROC AUC': 0.6697727296176306, 'F1 Score': 0.7782013659257178, 'Time taken': 0.21604585647583008}


 72%|███████████████████████████████████████████████████████████▍                      | 21/29 [03:22<00:21,  2.68s/it]

{'Model': 'QuadraticDiscriminantAnalysis', 'Accuracy': 0.5984953170581913, 'Balanced Accuracy': 0.6365082875920844, 'ROC AUC': 0.6365082875920843, 'F1 Score': 0.6273819887196069, 'Time taken': 0.4280102252960205}


 79%|█████████████████████████████████████████████████████████████████                 | 23/29 [03:26<00:12,  2.15s/it]

{'Model': 'RandomForestClassifier', 'Accuracy': 0.8524489482573315, 'Balanced Accuracy': 0.777833850931677, 'ROC AUC': 0.777833850931677, 'F1 Score': 0.8491398049122639, 'Time taken': 3.694007396697998}
{'Model': 'RidgeClassifier', 'Accuracy': 0.839705204974666, 'Balanced Accuracy': 0.7269772987557005, 'ROC AUC': 0.7269772987557005, 'F1 Score': 0.8285321471089622, 'Time taken': 0.19999265670776367}


 83%|███████████████████████████████████████████████████████████████████▊              | 24/29 [03:27<00:08,  1.65s/it]

{'Model': 'RidgeClassifierCV', 'Accuracy': 0.839705204974666, 'Balanced Accuracy': 0.7269772987557005, 'ROC AUC': 0.7269772987557005, 'F1 Score': 0.8285321471089622, 'Time taken': 0.48800039291381836}


 86%|██████████████████████████████████████████████████████████████████████▋           | 25/29 [03:27<00:05,  1.33s/it]

{'Model': 'SGDClassifier', 'Accuracy': 0.8460003070781514, 'Balanced Accuracy': 0.7672719429025401, 'ROC AUC': 0.7672719429025402, 'F1 Score': 0.8422196561250589, 'Time taken': 0.5917069911956787}


 90%|█████████████████████████████████████████████████████████████████████████▌        | 26/29 [04:28<00:56, 18.97s/it]

{'Model': 'SVC', 'Accuracy': 0.8515277138031629, 'Balanced Accuracy': 0.7547973499308722, 'ROC AUC': 0.7547973499308723, 'F1 Score': 0.8438723668345367, 'Time taken': 60.13037657737732}


 97%|███████████████████████████████████████████████████████████████████████████████▏  | 28/29 [04:33<00:11, 11.57s/it]

{'Model': 'XGBClassifier', 'Accuracy': 0.8734838016275142, 'Balanced Accuracy': 0.8051876766884711, 'ROC AUC': 0.8051876766884711, 'F1 Score': 0.8704285438954787, 'Time taken': 5.852156162261963}


100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [04:34<00:00,  9.47s/it]

{'Model': 'LGBMClassifier', 'Accuracy': 0.8713342545677875, 'Balanced Accuracy': 0.8005056256577454, 'ROC AUC': 0.8005056256577454, 'F1 Score': 0.8679503994240579, 'Time taken': 0.7869865894317627}





In [50]:
models

Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.87,0.81,0.81,0.87,5.85
LGBMClassifier,0.87,0.8,0.8,0.87,0.79
NearestCentroid,0.76,0.79,0.79,0.78,0.16
BernoulliNB,0.78,0.78,0.78,0.79,0.19
RandomForestClassifier,0.85,0.78,0.78,0.85,3.69
AdaBoostClassifier,0.86,0.78,0.78,0.86,2.2
CalibratedClassifierCV,0.85,0.77,0.77,0.85,32.16
SGDClassifier,0.85,0.77,0.77,0.84,0.59
LogisticRegression,0.85,0.77,0.77,0.85,0.49
LinearSVC,0.85,0.77,0.77,0.85,8.24
