In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
df=pd.read_csv('adult.csv')

In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
print(df.isnull().sum())      

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
country           0
salary            0
dtype: int64


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       32561 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education       32561 non-null  object
 4   education-num   32561 non-null  int64 
 5   marital-status  32561 non-null  object
 6   occupation      32561 non-null  object
 7   relationship    32561 non-null  object
 8   race            32561 non-null  object
 9   sex             32561 non-null  object
 10  capital-gain    32561 non-null  int64 
 11  capital-loss    32561 non-null  int64 
 12  hours-per-week  32561 non-null  int64 
 13  country         32561 non-null  object
 14  salary          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [6]:
for feature in df.columns[:]:
    print(feature,":",len(df[feature].unique()))

age : 73
workclass : 9
fnlwgt : 21648
education : 16
education-num : 16
marital-status : 7
occupation : 15
relationship : 6
race : 5
sex : 2
capital-gain : 119
capital-loss : 92
hours-per-week : 94
country : 42
salary : 2


In [7]:
print((df['salary'].value_counts()))
print(f"<= 50k : {round(24720 /32561 * 100 , 2)}")
print(f"> 50k : {round(7841 /32561 * 100 , 2)}")

 <=50K    24720
 >50K      7841
Name: salary, dtype: int64
<= 50k : 75.92
> 50k : 24.08


#Data is imbalanced

In [9]:
#List numerical variables
numerical_features = [feature for feature in df.columns if df[feature].dtypes != 'O']

print('Number of numerical variables: ', len(numerical_features))

df[numerical_features].head()
numerical_features=df[numerical_features]

Number of numerical variables:  6


In [10]:
#List categorical variables
categorical_features = [feature for feature in df.columns if df[feature].dtypes == 'O']

print('Number of categorical variables: ', len(categorical_features))

df[categorical_features].head()
categorical_features=df[categorical_features]
categorical_features.head(1)

Number of categorical variables:  9


Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,country,salary
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K


In [11]:
for feature in  categorical_features:
    print(df[feature].value_counts()) 
    print("\n \n ")

 Private             22696
 Self-emp-not-inc     2541
 Local-gov            2093
 ?                    1836
 State-gov            1298
 Self-emp-inc         1116
 Federal-gov           960
 Without-pay            14
 Never-worked            7
Name: workclass, dtype: int64

 
 
 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64

 
 
 Married-civ-spouse       14976
 Never-married            10683
 Divorced                  4443
 Separated                 1025
 Widowed                    993
 Married-spouse-absent      418
 Married-AF-spouse           23
Name: marital-status, dtype: int64

 
 
 Prof-specialty       4140
 Craft-repair         4099
 

#we have [?] in data which is missing value 

In [13]:
df.isin([' ?']).sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
country            583
salary               0
dtype: int64

'''we can drop education column it is highly co related to education-num 
and it also seem like education column has already maping as education-num'''

In [14]:
df.drop('education', axis=1, inplace=True)

we dont have too much missing data we can use frequent category imputation 

# Feature engineering

In [15]:
nan_features=['country','occupation','workclass']

In [16]:
occupation=df["occupation"].mode()[0]
workclass=df["workclass"].mode()[0]
country=df['country'].mode()[0]

In [17]:
df['workclass'] = df['workclass'].replace('?', workclass )
df['occupation'] = df['occupation'].replace('?', occupation )
df['country'] = df['country'].replace('?', country )

In [18]:
df["salary"] = df["salary"].replace({'<=50K' : 0 , ">50K" : 1 } , regex = True)

In [19]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


## Encoding

In [20]:
from sklearn.preprocessing import  LabelEncoder

In [21]:
df = df.apply(LabelEncoder().fit_transform)
df .head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,country,salary
0,22,7,2671,12,4,1,1,4,1,25,0,39,39,0
1,33,6,2926,12,2,4,0,4,1,0,0,12,39,0
2,21,4,14086,8,0,6,1,4,1,0,0,39,39,0
3,36,4,15336,6,2,6,0,2,1,0,0,39,39,0
4,11,4,19355,12,2,10,5,2,0,0,0,39,5,0


In [22]:
nominal_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'country']

In [23]:
def onehot_encode(df, columns):
    for column in columns:
        dummies = pd.get_dummies(df[column])
        df = pd.concat([df, dummies], axis=1)
        df.drop(column, axis=1, inplace=True)
    return df

In [24]:
df = onehot_encode(df, nominal_features)

In [25]:
df.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,0,1,...,32,33,34,35,36,37,38,39,40,41
0,22,2671,12,1,25,0,39,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,33,2926,12,1,0,0,12,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,21,14086,8,1,0,0,39,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,36,15336,6,1,0,0,39,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,11,19355,12,0,0,0,39,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Model

In [26]:
from sklearn.preprocessing import MinMaxScaler

In [27]:
y = df['salary']
X = df.drop('salary', axis=1)

In [28]:
scaler = MinMaxScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)

In [29]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8)

In [30]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression

In [31]:
lr = LogisticRegression(class_weight="balanced")
lr.fit(X_train, y_train)
prediction = lr.predict(X_test)
accuracy_score(y_test, prediction)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.810379241516966

In [32]:
print(classification_report(y_test, prediction))

              precision    recall  f1-score   support

           0       0.94      0.81      0.87      4971
           1       0.57      0.83      0.67      1542

    accuracy                           0.81      6513
   macro avg       0.75      0.82      0.77      6513
weighted avg       0.85      0.81      0.82      6513



In [47]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier
rf_classifier = RandomForestClassifier(n_estimators = 20, criterion = 'entropy' ,random_state = 51)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.8518347919545525

In [34]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.91      4971
           1       0.73      0.60      0.66      1542

    accuracy                           0.85      6513
   macro avg       0.80      0.77      0.78      6513
weighted avg       0.85      0.85      0.85      6513



# Hyperparameter tuning

In [58]:
params={'max_depth':[3,5,10,None],
              'n_estimators':[10,100,200,300,400,500],
              'max_features':[4,2,6],
               'criterion':['gini','entropy'],
               'bootstrap':[True,False],
               'min_samples_leaf': [5,2,3,4],
              }

In [59]:
# Randomized Search
from sklearn.model_selection import RandomizedSearchCV
random_search = RandomizedSearchCV(rf_classifier, param_distributions=params, scoring= 'roc_auc', n_jobs= -1, verbose= 3 ,random_state = 23)
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


RandomizedSearchCV(estimator=RandomForestClassifier(class_weight='balanced_subsample',
                                                    max_features=4,
                                                    min_samples_leaf=4,
                                                    n_estimators=300,
                                                    random_state=51),
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'criterion': ['gini', 'entropy'],
                                        'max_depth': [3, 5, 10, None],
                                        'max_features': [4, 2, 6],
                                        'min_samples_leaf': [5, 2, 3, 4],
                                        'n_estimators': [10, 100, 200, 300, 400,
                                                         500]},
                   random_state=23, scoring='roc_auc', verbose=3)

In [60]:
random_search.best_params_

{'n_estimators': 300,
 'min_samples_leaf': 4,
 'max_features': 4,
 'max_depth': None,
 'criterion': 'gini',
 'bootstrap': True}

In [62]:
random_search.best_estimator_

RandomForestClassifier(class_weight='balanced_subsample', max_features=4,
                       min_samples_leaf=4, n_estimators=300, random_state=51)

In [61]:
rf_classifier = RandomForestClassifier(bootstrap=True, class_weight='balanced_subsample',
                       criterion='gini', max_features=4,
                       min_samples_leaf=4, n_estimators=300, random_state=51)
rf_classifier.fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)
accuracy_score(y_test, y_pred)

0.7950253339474896

In [53]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      0.77      0.85      4971
           1       0.54      0.87      0.67      1542

    accuracy                           0.80      6513
   macro avg       0.75      0.82      0.76      6513
weighted avg       0.85      0.80      0.81      6513

