In [1]:
import pandas as pd 
import numpy as np 

In [2]:
df = pd.read_csv('adult.csv')

In [3]:
df.head(3)

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [5]:
df.select_dtypes(include = ['int64' , 'float64']).columns

Index(['age', 'fnlwgt', 'educational-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

In [6]:
df.select_dtypes(include = ['object']).columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country', 'income'],
      dtype='object')

## Feature Identification

Before preprocessing, features were classified into numerical and categorical
based on their data types and nature.

### Numerical Features
- age
- fnlwgt
- education-num
- capital-gain
- capital-loss
- hours-per-week

### Categorical Features
- workclass
- education
- marital-status
- occupation
- relationship
- race
- sex
- native-country

### Target Variable
- income


In [7]:
encod_col = df[['education']]

In [8]:
from sklearn.preprocessing import OrdinalEncoder
encoded = OrdinalEncoder()
encoded_col = encoded.fit_transform(encod_col)

In [9]:
encoded_col = pd.DataFrame(encoded_col)

In [10]:
encoded_col.columns = columns = ['education_encoded']

In [11]:
encoded_col

Unnamed: 0,education_encoded
0,1.0
1,11.0
2,7.0
3,15.0
4,15.0
...,...
48837,7.0
48838,11.0
48839,11.0
48840,11.0


In [12]:
data =  pd.concat([df , encoded_col],axis = 1)
data.sample()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income,education_encoded
48772,33,Private,63079,HS-grad,9,Divorced,Adm-clerical,Unmarried,Black,Female,0,0,40,United-States,<=50K,11.0


In [13]:
oh_encoding = df[['workclass','marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country']]

In [14]:
from sklearn.preprocessing import OneHotEncoder
one_hot_encoder  = OneHotEncoder()
encoding_cat = one_hot_encoder.fit_transform(oh_encoding)

In [15]:
encoding_cat.toarray()

array([[0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 1., ..., 1., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.]])

In [16]:
one_hot_encoder.categories_

[array(['?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private',
        'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay'],
       dtype=object),
 array(['Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed'],
       dtype=object),
 array(['?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair',
        'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners',
        'Machine-op-inspct', 'Other-service', 'Priv-house-serv',
        'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support',
        'Transport-moving'], dtype=object),
 array(['Husband', 'Not-in-family', 'Other-relative', 'Own-child',
        'Unmarried', 'Wife'], dtype=object),
 array(['Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
        'White'], dtype=object),
 array(['Female', 'Male'], dtype=object),
 array(['?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba',
        'Dominican-Republic', 'Ecuador', 'El-Salv

In [17]:
encoded_data = pd.DataFrame(encoding_cat.toarray() ,columns = ['?', 'Federal-gov', 'Local-gov', 'Never-worked', 'Private',
        'Self-emp-inc', 'Self-emp-not-inc', 'State-gov', 'Without-pay','Divorced', 'Married-AF-spouse', 'Married-civ-spouse',
        'Married-spouse-absent', 'Never-married', 'Separated', 'Widowed','?', 'Adm-clerical', 'Armed-Forces', 'Craft-repair',
        'Exec-managerial', 'Farming-fishing', 'Handlers-cleaners',
        'Machine-op-inspct', 'Other-service', 'Priv-house-serv',
        'Prof-specialty', 'Protective-serv', 'Sales', 'Tech-support',
        'Transport-moving','Husband', 'Not-in-family', 'Other-relative', 'Own-child',
        'Unmarried', 'Wife','Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other',
        'White','Female', 'Male','?', 'Cambodia', 'Canada', 'China', 'Columbia', 'Cuba',
        'Dominican-Republic', 'Ecuador', 'El-Salvador', 'England',
        'France', 'Germany', 'Greece', 'Guatemala', 'Haiti',
        'Holand-Netherlands', 'Honduras', 'Hong', 'Hungary', 'India',
        'Iran', 'Ireland', 'Italy', 'Jamaica', 'Japan', 'Laos', 'Mexico',
        'Nicaragua', 'Outlying-US(Guam-USVI-etc)', 'Peru', 'Philippines',
        'Poland', 'Portugal', 'Puerto-Rico', 'Scotland', 'South', 'Taiwan',
        'Thailand', 'Trinadad&Tobago', 'United-States', 'Vietnam',
        'Yugoslavia'] , index = df.index )

In [18]:
encoded_data

Unnamed: 0,?,Federal-gov,Local-gov,Never-worked,Private,Self-emp-inc,Self-emp-not-inc,State-gov,Without-pay,Divorced,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [19]:
oh_data = pd.concat([data,encoded_data],axis = 1)

In [20]:
oh_data.sample()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
42802,28,Private,191935,Assoc-acdm,12,Never-married,Sales,Own-child,White,Male,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [21]:
data_for_model = oh_data.drop(['workclass','marital-status', 'occupation',
       'relationship', 'race', 'gender', 'native-country','education'],axis = 1)

In [22]:
data_for_model

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,education_encoded,?,Federal-gov,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,25,226802,7,0,0,40,<=50K,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,38,89814,9,0,0,50,<=50K,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,28,336951,12,0,0,40,>50K,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,44,160323,10,7688,0,40,>50K,15.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,18,103497,10,0,0,30,<=50K,15.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,27,257302,12,0,0,38,<=50K,7.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,40,154374,9,0,0,40,>50K,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,58,151910,9,0,0,40,<=50K,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,22,201490,9,0,0,20,<=50K,11.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [23]:
x = data_for_model.drop('income',axis = 1)
y = data_for_model['income']

In [24]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 42)

In [25]:
from sklearn.linear_model import LogisticRegression # model trained befor scaling 
model = LogisticRegression()
model.fit(x_train,y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
pred = model.predict(x_test)

In [27]:
from sklearn.metrics import accuracy_score

In [28]:
acr = accuracy_score(y_test,pred)

In [29]:
acr

0.8042788412324701

In [30]:
data_for_model.head(1)

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,income,education_encoded,?,Federal-gov,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,25,226802,7,0,0,40,<=50K,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
st_scale = data_for_model[['age','fnlwgt','educational-num','capital-gain','capital-loss','hours-per-week','education_encoded']]
st_scale


Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,education_encoded
0,25,226802,7,0,0,40,1.0
1,38,89814,9,0,0,50,11.0
2,28,336951,12,0,0,40,7.0
3,44,160323,10,7688,0,40,15.0
4,18,103497,10,0,0,30,15.0
...,...,...,...,...,...,...,...
48837,27,257302,12,0,0,38,7.0
48838,40,154374,9,0,0,40,11.0
48839,58,151910,9,0,0,40,11.0
48840,22,201490,9,0,0,20,11.0


In [32]:
from sklearn.preprocessing import StandardScaler
scaled = StandardScaler()
x = scaled.fit_transform(st_scale)

In [33]:
df_scaled = pd.DataFrame(x , columns = st_scale.columns ,index = st_scale.index)

In [34]:
df_scaled

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,education_encoded
0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,-2.397350
1,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.772930,0.183660
2,-0.776316,1.394723,0.747550,-0.144804,-0.217127,-0.034087,-0.848744
3,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,1.216063
4,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,1.216063
...,...,...,...,...,...,...,...
48837,-0.849254,0.640492,0.747550,-0.144804,-0.217127,-0.195490,-0.848744
48838,0.098933,-0.334178,-0.419335,-0.144804,-0.217127,-0.034087,0.183660
48839,1.411808,-0.357510,-0.419335,-0.144804,-0.217127,-0.034087,0.183660
48840,-1.213941,0.111984,-0.419335,-0.144804,-0.217127,-1.648120,0.183660


In [35]:
scaled_data = pd.concat([df_scaled,encoded_data] , axis = 1)

In [36]:
scaled_data

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,education_encoded,?,Federal-gov,Local-gov,...,Portugal,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia
0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,-2.397350,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.772930,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,-0.776316,1.394723,0.747550,-0.144804,-0.217127,-0.034087,-0.848744,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,1.216063,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,1.216063,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.849254,0.640492,0.747550,-0.144804,-0.217127,-0.195490,-0.848744,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48838,0.098933,-0.334178,-0.419335,-0.144804,-0.217127,-0.034087,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48839,1.411808,-0.357510,-0.419335,-0.144804,-0.217127,-0.034087,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
48840,-1.213941,0.111984,-0.419335,-0.144804,-0.217127,-1.648120,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [37]:
target_encoding = df['income']

In [38]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(target_encoding)

In [39]:
y

array([0, 0, 1, ..., 0, 0, 1])

In [40]:
tr_col = pd.Series(y , name = 'income' ,index = df.index)

In [41]:
tr_col = pd.DataFrame(y , columns = ['income'] , index = df.index)

In [42]:
tr_col

Unnamed: 0,income
0,0
1,0
2,1
3,1
4,0
...,...
48837,0
48838,1
48839,0
48840,0


In [43]:
prepared_data = pd.concat([scaled_data,tr_col],axis = 1)

In [44]:
prepared_data

Unnamed: 0,age,fnlwgt,educational-num,capital-gain,capital-loss,hours-per-week,education_encoded,?,Federal-gov,Local-gov,...,Puerto-Rico,Scotland,South,Taiwan,Thailand,Trinadad&Tobago,United-States,Vietnam,Yugoslavia,income
0,-0.995129,0.351675,-1.197259,-0.144804,-0.217127,-0.034087,-2.397350,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
1,-0.046942,-0.945524,-0.419335,-0.144804,-0.217127,0.772930,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
2,-0.776316,1.394723,0.747550,-0.144804,-0.217127,-0.034087,-0.848744,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
3,0.390683,-0.277844,-0.030373,0.886874,-0.217127,-0.034087,1.216063,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
4,-1.505691,-0.815954,-0.030373,-0.144804,-0.217127,-0.841104,1.216063,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,-0.849254,0.640492,0.747550,-0.144804,-0.217127,-0.195490,-0.848744,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
48838,0.098933,-0.334178,-0.419335,-0.144804,-0.217127,-0.034087,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1
48839,1.411808,-0.357510,-0.419335,-0.144804,-0.217127,-0.034087,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0
48840,-1.213941,0.111984,-0.419335,-0.144804,-0.217127,-1.648120,0.183660,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0


In [45]:
x = prepared_data.drop('income', axis = 1)
y = prepared_data['income']

In [46]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2, random_state = 42)

In [47]:
from sklearn.linear_model import LogisticRegression  # model trained after scaling
model = LogisticRegression()
model.fit(x_train,y_train)

In [48]:
pred = model.predict(x_test)

In [49]:
pred

array([0, 0, 1, ..., 1, 0, 1])

In [50]:
acr = accuracy_score(y_test,pred)

In [51]:
acr

0.8563824342307299

### Impact of Scaling 

- Logistic Regression uses gradient descent

- Features with large ranges dominate without scaling

### Scaling:

Improves convergence speed

Stabilizes coefficients

Improves accuracy

Tree-based models do not require scaling

In [52]:
processed_data = prepared_data.to_csv("adult_income_Processed.csv")

In [53]:
processed_data