In [51]:
# Harshal M Pohekar

# Income Category Prediction 

In [52]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
import warnings
warnings.filterwarnings('ignore')

In [53]:
# Reading dataset
data = pd.read_csv(r"D:\Downloads\Videos\Krish\Datasets\Income Adult\adult.csv")

In [54]:
# First 5 rows of data
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [55]:
# Data size
data.shape

(48842, 15)

In [56]:
# Data types and null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        48842 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       48842 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   48842 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [57]:
# Encoding to nan
data[data == '?'] = np.nan

In [58]:
# Data types and null counts
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              48842 non-null  int64 
 1   workclass        46043 non-null  object
 2   fnlwgt           48842 non-null  int64 
 3   education        48842 non-null  object
 4   educational-num  48842 non-null  int64 
 5   marital-status   48842 non-null  object
 6   occupation       46033 non-null  object
 7   relationship     48842 non-null  object
 8   race             48842 non-null  object
 9   gender           48842 non-null  object
 10  capital-gain     48842 non-null  int64 
 11  capital-loss     48842 non-null  int64 
 12  hours-per-week   48842 non-null  int64 
 13  native-country   47985 non-null  object
 14  income           48842 non-null  object
dtypes: int64(6), object(9)
memory usage: 5.6+ MB


In [59]:
# Imputing missing values with mode
for col in ['workclass', 'occupation', 'native-country']:
    data[col].fillna(data[col].mode()[0], inplace=True)

In [60]:
data.isnull().sum()

age                0
workclass          0
fnlwgt             0
education          0
educational-num    0
marital-status     0
occupation         0
relationship       0
race               0
gender             0
capital-gain       0
capital-loss       0
hours-per-week     0
native-country     0
income             0
dtype: int64

In [61]:
# Target Variables
x = data.drop('income', axis=1)

y = data['income']

In [62]:
# Splitting data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 0)

In [63]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,Private,103497,Some-college,10,Never-married,Prof-specialty,Own-child,White,Female,0,0,30,United-States,<=50K


In [64]:
# Preprocessing
from sklearn import preprocessing

# Label Encoding
categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
for feature in categorical:
    le = preprocessing.LabelEncoder()
    x_train[feature] = le.fit_transform(x_train[feature])
    x_test[feature] = le.fit_transform(x_test[feature])

In [65]:
# Standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

x_train = pd.DataFrame(scaler.fit_transform(x_train), columns = x.columns)
x_test = pd.DataFrame(scaler.fit_transform(x_test), columns = x.columns)

In [66]:
x_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country
0,-0.849978,-1.887643,-0.551219,1.212393,-0.027733,-0.406325,-1.554732,0.969833,0.390646,-1.425582,-0.144199,-0.216492,-0.034027,0.258084
1,0.241031,-0.094859,1.687545,-2.650223,-1.587187,-0.406325,-1.049322,0.969833,-0.797592,0.701468,-0.144199,-0.216492,0.207331,0.258084
2,-0.486308,1.697924,-1.434052,-0.590161,0.362131,-0.406325,-0.543912,-0.899325,0.390646,0.701468,0.850561,-0.216492,2.379553,0.258084
3,-0.195373,-0.094859,-0.384485,1.212393,-0.027733,0.92272,-0.796617,-0.276272,0.390646,-1.425582,-0.144199,-0.216492,1.575026,0.258084
4,-0.70451,-0.094859,1.608144,0.182362,-0.417596,1.587242,1.730434,1.592886,0.390646,-1.425582,-0.144199,-0.216492,-0.838553,0.258084


In [67]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

LR = LogisticRegression()
LR.fit(x_train, y_train)
y_pred = LR.predict(x_test)

print('Logistics Regression Accuracy Score with all features: {0:0.4f}', format(accuracy_score(y_test, y_pred)))

Logistics Regression Accuracy Score with all features: {0:0.4f} 0.8220842148365523


In [68]:
# PCA
from sklearn.decomposition import PCA
pca = PCA()

x_train = pca.fit_transform(x_train)
pca.explained_variance_ratio_

array([0.14740223, 0.10130193, 0.08096753, 0.07933632, 0.07433976,
       0.07314763, 0.07066221, 0.06753572, 0.06516078, 0.06093536,
       0.06003764, 0.04864317, 0.04289137, 0.02763835])

In [69]:
# Target Variables
x = data.drop('income', axis=1)

y = data['income']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.30, random_state = 0)

# Label Encoding
categorical = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'gender', 'native-country']
for feature in categorical:
    le = preprocessing.LabelEncoder()
    x_train[feature] = le.fit_transform(x_train[feature])
    x_test[feature] = le.fit_transform(x_test[feature])
    
x_train = pd.DataFrame(scaler.fit_transform(x_train), columns = x.columns)

pca = PCA()
pca.fit(x_train)
cumsum = np.cumsum(pca.explained_variance_ratio_)
dim = np.argmax(cumsum >= 0.90) + 1
print('The number of dimensions required to preserve 90% variance is: ', dim)

The number of dimensions required to preserve 90% variance is:  12
