### Census Income

Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset.

#### Import Files

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#### Read Data 

In [2]:
datac = pd.read_csv("adult.csv")

In [3]:
names = ['Age',
         'Workclass',
         'fnlwgt',
         'Education',
         "Education-num",
         'Marital-Status',
         'Occupation',
         'Relationship',
         'Race',
         'Sex',
         'Capital-gain',
         'Capital-loss',
         'Hours-per-week',
         'Native-country',
        'Income']
datac = pd.read_csv("adult.csv",names=names,na_values={
    'Workclass':[' ?'],
    'Occupation':[' ?'],
    'Native-country':[' ?']
})

In [4]:
datac = datac.dropna()

In [5]:
head = datac.head()
head

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


#### Exploratory Data Analysis

In [6]:
print("Instances:\t",datac.shape[0])
print("\nAttributes:\t",datac.shape[1])
print("\nNumber of null values:\n",datac.isnull().sum())
print("\nNumber of unique values\n",datac.nunique())
print("\nNumber of Predicted values\n\n",datac['Income'].value_counts())

Instances:	 30162

Attributes:	 15

Number of null values:
 Age               0
Workclass         0
fnlwgt            0
Education         0
Education-num     0
Marital-Status    0
Occupation        0
Relationship      0
Race              0
Sex               0
Capital-gain      0
Capital-loss      0
Hours-per-week    0
Native-country    0
Income            0
dtype: int64

Number of unique values
 Age                  72
Workclass             7
fnlwgt            20263
Education            16
Education-num        16
Marital-Status        7
Occupation           14
Relationship          6
Race                  5
Sex                   2
Capital-gain        118
Capital-loss         90
Hours-per-week       94
Native-country       41
Income                2
dtype: int64

Number of Predicted values

  <=50K    22654
 >50K      7508
Name: Income, dtype: int64


In [7]:
des = datac.describe(include = 'all')
des

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
count,30162.0,30162,30162.0,30162,30162.0,30162,30162,30162,30162,30162,30162.0,30162.0,30162.0,30162,30162
unique,,7,,16,,7,14,6,5,2,,,,41,2
top,,Private,,HS-grad,,Married-civ-spouse,Prof-specialty,Husband,White,Male,,,,United-States,<=50K
freq,,22286,,9840,,14065,4038,12463,25933,20380,,,,27504,22654
mean,38.437902,,189793.8,,10.121312,,,,,,1092.007858,88.372489,40.931238,,
std,13.134665,,105653.0,,2.549995,,,,,,7406.346497,404.29837,11.979984,,
min,17.0,,13769.0,,1.0,,,,,,0.0,0.0,1.0,,
25%,28.0,,117627.2,,9.0,,,,,,0.0,0.0,40.0,,
50%,37.0,,178425.0,,10.0,,,,,,0.0,0.0,40.0,,
75%,47.0,,237628.5,,13.0,,,,,,0.0,0.0,45.0,,


In [8]:
from sklearn.preprocessing import LabelEncoder

f_datac = datac.apply(LabelEncoder().fit_transform)
f_datac

Unnamed: 0,Age,Workclass,fnlwgt,Education,Education-num,Marital-Status,Occupation,Relationship,Race,Sex,Capital-gain,Capital-loss,Hours-per-week,Native-country,Income
0,22,5,2491,9,12,4,0,1,4,1,24,0,39,38,0
1,33,4,2727,9,12,2,3,0,4,1,0,0,12,38,0
2,21,2,13188,11,8,0,5,1,4,1,0,0,39,38,0
3,36,2,14354,1,6,2,5,0,2,1,0,0,39,38,0
4,11,2,18120,9,12,2,9,5,2,0,0,0,39,4,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,10,2,15471,7,11,2,12,5,4,0,0,0,37,38,0
32557,23,2,7555,11,8,2,6,0,4,1,0,0,39,38,1
32558,41,2,7377,11,8,6,0,4,4,0,0,0,39,38,0
32559,5,2,12060,11,8,4,0,3,4,1,0,0,19,38,0


#### Splitting the data

In [9]:
x = f_datac.drop(columns=['Income'])
y = f_datac['Income']

In [10]:
x.shape

(30162, 14)

In [11]:
y.shape

(30162,)

#### Feature selection -- Univariate Selection

In [13]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

In [14]:
#apply SelectKBest class to extract top 10 best features
bestfeatures = SelectKBest(score_func=chi2, k=10)
fit = bestfeatures.fit(x,y)
dfscores = pd.DataFrame(fit.scores_)
dfcolumns = pd.DataFrame(x.columns)

In [15]:
#concat two dataframes for better visualization
featureScores = pd.concat([dfcolumns,dfscores],axis=1)
featureScores.columns = ['Specs','Score'] #naming the dataframe columns
print(featureScores.nlargest(10,'Score')) #print 10 best features

             Specs          Score
10    Capital-gain  283956.499315
11    Capital-loss   38445.074339
0              Age   14218.766333
2           fnlwgt    9795.523872
12  Hours-per-week    5658.160509
7     Relationship    3435.488060
4    Education-num    2417.121182
5   Marital-Status     982.382620
9              Sex     459.346270
3        Education     264.652350


In [16]:
new_x = f_datac.drop(columns=['Education','Native-country','Race', 'Sex'])
new_y = f_datac['Income']

#### Standardize the data

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

x_train,x_test,y_train,y_test = train_test_split(new_x,new_y,test_size = 0.33)

In [18]:
stdsclr = StandardScaler()
x_train_std = stdsclr.fit_transform(x_train)
x_test_std = stdsclr.fit_transform(x_test)

In [19]:
x_train

Unnamed: 0,Age,Workclass,fnlwgt,Education-num,Marital-Status,Occupation,Relationship,Capital-gain,Capital-loss,Hours-per-week,Income
16206,13,2,14171,9,2,2,0,0,0,39,1
30642,21,4,2103,12,2,4,0,0,0,59,0
31669,34,2,13636,8,2,6,0,0,0,39,1
14714,12,2,2098,12,2,9,0,0,0,39,1
16378,16,2,10070,8,0,7,4,0,0,39,0
...,...,...,...,...,...,...,...,...,...,...,...
22689,9,2,12975,8,2,13,0,0,0,39,0
30050,2,2,9270,9,4,7,3,0,0,24,0
6832,31,2,17217,9,6,0,1,0,0,39,0
28951,14,2,4345,11,2,0,5,0,37,39,0


In [20]:
x_train_std

array([[-0.63753772, -0.21862106,  0.76573398, ..., -0.20896256,
        -0.0759011 ,  1.75006462],
       [-0.02851408,  1.86775177, -1.36308043, ..., -0.20896256,
         1.61303728, -0.57140747],
       [ 0.96114933, -0.21862106,  0.67135912, ..., -0.20896256,
        -0.0759011 ,  1.75006462],
       ...,
       [ 0.73276547, -0.21862106,  1.30305322, ..., -0.20896256,
        -0.0759011 , -0.57140747],
       [-0.56140976, -0.21862106, -0.96758806, ...,  3.43725609,
        -0.0759011 , -0.57140747],
       [-1.32268931, -0.21862106, -1.39659673, ..., -0.20896256,
         2.37305955, -0.57140747]])

In [21]:
x_test_std

array([[-0.19531734, -0.18911783, -0.92020818, ..., -0.2019794 ,
        -0.07019013, -0.58438809],
       [-1.49276612, -0.18911783,  1.70619963, ..., -0.2019794 ,
        -0.75819858, -0.58438809],
       [-0.19531734, -1.24847569,  0.87290325, ..., -0.2019794 ,
        -0.07019013, -0.58438809],
       ...,
       [ 0.26260576, -0.18911783,  1.11096276, ..., -0.2019794 ,
         3.88585846, -0.58438809],
       [ 1.94165712,  2.98895574, -1.03421595, ..., -0.2019794 ,
         2.07983628, -0.58438809],
       [-0.95852251, -0.18911783, -1.65200768, ..., -0.2019794 ,
         0.78982043, -0.58438809]])

In [22]:
y_test.shape

(9954,)

In [23]:
x_train.shape

(20208, 11)

#### Calculating Accuracy

In [24]:
from sklearn.svm import SVC

In [25]:
models = SVC()
models.fit(x_train_std,y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='rbf', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [26]:
models.score(x_test_std,y_test)

1.0

###### Classification report

In [30]:
y_predicted = models.predict(x_test_std)
y_predicted

array([0, 0, 0, ..., 0, 0, 0])

In [31]:
report = classification_report(y_test,y_predicted)
print(report)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      7420
           1       1.00      1.00      1.00      2534

    accuracy                           1.00      9954
   macro avg       1.00      1.00      1.00      9954
weighted avg       1.00      1.00      1.00      9954



###### Ensemble method

In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [28]:
modelr = RandomForestClassifier(n_estimators=100,max_features=10)
cross_val1 = cross_val_score(modelr,x_test_std,y_test,cv=KFold(n_splits=10,random_state=7))
print(cross_val1.mean())

1.0
