In [1]:
%reset -fs
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%config InlineBackend.figure_formats = ['svg']  
%matplotlib inline
np.set_printoptions(suppress=True) # Suppress scientific notation where possible
sns.set(context='notebook', style='whitegrid', font_scale=1.2)
pd.set_option('max_columns', 100)
pd.set_option('max_rows', 100)

In [2]:
df = pd.read_csv('income_evaluation.csv', na_values=' ?')

In [3]:
df.head(50)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
5,37,Private,284582,Masters,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,0,0,40,United-States,<=50K
6,49,Private,160187,9th,5,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0,0,16,Jamaica,<=50K
7,52,Self-emp-not-inc,209642,HS-grad,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,45,United-States,>50K
8,31,Private,45781,Masters,14,Never-married,Prof-specialty,Not-in-family,White,Female,14084,0,50,United-States,>50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K


In [4]:
df.isnull().sum()

age                   0
 workclass         1836
 fnlwgt               0
 education            0
 education-num        0
 marital-status       0
 occupation        1843
 relationship         0
 race                 0
 sex                  0
 capital-gain         0
 capital-loss         0
 hours-per-week       0
 native-country     583
 income               0
dtype: int64

In [5]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [6]:
df.columns = [column.strip() for column in df.columns] #reformat

In [7]:
df['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', nan, ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [8]:
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x) #trim space of every cell

In [9]:
df = df.drop(['capital-gain', 'capital-loss','education'], axis=1)

In [10]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,13,United-States,<=50K
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
4,28,Private,338409,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,40,Cuba,<=50K


In [11]:
# df.replace(np.NaN, 999, inplace=True) #change NaN values to ints
# df.head(50)

In [12]:
df.shape
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             32561 non-null  int64 
 1   workclass       30725 non-null  object
 2   fnlwgt          32561 non-null  int64 
 3   education-num   32561 non-null  int64 
 4   marital-status  32561 non-null  object
 5   occupation      30718 non-null  object
 6   relationship    32561 non-null  object
 7   race            32561 non-null  object
 8   sex             32561 non-null  object
 9   hours-per-week  32561 non-null  int64 
 10  native-country  31978 non-null  object
 11  income          32561 non-null  object
dtypes: int64(4), object(8)
memory usage: 3.0+ MB


In [13]:
df = df[(df['hours-per-week'] >= 40) & (df['native-country'] == 'United-States')] #filter make country US and hour-per week >=40

In [14]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,native-country,income
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
5,37,Private,284582,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,United-States,<=50K
7,52,Self-emp-not-inc,209642,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K


In [15]:
df.isnull().sum()

age                 0
workclass         793
fnlwgt              0
education-num       0
marital-status      0
occupation        796
relationship        0
race                0
sex                 0
hours-per-week      0
native-country      0
income              0
dtype: int64

In [16]:
# Drop Nulls 
print("prior shape: ", df.shape)
df.dropna(how='any', inplace=True)
print("post shape: ", df.shape)

prior shape:  (22174, 12)
post shape:  (21378, 12)


In [17]:
#running a loop of value_counts of each column to find out unique values. 
for c in df.columns:
    print ("---- %s ---" % c)
    print (df[c].value_counts())

---- age ---
36    676
35    675
31    659
34    650
33    641
37    641
38    637
39    635
32    621
30    612
28    610
41    607
42    595
29    595
40    595
27    594
43    593
46    564
44    564
25    553
45    548
26    544
47    534
24    497
23    474
51    468
50    464
49    450
48    402
52    358
22    355
53    337
54    302
21    299
55    295
56    270
20    267
59    252
58    251
57    247
60    214
19    190
61    181
62    138
63    117
64     96
18     86
65     71
66     52
67     50
68     42
69     33
17     27
70     25
71     22
90     22
73     15
72     14
75     10
74      8
76      6
77      6
79      5
78      4
83      3
88      3
80      2
84      1
85      1
86      1
82      1
81      1
Name: age, dtype: int64
---- workclass ---
Private             15530
Self-emp-not-inc     1727
Local-gov            1562
Self-emp-inc          873
State-gov             872
Federal-gov           808
Without-pay             6
Name: workclass, dtype: int64
---- fnlwgt 

In [18]:
# df.to_csv('income_evaluation_EDA.csv')

In [19]:
df.rename(columns={'native-country': 'country','hours-per-week': 'hours per week','marital-status': 'marital status', 'education-num': 'education num'}, inplace=True)

In [20]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education num,marital status,occupation,relationship,race,sex,hours per week,country,income
0,39,State-gov,77516,13,Never-married,Adm-clerical,Not-in-family,White,Male,40,United-States,<=50K
2,38,Private,215646,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,40,United-States,<=50K
3,53,Private,234721,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,40,United-States,<=50K
5,37,Private,284582,14,Married-civ-spouse,Exec-managerial,Wife,White,Female,40,United-States,<=50K
7,52,Self-emp-not-inc,209642,9,Married-civ-spouse,Exec-managerial,Husband,White,Male,45,United-States,>50K


In [21]:
#mapping the data into numerical data using map function
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})#.astype(int)

#gender
df['sex'] = df['sex'].map({'Male': 0, 'Female': 1})#.astype(int)

In [22]:
# create dummie variables
df = pd.get_dummies(df, dummy_na=True)
df.head()

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,workclass_nan,marital status_Divorced,marital status_Married-AF-spouse,marital status_Married-civ-spouse,marital status_Married-spouse-absent,marital status_Never-married,marital status_Separated,marital status_Widowed,marital status_nan,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,relationship_nan,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,race_nan,country_United-States,country_nan
0,39,77516,13,0,40,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
2,38,215646,9,0,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,1,0
3,53,234721,7,0,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0
5,37,284582,14,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0
7,52,209642,9,0,45,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0


In [23]:
df.shape

(21378, 52)

In [24]:
df = df.drop(['relationship_nan','race_nan','country_nan', 'marital status_nan'], axis=1) #delete column

In [25]:
df.head()

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,workclass_nan,marital status_Divorced,marital status_Married-AF-spouse,marital status_Married-civ-spouse,marital status_Married-spouse-absent,marital status_Never-married,marital status_Separated,marital status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,country_United-States
0,39,77516,13,0,40,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
2,38,215646,9,0,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
3,53,234721,7,0,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
5,37,284582,14,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
7,52,209642,9,0,45,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1


In [None]:
# df.to_csv('income_evaluation_dummies.csv')

In [26]:
df_X = df.copy()
df_X.drop(['income'], axis=1, inplace=True)
df_X.head()

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,workclass_nan,marital status_Divorced,marital status_Married-AF-spouse,marital status_Married-civ-spouse,marital status_Married-spouse-absent,marital status_Never-married,marital status_Separated,marital status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,country_United-States
0,39,77516,13,0,40,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
2,38,215646,9,0,40,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
3,53,234721,7,0,40,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
5,37,284582,14,1,40,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
7,52,209642,9,0,45,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1


In [27]:
df.head()

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,income,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,workclass_nan,marital status_Divorced,marital status_Married-AF-spouse,marital status_Married-civ-spouse,marital status_Married-spouse-absent,marital status_Never-married,marital status_Separated,marital status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,country_United-States
0,39,77516,13,0,40,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
2,38,215646,9,0,40,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1
3,53,234721,7,0,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1
5,37,284582,14,1,40,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1
7,52,209642,9,0,45,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1


In [28]:
df_y = df.copy()
df_y = df.iloc[:, 5]
df_y.head()

0    0
2    0
3    0
5    0
7    1
Name: income, dtype: int64

In [29]:
# Train and test split, 20% for final testing
X, X_test, y, y_test = train_test_split(df_X, df_y, test_size=.2, random_state=10)
    
# Train and val split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=.25, random_state=3)

In [30]:
train_df = X_train.copy()
train_df['income'] = y_train
train_df.head()

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,workclass_nan,marital status_Divorced,marital status_Married-AF-spouse,marital status_Married-civ-spouse,marital status_Married-spouse-absent,marital status_Never-married,marital status_Separated,marital status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,country_United-States,income
12335,63,102412,9,1,40,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
17977,47,140219,9,1,40,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
21457,53,150980,10,0,40,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
13328,49,141058,10,0,50,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
12744,60,243829,14,1,50,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


In [31]:
pair_plot1 = train_df.copy()
pair_plot1 = pair_plot1.iloc[:, [0,1,2,3,4,-1]]

pair_plot1

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,income
12335,63,102412,9,1,40,0
17977,47,140219,9,1,40,0
21457,53,150980,10,0,40,0
13328,49,141058,10,0,50,0
12744,60,243829,14,1,50,0
...,...,...,...,...,...,...
16415,50,143664,13,0,40,0
25269,36,171968,9,0,40,0
8758,29,243660,9,0,50,1
235,59,268700,9,0,40,0


In [34]:
# sns.pairplot(pair_plot1, hue='income')
# plt.savefig('pair plot1.png', dpi=300);

In [36]:
# sns.boxplot(x='income', y='education num', data=pair_plot1)
# plt.savefig('boxplot income_education num.png', dpi=300);

In [37]:
lm1 = LogisticRegression(C=1000, max_iter=500) # setting C very high essentially removes regularization
lm1.fit(X_train[['education num']], y_train)

LogisticRegression(C=1000, max_iter=500)

In [38]:
print('Score on train:')
lm1.score(X_train[['education num']], y_train)

Score on train:


0.7303913924840169

In [39]:
lm1.fit(X_val[['education num']], y_val)
print('Score on val:')
lm1.score(X_val[['education num']], y_val)

Score on val:


0.725912067352666

In [44]:
# sns.boxplot(x='income', y='age', data=pair_plot1)
# plt.savefig('boxplot income_age.png', dpi=300);

In [46]:
# sns.boxplot(x='income', y='sex', data=pair_plot1)
# plt.savefig('boxplot income_sex.png', dpi=300);

In [47]:
lm1.fit(X_val[['education num','age']], y_val)
lm1.score(X_val[['education num','age']], y_val)

0.7453227315247896

In [48]:
lm1.fit(X_val[['education num','age','sex']], y_val)
print('Score w/ education num, age, sex:')
lm1.score(X_val[['education num','age','sex']], y_val)

Score w/ education num, age, sex:


0.7628624883068288

In [51]:
# sns.histplot(data=pair_plot1, x="hours per week", hue="income")
# plt.savefig('hist hours per week.png', dpi=300);

In [52]:
lm1.fit(X_val[['education num','age','sex','hours per week']], y_val)
print('Score w/ education num, age, sex, hours per week:')
lm1.score(X_val[['education num','age','sex','hours per week']], y_val) #don't use hours per week

Score w/ education num, age, sex, hours per week:


0.7619270346117867

In [53]:
pair_plot2 = train_df.copy()
pair_plot2 = pair_plot2.iloc[:,-7:]
pair_plot2 = pair_plot2.drop(['country_United-States'], axis=1)
pair_plot2

Unnamed: 0,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,income
12335,0,0,0,0,1,0
17977,0,0,0,0,1,0
21457,0,0,0,0,1,0
13328,0,0,0,0,1,0
12744,0,0,0,0,1,0
...,...,...,...,...,...,...
16415,0,0,0,0,1,0
25269,0,0,0,0,1,0
8758,0,0,0,0,1,1
235,0,0,0,0,1,0


In [55]:
# sns.pairplot(pair_plot2, hue='income')
# plt.savefig('pairplot2.png', dpi=300);

In [56]:
pair_plot2.head()

Unnamed: 0,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,income
12335,0,0,0,0,1,0
17977,0,0,0,0,1,0
21457,0,0,0,0,1,0
13328,0,0,0,0,1,0
12744,0,0,0,0,1,0


In [57]:
lm1.fit(X_val[['education num','age','sex','race_White','race_Amer-Indian-Eskimo','race_Asian-Pac-Islander','race_Black','race_Other']], y_val)
print('Score w/ all race:')
lm1.score(X_val[['education num','age','sex','race_White','race_Amer-Indian-Eskimo','race_Asian-Pac-Islander','race_Black','race_Other']], y_val)

Score w/ all race:


0.7616931711880262

In [58]:
train_df.head()

Unnamed: 0,age,fnlwgt,education num,sex,hours per week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,workclass_State-gov,workclass_Without-pay,workclass_nan,marital status_Divorced,marital status_Married-AF-spouse,marital status_Married-civ-spouse,marital status_Married-spouse-absent,marital status_Never-married,marital status_Separated,marital status_Widowed,occupation_Adm-clerical,occupation_Armed-Forces,occupation_Craft-repair,occupation_Exec-managerial,occupation_Farming-fishing,occupation_Handlers-cleaners,occupation_Machine-op-inspct,occupation_Other-service,occupation_Priv-house-serv,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,occupation_nan,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,country_United-States,income
12335,63,102412,9,1,40,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
17977,47,140219,9,1,40,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,0
21457,53,150980,10,0,40,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,1,0
13328,49,141058,10,0,50,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0
12744,60,243829,14,1,50,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0


In [59]:
pair_plot3 = train_df.copy()
pair_plot3 = pair_plot3.iloc[:,-13:]
pair_plot3 = pair_plot3.drop(['country_United-States','race_White','race_Amer-Indian-Eskimo','race_Asian-Pac-Islander','race_Black','race_Other'], axis=1)
pair_plot3

Unnamed: 0,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,income
12335,0,1,0,0,0,0,0
17977,0,0,1,0,0,0,0
21457,1,0,0,0,0,0,0
13328,0,1,0,0,0,0,0
12744,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...
16415,0,1,0,0,0,0,0
25269,1,0,0,0,0,0,0
8758,1,0,0,0,0,0,1
235,1,0,0,0,0,0,0


In [61]:
# sns.pairplot(pair_plot3, hue='income')
# plt.savefig('pairplot3.png', dpi=300);

In [64]:
# hist3 = pair_plot3.copy()
# hist3 = hist3.drop(['income'], axis=1)
# sns.histplot(data=hist3, multiple="stack")
# plt.savefig('hist income_relationship', dpi=300);