In [1]:
import pandas as pd

In [2]:
train_data = pd.read_csv("Income_Train.csv")

In [3]:
train_data.shape

(209499, 43)

In [4]:
train_data.isnull().sum()

ID                                     0
age                                    0
gender                                 0
education                              0
class                             105245
education_institute               196197
marital_status                         0
race                                   0
is_hispanic                            0
employment_commitment                  0
unemployment_reason               202979
employment_stat                        0
wage_per_hour                          0
is_labor_union                    189420
working_week_per_year                  0
industry_code                          0
industry_code_main                     0
occupation_code                        0
occupation_code_main              105694
total_employed                         0
household_stat                         0
household_summary                      0
under_18_family                   151654
veterans_admin_questionnaire      207415
vet_benefit     

In [5]:
train_data1 = train_data.drop(["ID","education_institute","employment_commitment","household_stat","unemployment_reason","is_labor_union","under_18_family","veterans_admin_questionnaire","old_residence_state","old_residence_reg","migration_prev_sunbelt","migration_code_change_in_msa","migration_code_move_within_reg","occupation_code_main","residence_1_year_ago","country_of_birth_own","country_of_birth_father","country_of_birth_mother","migration_code_change_in_reg"], axis=1)

In [6]:
train_data1.shape

(209499, 24)

In [7]:
pd.set_option("display.max_columns", None)

In [8]:
obj = train_data1.select_dtypes(include= 'object').columns
obj

Index(['gender', 'education', 'class', 'marital_status', 'race', 'is_hispanic',
       'industry_code_main', 'household_summary', 'tax_status', 'citizenship',
       'income_above_limit'],
      dtype='object')

In [9]:
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()

for i in obj:
    train_data1[i]=l.fit_transform(train_data1[i].astype(str))

train_data1.head(2)

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_stat,wage_per_hour,working_week_per_year,industry_code,industry_code_main,occupation_code,total_employed,household_summary,vet_benefit,tax_status,gains,losses,stocks_status,citizenship,mig_year,importance_of_record,income_above_limit
0,79,0,12,8,6,4,0,0,0,52,0,14,0,2,4,2,0,0,0,292,4,95,1779.74,1
1,65,0,12,8,6,4,0,0,0,0,0,14,0,0,4,2,5,0,0,0,4,94,2366.75,1


In [10]:
train_data1["income_above_limit"] = train_data1["income_above_limit"].map({1:0,0:1})

In [11]:
train_data1.head(2)

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_stat,wage_per_hour,working_week_per_year,industry_code,industry_code_main,occupation_code,total_employed,household_summary,vet_benefit,tax_status,gains,losses,stocks_status,citizenship,mig_year,importance_of_record,income_above_limit
0,79,0,12,8,6,4,0,0,0,52,0,14,0,2,4,2,0,0,0,292,4,95,1779.74,0
1,65,0,12,8,6,4,0,0,0,0,0,14,0,0,4,2,5,0,0,0,4,94,2366.75,0


### Model Building:

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from xgboost import XGBClassifier

In [13]:
X = train_data1.drop(["income_above_limit"], axis=1)
y = train_data1["income_above_limit"]

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(167599, 23) (41900, 23) (167599,) (41900,)


In [15]:
from imblearn.under_sampling import RandomUnderSampler
us = RandomUnderSampler(0.1, random_state=0)
X_train_us, y_train_us = us.fit_resample(X_train, y_train)



In [16]:
print(y_train.value_counts())
print(y_train_us.value_counts())

0    157201
1     10398
Name: income_above_limit, dtype: int64
0    103980
1     10398
Name: income_above_limit, dtype: int64


In [17]:
xg_model = XGBClassifier(
                         learning_rate =0.045, 
                         n_estimators=1000, 
                         max_depth=5, 
                         objective= 'binary:logistic',
                         
                         min_child_weight=1, 
                         scale_pos_weight=1.65,
                         gamma=0.2, 
                         subsample=0.9,
                         colsample_bytree=0.3, 
                           
                         nthread=8,
                         seed=20
                        )

xg_model.fit(X_train_us, y_train_us)

y_pred = xg_model.predict(X_test)

cr_xg = classification_report(y_pred, y_test)
print(cr_xg)

print(pd.crosstab(y_pred, y_test))

              precision    recall  f1-score   support

           0       0.97      0.98      0.97     39149
           1       0.65      0.61      0.63      2751

    accuracy                           0.95     41900
   macro avg       0.81      0.79      0.80     41900
weighted avg       0.95      0.95      0.95     41900

income_above_limit      0     1
row_0                          
0                   38231   918
1                    1069  1682


### Test data:

In [18]:
test_data = pd.read_csv("Income_Test.csv")
test_data.shape

(89786, 42)

In [19]:
test_data1 = test_data.drop(["ID","education_institute","employment_commitment","household_stat","unemployment_reason","is_labor_union","under_18_family","veterans_admin_questionnaire","old_residence_state","old_residence_reg","migration_prev_sunbelt","migration_code_change_in_msa","migration_code_move_within_reg","occupation_code_main","residence_1_year_ago","country_of_birth_own","country_of_birth_father","country_of_birth_mother","migration_code_change_in_reg"], axis=1)

In [20]:
from sklearn.preprocessing import LabelEncoder
l = LabelEncoder()

for i in obj[:len(obj)-1]:
    test_data1[i]=l.fit_transform(test_data1[i].astype(str))

test_data1.head(2)

Unnamed: 0,age,gender,education,class,marital_status,race,is_hispanic,employment_stat,wage_per_hour,working_week_per_year,industry_code,industry_code_main,occupation_code,total_employed,household_summary,vet_benefit,tax_status,gains,losses,stocks_status,citizenship,mig_year,importance_of_record
0,54,1,12,3,2,4,0,0,600,46,29,21,38,2,4,2,2,0,0,0,4,94,3388.96
1,53,1,4,3,2,4,1,0,0,52,5,10,37,4,4,2,2,0,0,0,0,95,1177.55


In [21]:
test_pred = xg_model.predict(test_data1)

In [22]:
df = pd.read_csv('Income_SampleSubmission.csv')
df['income_above_limit'] = test_pred
df.to_csv('Income_test_predictions.csv', index=False)
df['income_above_limit'].value_counts(normalize=True)

0    0.934221
1    0.065779
Name: income_above_limit, dtype: float64