# Model Building For Deployment

In [1]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from pickle import dump

# Display all columns
pd.set_option('max_columns', None)

In [2]:
# Importing dataset
train_data = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data',header=None)

# Columns name we can find from data description 
col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status','occupation','relationship', 
              'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week', 'native_country', 'income']

train_data.columns = col_labels

In [3]:
# Making sure that the data does not contain unecessary spaces.
train_data=train_data.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

# Before any analysis, let's convert the target column into numerical classes.
train_data['income'] = train_data['income'].replace({'<=50K':0,'>50K':1})

In [4]:
# As the columns which have missing values, they are only categorical, we'll use the categorical imputer
from sklearn_pandas import CategoricalImputer
imputer = CategoricalImputer()

# Imputing the missing values
train_data['workclass'] = imputer.fit_transform(train_data['workclass'])
train_data['occupation'] = imputer.fit_transform(train_data['occupation'])
train_data['native_country'] = imputer.fit_transform(train_data['native_country'])

In [5]:
# Transforming Maritial Status column with value as either married or not married.
train_data = train_data.replace({'Married-civ-spouse':'married','Married-AF-spouse':'married','Married-spouse-absent':'married',
                    'Never-married':'not married','Divorced':'not married','Separated':'not married','Widowed':'not married'})

# Transforming native country column with value as either United States or Other.
countries = train_data['native_country'].unique()[1:]
train_data['native_country'] = train_data['native_country'].replace(to_replace=countries, value='Other')

In [6]:
# Droping those feature which we are not going to use
clean_data = train_data.drop(['workclass', 'fnlwgt', 'occupation', 'relationship', 'race', 'education_num'], axis=1)

In [7]:
# Creating dummies variables
dummie_data= pd.get_dummies(clean_data, drop_first=True)

In [8]:
# Declaring dependent and independent variable
X = dummie_data.drop(['income'],axis=1)
y = dummie_data['income']

In [9]:
# feature scaling
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X[['age', 'capital_gain', 'capital_loss', 'hours_per_week']])
input_scaled = scaler.transform(X[['age', 'capital_gain', 'capital_loss', 'hours_per_week']])
scaled_data = pd.DataFrame(input_scaled,columns=['age','hours_per_week','capital_gain','capital_loss'])
X_scaled = scaled_data.join(X.drop(['age', 'hours_per_week','capital_gain','capital_loss'],axis=1))

StandardScaler()

In [11]:
# Fixing imbalanced dataset by oversampling
from imblearn.over_sampling import RandomOverSampler
rd_over_sample = RandomOverSampler()
X_sampled, y_sampled = rd_over_sample.fit_sample(X_scaled,y)

In [12]:
# Split data into train and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X_sampled, y_sampled, test_size=0.2, random_state=0)

In [13]:
# Model Building
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=0,min_child_weight=3,max_depth= 15,
                    learning_rate = 0.1,gamma= 0.1,
                    colsample_bytree= 0.7)

# Training Model
xgb.fit(x_train.values, y_train.values)

# Mode summary
y_pred_xgb = xgb.predict(x_test.values)
y_pred_train_xgb = xgb.predict(x_train.values)

In [14]:
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
print("-----Test Data Accuracy----")
print('XGB Classifier:')
print('Accuracy score:',round(accuracy_score(y_test, y_pred_xgb), 4))
print('F1 score:',round(f1_score(y_test, y_pred_xgb), 4))

print("\n-----Train Data Accuracy----")
print('Accuracy score:',round(accuracy_score(y_train, y_pred_train_xgb), 4))
print('F1 score:',round(f1_score(y_train, y_pred_train_xgb), 4))

-----Test Data Accuracy----
XGB Classifier:
Accuracy score: 0.8572
F1 score: 0.8626

-----Train Data Accuracy----
Accuracy score: 0.8725
F1 score: 0.879


# Save Model & Data Scaler to reuse it again

In [91]:
'''
# Save Model and Data Scaler
from pickle import dump
from pickle import load

# save the model
dump(xgb, open('xgb_model.pkl', 'wb'))

# save the scaler
dump(scaler, open('scaler.pkl', 'wb'))
'''

# Load Model and Data Scaler

In [8]:
'''
import pickle
# load the model
model = pickle.load(open('xgb_model.pkl', 'rb'))
# load the scaler
scalers = pickle.load(open('scaler.pkl', 'rb'))
'''

# Test the Model by giving input

In [5]:
# Predictin function
def prediction(age, capital_gain, capital_loss, hours_per_week, education, sex, marital_status, country):
    age = int(age)
    hours = int(hours_per_week)
    gain = int(capital_gain)
    loss = int(capital_loss)
    sex = 1 if sex=='Male' else 0
    marital = 0 if marital_status=='Married' else 1
    country = 1 if country=='United States' else 0
    
    # if condition match only assighn '1' to that variable 
    _11th     = 1 if education=='_11th' else 0
    _12th     = 1 if education=='_12th' else 0
    _1st_4th  = 1 if education=='_1st_4th' else 0
    _5th_6th  = 1 if education=='_5th_6th' else 0
    _7th_8th  = 1 if education=='_7th_8th' else 0
    _9th      = 1 if education=='_9th' else 0
    _Assoc_acdm = 1 if education=='Assoc_acdm' else 0
    assoc_voc = 1 if education=='assoc_voc' else 0
    bachelors = 1 if education=='bachelors' else 0
    doctorate = 1 if education=='doctorate' else 0
    HS_grad = 1 if education=='HS_grad' else 0
    masters = 1 if education=='masters' else 0
    preschool = 1 if education=='preschool' else 0
    prof_school = 1 if education=='prof_school' else 0
    college = 1 if education=='college' else 0
    
    # load the model
    model = load(open('xgb_model.pkl', 'rb'))
    # load the scaler
    scaler = load(open('scaler.pkl', 'rb'))
        
    # feature scaling on age,capital_gain, capital_loss, hours per week
    scaled_value = scaler.transform([[age,gain,loss,hours]])
    age, gain, loss, hours = scaled_value[0,0],scaled_value[0,1],scaled_value[0,2],scaled_value[0,3]
    
    '''print([age, hours, gain, loss, _11th, _12th, _1st_4th, _5th_6th, _7th_8th,_9th,_Assoc_acdm,
                  assoc_voc, bachelors,doctorate, HS_grad, masters, preschool, prof_school, college,
                  marital, sex, country])'''
    return model.predict(np.array([[age, hours, gain, loss, _11th, _12th, _1st_4th, _5th_6th, _7th_8th,_9th,_Assoc_acdm,
                  assoc_voc, bachelors,doctorate, HS_grad, masters, preschool, prof_school, college,
                  marital, sex, country]]))[0]


In [7]:
prediction(71,0,0,20,'doctorate','Male','Married','United States')

1

In [8]:
# age, capital_gain, capital_loss, hours_per_week, education, sex, marital_status, country
prediction(53,0,0, 40,'masters','Male','Married','United States')

0

In [9]:
prediction(39,0,0,10,'Assoc_acdm','feMale','Married','United States')

1

In [7]:
prediction(39,0,0,10,'Assoc_acdm','feMale','Married','United States')

1