In [5]:
import pandas as pd
import seaborn as sns
import numpy as np
import pickle

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.compose import make_column_selector
from sklearn.ensemble import RandomForestClassifier




In [7]:
dataset = pd.read_csv('data/salary.csv')

In [8]:
df = dataset.copy()

In [28]:
df['native-country'].unique()

array([' United-States', ' Cuba', ' Jamaica', ' India', ' ?', ' Mexico',
       ' South', ' Puerto-Rico', ' Honduras', ' England', ' Canada',
       ' Germany', ' Iran', ' Philippines', ' Italy', ' Poland',
       ' Columbia', ' Cambodia', ' Thailand', ' Ecuador', ' Laos',
       ' Taiwan', ' Haiti', ' Portugal', ' Dominican-Republic',
       ' El-Salvador', ' France', ' Guatemala', ' China', ' Japan',
       ' Yugoslavia', ' Peru', ' Outlying-US(Guam-USVI-etc)', ' Scotland',
       ' Trinadad&Tobago', ' Greece', ' Nicaragua', ' Vietnam', ' Hong',
       ' Ireland', ' Hungary', ' Holand-Netherlands'], dtype=object)

In [5]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
dropped_columns = ['fnlwgt', 'education-num', 'capital-gain', 'capital-loss']

In [61]:
df.drop(columns= dropped_columns, inplace=True)

In [62]:
df.columns

Index(['age', 'workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [8]:
salary = {' <=50K': 1,' >50K': 0}
df['salary'] = [salary[item] for item in df['salary']]

In [9]:
df.salary.value_counts()

1    24720
0     7841
Name: salary, dtype: int64

In [10]:
X = df.drop('salary', axis=1)
y = df['salary']

In [56]:
print(X[1:2])

        0         1         2        3        4         5    6    7    8    \
1  0.837109 -1.008707  1.134739 -0.14592 -0.21666 -2.222153  0.0  0.0  0.0   

   9    ...  98   99   100  101  102  103  104  105  106  107  
1  0.0  ...  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  

[1 rows x 108 columns]


In [11]:
preprocessor = ColumnTransformer([
    ('num_encoder', StandardScaler(), make_column_selector(dtype_include="int64")),
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False), make_column_selector(dtype_include="object"))
    ],remainder='passthrough')


preprocessor

In [12]:
X = pd.DataFrame(preprocessor.fit_transform(X))



In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

5514    -0.409205
19777   -0.189267
10781    1.423610
32240   -1.288956
9876    -0.849080
           ...   
29802    0.617171
5390    -0.555830
860     -1.508894
15795    0.837109
23654   -0.335892
Name: 0, Length: 26048, dtype: float64

In [44]:
param_grid = {'criterion': ['gini', 'entropy', 'log_loss'],
              'max_features': ['auto'],
              'max_depth': [2, 10, 20, 30, 50, 80, 100],
              'n_estimators': [10, 17, 25, 33, 41, 48, 56, 64, 72, 80],
              }

In [45]:
#rf = RandomForestClassifier()

# Use RandomizedSearchCV to tune the hyperparameters
#random_search = RandomizedSearchCV(rf, param_grid, cv=5, n_iter=50, random_state=42, verbose=1, n_jobs = -1)
#random_search.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


In [46]:
random_search.best_estimator_

In [47]:
pickle.dump(random_search.best_estimator_, open("rf_class_model.pickle", "wb"))

In [48]:
with open('rf_class_model.pickle', 'rb') as f:
    model = pickle.load(f)
    
model

In [49]:
y_pred = model.predict(X_test)
y_pred

array([1, 1, 0, ..., 0, 1, 1])

In [50]:
print(accuracy_score(y_test, y_pred))

0.8648856133886074
