# Model 9: Stacking (Ensembles)

## Import the libraries and cleaning of the dataset

In [1]:
### Importing the libraries
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import set_config
from sklearn.linear_model import LogisticRegression

#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config


#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn. preprocessing import StandardScaler


pd.set_option("display.max_columns",None)

In [4]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
train_df.head()

Unnamed: 0,id,occ_code_level2,age,stock_dividends,mig_chg_msa,tax_filer_stat,det_hh_summ,mig_prev_sunbelt,hisp_origin,education,wage_per_hour,capital_losses,vet_question,own_or_self,country_self,mig_move_reg,high_income,hs_college,class_worker,mig_same,unemp_reason,state_prev_res,ind_code_level2,race,country_mother,capital_gains,sex,ind_code_level1,citizenship,union_member,fam_under_18,marital_stat,region_prev_res,mig_chg_reg,country_father,occ_code_level1,full_or_part_emp,weeks_worked,det_hh_fam_stat,num_emp,vet_benefits
0,1,0,42.0,0.0,,Nonfiler,Householder,,All other,11th grade,0.0,,Not in universe,0,United-States,,0,Not in universe,Not in universe,Not in universe under 1 year old,Not in universe,Not in universe,0,Black,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Not in universe,,United-States,Not in universe,Not in labor force,0.0,Householder,0,2
1,2,18,56.0,,,,Householder,,All other,High school graduate,0.0,,Not in universe,2,United-States,,1,Not in universe,Self-employed-incorporated,Not in universe under 1 year old,,Not in universe,32,,United-States,,Male,Wholesale trade,Native- Born in the United States,Not in universe,Not in universe,Married-civilian spouse present,Not in universe,,United-States,Sales,Full-time schedules,,Householder,1,2
2,3,26,26.0,,,Joint both under 65,Householder,,All other,High school graduate,0.0,,Not in universe,0,Haiti,,0,Not in universe,Private,Not in universe under 1 year old,,Not in universe,41,,Haiti,,,Hospital services,Foreign born- Not a citizen of U S,Not in universe,Not in universe,,Not in universe,,Haiti,Adm support including clerical,Full-time schedules,,Householder,3,2
3,4,0,67.0,,MSA to MSA,Joint one under 65 & one 65+,Householder,No,All other,,0.0,,No,0,United-States,Same county,0,Not in universe,Not in universe,No,,North Carolina,0,Black,United-States,0.0,,Not in universe or children,Native- Born in the United States,Not in universe,Not in universe,,Northeast,Same county,United-States,Not in universe,Children or Armed Forces,0.0,Householder,0,1
4,5,0,,,Nonmover,Nonfiler,Child under 18 never married,Not in universe,All other,Children,0.0,0.0,Not in universe,0,United-States,Nonmover,0,Not in universe,Not in universe,Yes,Not in universe,Not in universe,0,White,United-States,0.0,Female,Not in universe or children,Native- Born in the United States,Not in universe,Mother only present,,Not in universe,Nonmover,United-States,Not in universe,Children or Armed Forces,,Child <18 never marr not in subfamily,0,0


In [5]:
#removing id for train_df
train_df.drop("id", axis = 1, inplace = True)

In [6]:
# Transforming object into categories 
# for train
for i in train_df.columns:
    if train_df[i].dtypes == "object":
        train_df[i] = train_df[i].astype("category")
# For test
for i in test_df.columns:
    if test_df[i].dtypes == "object":
        test_df[i] = test_df[i].astype("category")

In [7]:
X= train_df.drop("high_income", axis = 1)
y = train_df["high_income"]

In [8]:
# Grouping our features that are categories in one vector
# Same for numeric

categorical_features = [i for i in X.columns if X[i].dtype.name == "category"]
numerical_features = [i for i in X.columns if X[i].dtype.name != "category"]

## Model

In [15]:
#Neural Network with:- 2 layers of (9,6) neurons- alpha = 0.15- 'adaptive' learning rate- max_iter = 500- activation = relu- solver = 'adam'
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

estimators = [
     ('rf', RandomForestClassifier(n_estimators=2000, random_state=1,max_features=21,min_samples_leaf = 1 )),
    ('NN',MLPClassifier(random_state=1, hidden_layer_sizes = (9,6),alpha=0.15,learning_rate="adaptive",max_iter=500, solver="adam", activation = "relu"))
 ]

clf1 = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor), ("staker", clf1 )]
)


set_config(display="diagram")
clf

In [17]:
cross_val_score(clf, X, y, cv = KFold(n_splits=5, shuffle=True, random_state=1),scoring = "accuracy",n_jobs = -1,verbose=2).mean()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed: 11.4min remaining: 17.1min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 11.7min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed: 11.7min finished


0.8638085619682674

In [18]:
clf.fit(X,y)

[CV] END .................................................... total time=11.4min
[CV] END .................................................... total time=11.4min
[CV] END .................................................... total time=11.5min
[CV] END .................................................... total time=11.6min
[CV] END .................................................... total time=11.7min


## Fit the best model & export the predictions

In [19]:
test_id = test_df[["id"]]

y_test_pred = clf.predict(test_df.drop("id",axis = 1))

In [20]:
y_test_pred[0:5]

array([0, 0, 0, 0, 0])

In [21]:
stacking_rf_nn = test_id
stacking_rf_nn

Unnamed: 0,id
0,1
1,2
2,3
3,4
4,5
...,...
6063,6064
6064,6065
6065,6066
6066,6067


In [22]:
stacking_rf_nn["high_income"] = y_test_pred
stacking_rf_nn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  stacking_rf_nn["high_income"] = y_test_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,0
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [23]:
stacking_rf_nn.to_csv("Predictions/stacking_rf_nn.csv",index = False, header=True)