# Model 8.2. Neural Network (TensorFlow/Keras)

## Importing libraries

In [2]:
### Importing the libraries
import numpy as np
import pandas as pd
#from sklearnex import patch_sklearn

#patch_sklearn()

#plotting lib
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns


#Sklearn Lib metrics
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV, KFold

# Pipelines : 
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn import set_config

#GradientBoostingClassifier and AdaboostClassifier
import tensorflow as tf
from tensorflow import keras

#Missing values : 
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import KNNImputer, IterativeImputer, SimpleImputer
import missingno as msno

#Dummy
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import train_test_split


pd.set_option("display.max_columns",None)

## Importing and cleaning the data : 

In [261]:
train_df = pd.read_csv("Data/train.csv")
test_df = pd.read_csv("Data/test.csv")
#removing id for train_df
train_df.drop("id", axis = 1, inplace = True)
test_id = test_df[["id"]]
test_df.drop("id", axis = 1, inplace = True)

In [4]:
# Transforming object into categories 
# for train
for i in train_df.columns:
    if train_df[i].dtypes == "object":
        train_df[i] = train_df[i].astype("category")
# For test
for i in test_df.columns:
    if test_df[i].dtypes == "object":
        test_df[i] = test_df[i].astype("category")

In [10]:
X = train_df.drop("high_income", axis = 1)
y = train_df["high_income"]

# Grouping our features that are categories in one vector
# Same for numeric

categorical_features = [i for i in X.columns if X[i].dtype.name == "category"]
numerical_features = [i for i in X.columns if X[i].dtype.name != "category"]

## Splitting the data into a train and a validation set

In [143]:
X_train, X_valid, y_train, y_valid = train_test_split(X,y,test_size=0.2, random_state=1, stratify=y)
y_train

2110     0
44057    0
37761    0
44201    1
31675    0
        ..
30863    0
20221    1
35500    0
40565    0
44910    0
Name: high_income, Length: 43685, dtype: int64

## Creating a pipeline in order to process the data 

In [144]:
#Pipeline to process the data before 
numeric_transformer = Pipeline(
    steps=[("imputer_num", SimpleImputer(strategy="mean")),
           ("scaler", StandardScaler())]
)

categorical_transformer = Pipeline(
    steps=[("imputer_cat", SimpleImputer(strategy="constant", fill_value="Missing")),
           
           ("encoder" , OneHotEncoder(handle_unknown="ignore", sparse=False))]
)



preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[("preprocessor", preprocessor)]
)


set_config(display="diagram")
clf

In [264]:
new_x_train = clf.fit_transform(X_train)
new_x_valid = clf.transform(X_valid)
new_x_test = clf.transform(test_df)
new_x_train

array([[-8.27689714e-01, -2.02540599e+00, -1.73594115e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.31482797e-01,  3.95946317e-01, -1.73594115e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 8.31482797e-01,  0.00000000e+00, -2.38474242e-17, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00],
       ...,
       [-8.27689714e-01,  0.00000000e+00, -1.73594115e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [-8.27689714e-01,  1.22430368e+00, -1.73594115e-01, ...,
         0.00000000e+00,  0.00000000e+00,  0.00000000e+00],
       [ 9.03620732e-01,  0.00000000e+00, -1.73594115e-01, ...,
         0.00000000e+00,  0.00000000e+00,  1.00000000e+00]])

In [146]:
print(new_x_train.shape)
print(new_x_valid.shape)

(43685, 408)
(10922, 408)


## Running some Neural net models using Keras

In [147]:
model = keras.models.Sequential([
    keras.layers.Dense(units=300, activation="relu"),
    keras.layers.Dense(units=100, activation="relu"),
    keras.layers.Dense(units=2,  activation="softmax")
])

model.compile(loss = "sparse_categorical_crossentropy", # as y is encoded as a vector of labels and not as a one hot matrix
              optimizer = "sgd",
              metrics = ["accuracy"])


In [148]:
history = model.fit(x=new_x_train, y=y_train,
                    validation_data=(new_x_valid, y_valid),
                    batch_size = 64,
                    epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Another attempt : 

In [149]:
model = keras.models.Sequential([
    keras.layers.Dense(units=300, activation="relu"),
    keras.layers.Dense(units=50, activation="relu"),
    keras.layers.Dense(units=2,  activation="softmax")
])

model.compile(loss = "sparse_categorical_crossentropy", # as y is encoded as a vector of labels and not as a one hot matrix
              optimizer = "sgd",
              metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, verbose=1)


In [150]:
history = model.fit(x=new_x_train, y=y_train,
                    validation_data=(new_x_valid, y_valid),
                    batch_size = 64,
                    epochs=20,callbacks=[early_stopping_cb])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [151]:
y_pred = np.argmax(model.predict(new_x_valid), axis = 1)



In [152]:
y_pred

array([1, 0, 0, ..., 0, 0, 1])

In [153]:
from sklearn.metrics import accuracy_score
accuracy_score(y_valid, y_pred)


0.8597326496978576

# New test by changing the learning rate : 

In [256]:

tf.random.set_seed(1)


from keras.optimizers import Adam
model = keras.models.Sequential([
    keras.layers.Dense(units=300, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units=300, activation="relu"),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(units=50, activation="relu"),
    keras.layers.Dense(units=2,  activation="softmax")
    
])



model.compile(Adam(lr=0.001),loss = "sparse_categorical_crossentropy", # as y is encoded as a vector of labels and not as a one hot matrix
              
              metrics = ["accuracy"])
early_stopping_cb = keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, verbose=1)

history = model.fit(x=new_x_train, y=y_train,
                    validation_data=(new_x_valid, y_valid),
                    batch_size = 64,
                    epochs=20,callbacks=[early_stopping_cb])


  super(Adam, self).__init__(name, **kwargs)


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 10: early stopping


In [257]:
y_pred = np.argmax(model.predict(new_x_valid), axis = 1)
from sklearn.metrics import accuracy_score
accuracy_score(y_valid, y_pred)



0.8620216077641457

64 mini batch, 300 --> 0.2 --> 300 --> 0.2 --> 50 -->2 = 0.8620216077641457 best so far ! 

## Trying to publish this model

In [265]:
NN_keras = test_id
y_pred = np.argmax(model.predict(new_x_test), axis = 1)
NN_keras["high_income"] = y_pred
NN_keras




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  NN_keras["high_income"] = y_pred


Unnamed: 0,id,high_income
0,1,0
1,2,0
2,3,0
3,4,1
4,5,0
...,...,...
6063,6064,0
6064,6065,1
6065,6066,0
6066,6067,0


In [266]:
NN_keras.to_csv("Predictions/NN_keras.csv",index = False, header=True)