<a href="https://colab.research.google.com/github/halfbakedem/Projects-in-advanced-ML/blob/main/ML_HW1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setting up and preprocessing data 

In [1]:
#install aimodelshare library
! pip install aimodelshare --upgrade

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting aimodelshare
  Downloading aimodelshare-0.0.178-py3-none-any.whl (966 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m967.0/967.0 KB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting botocore==1.24.20
  Downloading botocore-1.24.20-py3-none-any.whl (8.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m23.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-resources==5.10.0
  Downloading importlib_resources-5.10.0-py3-none-any.whl (34 kB)
Collecting onnxruntime>=1.7.0
  Downloading onnxruntime-1.13.1-cp38-cp38-manylinux_2_27_x86_64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting wget==3.2
  Downloading wget-3.2.zip (10 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3==1.21.20
  Dow

In [2]:
import os
from google.colab import drive
# Attach Google Drive to the Colab VM under /content/drive.
drive.mount('/content/drive')

# Work from the course folder so the relative paths used by the following
# cells (the competition zip and the extracted CSV files) resolve against it.
os.chdir('/content/drive/My Drive/advanced ML')


Mounted at /content/drive


In [59]:
# Get the competition data from the course folder and unpack it into the
# current working directory.
from zipfile import ZipFile

# The context manager closes the archive once every member has been extracted.
with ZipFile("world_happiness_competition_data.zip", mode='r') as competition_archive:
    competition_archive.extractall()

In [60]:
# Load data
import pandas as pd

# Read the three competition CSV files; the order of the paths matches the
# order of the unpacking targets on the left.
X_train, X_test, y_train = (
    pd.read_csv(csv_path)
    for csv_path in (
        'world_happiness_competition_data/X_train.csv',
        'world_happiness_competition_data/X_test.csv',
        'world_happiness_competition_data/y_train.csv',
    )
)

# For every row, the name of the y_train column that holds the largest value.
# NOTE(review): presumably y_train is one-hot encoded, which makes this the
# class label of each row -- confirm against the data.
y_train_labels = y_train.idxmax(axis='columns')

# Preview the first rows of the training features.
X_train.head()

Unnamed: 0,Country or region,GDP per capita,Social support,Healthy life expectancy,Freedom to make life choices,Generosity,Perceptions of corruption,name,region,sub-region,Terrorist_attacks
0,Peru,0.96,1.274,0.854,0.455,0.083,0.027,Peru,Americas,Latin America and the Caribbean,18.0
1,Nicaragua,0.694,1.325,0.835,0.435,0.2,0.127,Nicaragua,Americas,Latin America and the Caribbean,125.611111
2,Greece,1.181,1.156,0.999,0.067,0.0,0.034,Greece,Europe,Southern Europe,112.0
3,Qatar,1.684,1.313,0.871,0.555,0.22,0.167,Qatar,Asia,Western Asia,57.333333
4,Uzbekistan,0.745,1.529,0.756,0.631,0.322,0.24,Uzbekistan,Asia,Central Asia,125.611111


In [61]:
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Column groups: everything that is neither an identifier column nor one of
# the two categorical columns is treated as numeric.
numeric_features = X_train.drop(
    columns=['Country or region', 'name', 'region', 'sub-region']
).columns.tolist()
categorical_features = ['region', 'sub-region']

# Numeric branch: fill gaps with the column median, then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the modal value, then one-hot encode.
# Categories not seen during fit are ignored at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column group through its own branch.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

# Fit on the training features. The fitted transformer is kept under the name
# `preprocess` because `preprocessor` is rebound to a function in the next cell.
preprocess = preprocessor.fit(X_train)

In [62]:
# Write function to transform data with preprocessor

def preprocessor(data):
    """Transform raw competition features with the fitted ``preprocess`` object.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw feature frame containing the numeric and categorical columns that
        ``preprocess`` was fitted on.

    Returns
    -------
    The result of ``preprocess.transform(data)``.

    Notes
    -----
    An earlier version called ``data.drop(['Country or region', 'name'], axis=1)``
    and discarded the result, so nothing was actually dropped. The call is
    removed rather than "fixed": ``preprocess`` is a ColumnTransformer that only
    reads the columns listed in its transformers, so the identifier columns
    never reach the model either way. The one observable difference is that a
    frame lacking those two identifier columns no longer raises ``KeyError``.
    """
    return preprocess.transform(data)

In [63]:
#Save preprocessor
import aimodelshare as ai

# Export the `preprocessor` function; the captured output of this cell reports
# that it is written to 'preprocessor.zip'.
# NOTE(review): the empty string is presumably the target directory (i.e. the
# current working directory) -- confirm against the aimodelshare documentation.
ai.export_preprocessor(preprocessor,"") 

Your preprocessor is now saved to 'preprocessor.zip'


Set up modelshare.org

In [64]:
#Set credentials using modelshare.org username/password
# (the captured output shows an interactive username/password prompt, so no
# secrets are stored in the notebook itself)

from aimodelshare.aws import set_credentials

#This is the unique rest api that powers this World Happiness Classification Playground -- make sure to update the apiurl for new competition deployments
apiurl="https://e2w6gh3id1.execute-api.us-east-2.amazonaws.com/prod/m"

# `apiurl` is reused further down to instantiate the competition object.
set_credentials(apiurl=apiurl)

AI Modelshare Username:··········
AI Modelshare Password:··········
AI Model Share login credentials set successfully.


Model 1: Random Forest Classifier

In [65]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search space: 25, 50, ..., 175 trees crossed with four maximum tree depths.
param_grid1 = {'n_estimators': np.arange(25, 200, 25), 'max_depth': [1, 3, 5, 7]}

# A fixed random_state makes every forest -- and therefore the selected
# hyper-parameters and the reported CV score -- repeatable from run to run.
# Without it, re-running the cell can pick a different "best" configuration.
model1 = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid=param_grid1,
    cv=10,
)

# Fit the 10-fold cross-validated grid search on the preprocessed training data.
model1.fit(preprocessor(X_train), y_train_labels)

# Report the best mean cross-validation score and the parameters that achieved it.
print("best mean cross-validation score: {:.3f}".format(model1.best_score_))
print("best parameters: {}".format(model1.best_params_))


best mean cross-validation score: 0.704
best parameters: {'max_depth': 7, 'n_estimators': 75}


In [11]:
# Convert the fitted sklearn model to ONNX and write it to "model.onnx".
from aimodelshare.aimsonnx import model_to_onnx
from skl2onnx.common.data_types import FloatTensorType

# The ONNX input is declared as a float tensor with an open batch dimension
# and one column per preprocessed feature.
feature_count = preprocessor(X_test).shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = model_to_onnx(
    model1,
    framework='sklearn',
    initial_types=initial_type,
    transfer_learning=False,
    deep_learning=False,
)

# Persist the serialised model in binary mode.
with open("model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

In [12]:
#Instantiate Competition
import aimodelshare as ai
# Competition handle for the playground behind `apiurl`; the cells below use it
# to submit models and to fetch the leaderboard.
mycompetition= ai.Competition(apiurl)

In [13]:
# Submit Model 1

# Predict class labels for the preprocessed test set with the tuned random forest.
prediction_labels = model1.predict(preprocessor(X_test))

# Send the ONNX model, the exported preprocessor and the predictions to the
# competition leaderboard, tagged with the team number.
mycompetition.submit_model(
    model_filepath="model.onnx",
    preprocessor_filepath="preprocessor.zip",
    prediction_submission=prediction_labels,
    custom_metadata={"team": "5"},
)

Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 

Your model has been submitted as model version 261

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:3164


Model 2: Gradient Boosting Classifier

In [28]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Search space: 50, 75, ..., 275 boosting stages crossed with three learning
# rates and three tree depths.
param_grid = {
    'n_estimators': np.arange(50, 300, 25),
    'learning_rate': [0.6, 0.8, 1],
    'max_depth': [1, 3, 5],
}

# A fixed random_state makes the boosted models -- and therefore the selected
# hyper-parameters and the reported CV score -- repeatable from run to run.
model2 = GridSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_grid=param_grid,
    cv=10,
)

# Fit the 10-fold cross-validated grid search on the preprocessed training data.
model2.fit(preprocessor(X_train), y_train_labels)

# Report the best mean cross-validation score and the parameters that achieved it.
print("best mean cross-validation score: {:.3f}".format(model2.best_score_))
print("best parameters: {}".format(model2.best_params_))


best mean cross-validation score: 0.675
best parameters: {'learning_rate': 1, 'max_depth': 5, 'n_estimators': 200}


In [32]:
# Convert the fitted sklearn model to ONNX, write it to "model.onnx", then
# submit it together with its test-set predictions.
from aimodelshare.aimsonnx import model_to_onnx
from skl2onnx.common.data_types import FloatTensorType

# The ONNX input is declared as a float tensor with an open batch dimension
# and one column per preprocessed feature.
feature_count = preprocessor(X_test).shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = model_to_onnx(
    model2,
    framework='sklearn',
    initial_types=initial_type,
    transfer_learning=False,
    deep_learning=False,
)

# Persist the serialised model in binary mode.
with open("model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

# Predicted class labels for the preprocessed test set.
prediction_labels = model2.predict(preprocessor(X_test))

# Send model, preprocessor and predictions to the competition leaderboard.
mycompetition.submit_model(
    model_filepath="model.onnx",
    preprocessor_filepath="preprocessor.zip",
    prediction_submission=prediction_labels,
    custom_metadata={"team": "5"},
)

Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 

Your model has been submitted as model version 208

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:3164


Model 3: TensorFlow neural network

In [52]:
# Import relevant modules from TensorFlow
import tensorflow as tf
from tensorflow.keras import layers

# NOTE(review): this convolutional draft is never fitted -- `model3` is rebound
# to a dense network in the next cell. Consider deleting this cell.
model3 = tf.keras.Sequential([
    # first convolution: 16 filters, 4x4 kernel, ReLU
    layers.Conv2D(16, (4, 4), activation='relu'),
    # max pooling over 4x4 windows
    layers.MaxPooling2D((4, 4)),
    # second convolution: 28 filters, 3x3 kernel, ReLU
    layers.Conv2D(28, (3, 3), activation='relu'),
    # max pooling over 4x4 windows
    layers.MaxPooling2D((4, 4)),
    # output layer: 5 units with softmax
    layers.Dense(5, activation='softmax'),
])

# Compile with the 'sgd' optimizer and categorical cross-entropy loss.
model3.compile(optimizer='sgd', loss='categorical_crossentropy', metrics=['accuracy'])

In [66]:
# Import relevant modules from TensorFlow
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import SGD

# Width of the preprocessed feature matrix. Deriving it from the data keeps the
# input layer in sync with the preprocessing (the number of one-hot columns
# depends on the categories present) instead of relying on a hard-coded 26.
n_input_features = preprocessor(X_train).shape[1]

# Three hidden layers of 32 ReLU units, followed by a 5-unit softmax output.
model3 = Sequential([
    Dense(32, input_dim=n_input_features),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(5),
    Activation('softmax'),
])

# Compile with the 'sgd' optimizer and categorical cross-entropy loss.
model3.compile(optimizer='sgd', loss="categorical_crossentropy", metrics=['accuracy'])

In [97]:
# Train model 3 on the preprocessed training features.
import numpy as np

# NOTE(review): eager execution of tf.functions is presumably enabled here as a
# workaround -- confirm it is still needed.
tf.config.run_functions_eagerly(True)

# 25% of the training rows are held out for validation; the History object
# returned by fit is the cell's displayed value.
model3.fit(
    preprocessor(X_train),
    y_train,
    validation_split=0.25,
    epochs=200,
    batch_size=10,
)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.callbacks.History at 0x7fa774da4670>

In [99]:
# Save keras model to local ONNX file, then submit it with its predictions.
import numpy as np
from aimodelshare.aimsonnx import model_to_onnx
# Imported here as well so this cell does not depend on one of the earlier
# sklearn export cells having been run first.
from skl2onnx.common.data_types import FloatTensorType

# The ONNX input is declared as a float tensor with an open batch dimension
# and one column per preprocessed feature.
feature_count = preprocessor(X_test).shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = model_to_onnx(model3, framework='keras',
                          initial_types=initial_type,
                          transfer_learning=False,
                          deep_learning=True)

with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

# Class probabilities for each test row -> index of the most probable class.
class_probabilities = model3.predict(preprocessor(X_test))
predicted_indices = np.argmax(class_probabilities, axis=-1)

# The network was fitted against y_train, so output unit i corresponds to
# column i of y_train. Taking the names from y_train.columns keeps the mapping
# correct by construction instead of relying on a hand-written dict.
labels = dict(enumerate(y_train.columns))
prediction_labels = [labels[i] for i in predicted_indices]

# Submit model to Competition Leaderboard
mycompetition.submit_model(model_filepath="model.onnx",
                           preprocessor_filepath="preprocessor.zip",
                           prediction_submission=prediction_labels,
                           custom_metadata={"team": "5"})

Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 

Your model has been submitted as model version 267

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:3164


# Part 2: Improving my place on the leaderboard!

Model 4: Random Forest

Using Anne's parameters to try to improve the model.

In [109]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Random forest with fixed hyper-parameters (200 trees, depth 7). The fixed
# random_state makes the fitted forest -- and the leaderboard score it earns --
# repeatable, so comparisons with other submissions are not down to chance.
model4 = RandomForestClassifier(n_estimators=200, max_depth=7, random_state=42)

# Fit on the preprocessed training data; the fitted estimator is the cell's
# displayed value.
model4.fit(preprocessor(X_train), y_train_labels)

RandomForestClassifier(max_depth=7, n_estimators=200)

In [110]:
# Convert the fitted sklearn model to ONNX and write it to "model.onnx".
from aimodelshare.aimsonnx import model_to_onnx
from skl2onnx.common.data_types import FloatTensorType

# The ONNX input is declared as a float tensor with an open batch dimension
# and one column per preprocessed feature.
feature_count = preprocessor(X_test).shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = model_to_onnx(
    model4,
    framework='sklearn',
    initial_types=initial_type,
    transfer_learning=False,
    deep_learning=False,
)

# Persist the serialised model in binary mode.
with open("model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

In [111]:
# Submit Model 4

# Predict class labels for the preprocessed test set with the random forest.
prediction_labels = model4.predict(preprocessor(X_test))

# Send the ONNX model, the exported preprocessor and the predictions to the
# competition leaderboard, tagged with the team number.
mycompetition.submit_model(
    model_filepath="model.onnx",
    preprocessor_filepath="preprocessor.zip",
    prediction_submission=prediction_labels,
    custom_metadata={"team": "5"},
)

# Result: F1 0.49, model version 271; did not improve :(

Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 

Your model has been submitted as model version 271

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:3164


Model 5: improved Gradient Boost

Using Oliver's gradient boosting parameters to improve my F1 score.

In [106]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import numpy as np

# Gradient boosting with fixed hyper-parameters (52 depth-1 trees, learning
# rate 1.115). The fixed random_state makes the fitted model -- and the
# leaderboard score it earns -- repeatable from run to run.
model5 = GradientBoostingClassifier(
    n_estimators=52,
    learning_rate=1.115,
    max_depth=1,
    random_state=42,
)

# Fit on the preprocessed training data; the fitted estimator is the cell's
# displayed value.
model5.fit(preprocessor(X_train), y_train_labels)


GradientBoostingClassifier(learning_rate=1.115, max_depth=1, n_estimators=52)

In [107]:
# Convert the fitted sklearn model to ONNX, write it to "model.onnx", then
# submit it together with its test-set predictions.
from aimodelshare.aimsonnx import model_to_onnx
from skl2onnx.common.data_types import FloatTensorType

# The ONNX input is declared as a float tensor with an open batch dimension
# and one column per preprocessed feature.
feature_count = preprocessor(X_test).shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = model_to_onnx(
    model5,
    framework='sklearn',
    initial_types=initial_type,
    transfer_learning=False,
    deep_learning=False,
)

# Persist the serialised model in binary mode.
with open("model.onnx", "wb") as onnx_file:
    onnx_file.write(onnx_model.SerializeToString())

# Predicted class labels for the preprocessed test set.
prediction_labels = model5.predict(preprocessor(X_test))

# Send model, preprocessor and predictions to the competition leaderboard.
mycompetition.submit_model(
    model_filepath="model.onnx",
    preprocessor_filepath="preprocessor.zip",
    prediction_submission=prediction_labels,
    custom_metadata={"team": "5"},
)

# Result: F1 0.487, model version 270; improved :)
# NOTE(review): the output captured for this cell reports model version 269 --
# confirm which version this score belongs to.

Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 

Your model has been submitted as model version 269

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:3164


Model 6: Neural Network

None of my teammates attempted a three-layer neural network. I think continuing with a three-layer network is the right path, since my model generated the highest F1 score among my teammates' models. However, I did learn that I could perhaps increase the number of epochs and the batch size.

In [113]:
# Import relevant modules from TensorFlow
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.optimizers import SGD

# Width of the preprocessed feature matrix. Deriving it from the data keeps the
# input layer in sync with the preprocessing (the number of one-hot columns
# depends on the categories present) instead of relying on a hard-coded 26.
n_input_features = preprocessor(X_train).shape[1]

# Three hidden layers of 32 ReLU units, followed by a 5-unit softmax output.
model6 = Sequential([
    Dense(32, input_dim=n_input_features),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(32),
    Activation('relu'),
    Dense(5),
    Activation('softmax'),
])

# Compile with the 'sgd' optimizer and categorical cross-entropy loss.
model6.compile(optimizer='sgd', loss="categorical_crossentropy", metrics=['accuracy'])

In [115]:
# Train model 6 on the preprocessed training features, with more epochs and a
# larger batch size than model 3.
import numpy as np

# NOTE(review): eager execution of tf.functions is presumably enabled here as a
# workaround -- confirm it is still needed.
tf.config.run_functions_eagerly(True)

# 25% of the training rows are held out for validation; the History object
# returned by fit is the cell's displayed value.
model6.fit(
    preprocessor(X_train),
    y_train,
    validation_split=0.25,
    epochs=300,
    batch_size=30,
)

Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300
Epoch 75/300
Epoch 76/300
Epoch 77/300
Epoch 78

<keras.callbacks.History at 0x7fa77522f760>

In [116]:
# Save keras model to local ONNX file, then submit it with its predictions.
import numpy as np
from aimodelshare.aimsonnx import model_to_onnx
# Imported here as well so this cell does not depend on one of the earlier
# sklearn export cells having been run first.
from skl2onnx.common.data_types import FloatTensorType

# The ONNX input is declared as a float tensor with an open batch dimension
# and one column per preprocessed feature.
feature_count = preprocessor(X_test).shape[1]
initial_type = [('float_input', FloatTensorType([None, feature_count]))]

onnx_model = model_to_onnx(model6, framework='keras',
                          initial_types=initial_type,
                          transfer_learning=False,
                          deep_learning=True)

with open("model.onnx", "wb") as f:
    f.write(onnx_model.SerializeToString())

# Class probabilities for each test row -> index of the most probable class.
class_probabilities = model6.predict(preprocessor(X_test))
predicted_indices = np.argmax(class_probabilities, axis=-1)

# The network was fitted against y_train, so output unit i corresponds to
# column i of y_train. Taking the names from y_train.columns keeps the mapping
# correct by construction instead of relying on a hand-written dict.
labels = dict(enumerate(y_train.columns))
prediction_labels = [labels[i] for i in predicted_indices]

# Submit model to Competition Leaderboard
mycompetition.submit_model(model_filepath="model.onnx",
                           preprocessor_filepath="preprocessor.zip",
                           prediction_submission=prediction_labels,
                           custom_metadata={"team": "5"})

#F1: 0.482, model version 272; improved!!!

Insert search tags to help users find your model (optional): 
Provide any useful notes about your model (optional): 

Your model has been submitted as model version 272

To submit code used to create this model or to view current leaderboard navigate to Model Playground: 

 https://www.modelshare.org/detail/model:3164


In [72]:
# Get leaderboard to explore current best model architectures

# Get raw data in pandas data frame
data = mycompetition.get_leaderboard()

# Stylize leaderboard data
# (last expression of the cell, so the styled leaderboard is what is displayed)
mycompetition.stylize_leaderboard(data)

Unnamed: 0,accuracy,f1_score,precision,recall,ml_framework,transfer_learning,deep_learning,model_type,depth,num_params,dense_layers,relu_act,softmax_act,loss,optimizer,memory_size,team,username,version
0,58.82%,57.65%,65.59%,59.50%,sklearn,False,False,GradientBoostingClassifier,,,,,,,,,,yatharth,58
1,55.88%,51.89%,69.50%,56.64%,sklearn,False,False,GradientBoostingClassifier,,,,,,,,,,yatharth,68
2,55.88%,51.89%,69.50%,56.64%,sklearn,False,False,GradientBoostingClassifier,,,,,,,,,,yatharth,66
3,52.94%,51.56%,71.39%,55.29%,sklearn,False,False,RandomForestClassifier,,,,,,,,,6.0,bobbie8881,123
4,52.94%,51.56%,71.39%,55.29%,sklearn,False,False,GaussianNB,,,,,,,,,6.0,bobbie8881,125
5,52.94%,50.96%,72.38%,55.29%,sklearn,False,False,RandomForestClassifier,,,,,,,,,5.0,halfbaked_em,127
6,55.88%,55.63%,63.33%,57.00%,sklearn,False,False,SVC,,,,,,,,,9.0,sofiazaidman,14
7,52.94%,51.25%,65.11%,55.29%,sklearn,False,False,RandomForestClassifier,,,,,,,,,2.0,ivster,29
8,50.00%,50.23%,66.11%,51.29%,sklearn,False,False,GradientBoostingClassifier,,,,,,,,,5.0,oliverhegi,159
9,50.00%,48.71%,67.34%,51.29%,sklearn,False,False,GradientBoostingClassifier,,,,,,,,,5.0,oliverhegi,155
