In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
from flask import Flask, request, jsonify
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
import pandas as pd
import numpy as np
import joblib

df = pd.read_csv('/content/drive/MyDrive/df_cleaning_2021.csv')

X = ['AAE', 'EHW', 'Industry']
y = 'Injuries'

train, test = train_test_split(df, test_size=0.2, stratify=df[y], random_state=42)
train, val = train_test_split(train, test_size=0.2, stratify=train[y], random_state=42)

X_train, y_train = train[X], train[y]
X_val, y_val = val[X], val[y]
X_test, y_test = test[X], test[y]

cat_cols = ['Industry']
ordinal_encoder = OrdinalEncoder()
X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
X_val[cat_cols] = ordinal_encoder.transform(X_val[cat_cols])
X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])

num_cols = ['AAE', 'EHW']
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

pp2 = make_pipeline(
    SimpleImputer(),
    RandomForestClassifier(random_state = 42)
)

ps2 = {
    "simpleimputer__strategy": ["median", "mean"],
    "randomforestclassifier__max_depth": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    "randomforestclassifier__n_estimators" : [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, 120, 130, 140, 150, 160, 170, 180, 190, 200],
    "randomforestclassifier__max_samples" : [0.2, 0.4, 0.6, 0.8, 1.0],
    "randomforestclassifier__max_features" : [0.2, 0.4, 0.6, 0.8, 1.0]
}

clf1 = RandomizedSearchCV(
    pp2,
    param_distributions = ps2,
    n_iter = 10,
    cv = 5,
    n_jobs = -1,
    random_state = 42
)

clf1.fit(X_train, y_train)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[cat_cols] = ordinal_encoder.fit_transform(X_train[cat_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_val[cat_cols] = ordinal_encoder.transform(X_val[cat_cols])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_test[cat_cols] = ordinal_encoder.transform(X_test[cat_cols])
A value is t

In [3]:
val_score = clf1.best_estimator_.score(X_val, y_val)
print("Validation score: {:.2f}%".format(val_score*100))

test_score = clf1.best_estimator_.score(X_test, y_test)
print("Test score: {:.2f}%".format(test_score*100))

Validation score: 75.62%
Test score: 75.14%


In [4]:
joblib.dump(clf1.best_estimator_, 'model.pkl')

['model.pkl']