In [1]:
from IPython.core.display import HTML
# Apply the notebook's custom stylesheet by rendering its contents as HTML.
css_file = 'style.css'
# Read inside a with-block so the file handle is closed as soon as the text is loaded.
with open(css_file, 'r') as css_handle:
    css_source = css_handle.read()
HTML(css_source)

In [2]:
from pandas import read_excel, get_dummies, concat
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import roc_auc_score

In [3]:
import pickle

# Share your machine learning model so that others can use it

## Creating the model

In [5]:
# Load the worksheet at position 1 of the workbook into a DataFrame.
# `sheet_name` is the current keyword; the older `sheetname` spelling is
# rejected by newer pandas releases.
data = read_excel("Pickle_model.xlsx", sheet_name = 1)

In [6]:
# Preview the first rows of the freshly loaded data
data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Cat1,Cat2,Cat3,Outcome
0,3,12,9.0,22,A,II,R,0
1,3,17,10.6,20,C,I,L,0
2,1,16,11.7,22,C,II,X,0
3,3,16,8.9,22,A,I,X,0
4,1,10,12.4,23,A,II,X,0


In [7]:
# The outcome vector
# .pop() returns the "Outcome" column and removes it from data in place,
# so re-running this cell without reloading data raises a KeyError.
y = data.pop("Outcome")

In [8]:
# Inspect the data type of each remaining column
# Note that the Outcome variable no longer appears: it was removed by the .pop() call above
data.dtypes

Var1      int64
Var2      int64
Var3    float64
Var4      int64
Cat1     object
Cat2     object
Cat3     object
dtype: object

In [9]:
# A describe()-style summary restricted to the categorical (object-typed) columns
def describe_categorical(X):
    """
    Display the .describe() summary of the object-dtype columns of X.

    The summary table is rendered as HTML in the notebook output; nothing is returned.
    """
    from IPython.display import display, HTML

    # Boolean mask over the columns: True where the column dtype is "object"
    is_object = X.dtypes == "object"
    object_columns = X.columns[is_object]

    summary = X[object_columns].describe()
    display(HTML(summary.to_html()))

In [10]:
# Summarise the object-typed columns of the feature table
describe_categorical(data)

Unnamed: 0,Cat1,Cat2,Cat3
count,700,700,700
unique,3,2,4
top,B,I,X
freq,245,358,195


In [11]:
# Creating dummy variables
categorical_variables = ["Cat1", "Cat2", "Cat3"]

# Label missing categories explicitly so they receive their own indicator column.
# The filled frame is assigned back instead of calling fillna(inplace=True) on a
# column selection: that chained form acts on a temporary object and is not
# guaranteed to update data (it has no effect under pandas copy-on-write).
data = data.fillna({variable: "Missing" for variable in categorical_variables})

# One-hot encode all categorical columns in a single call. Each source column is
# replaced by indicator columns named "<column>_<level>", appended after the
# remaining columns in the order given by categorical_variables.
data = get_dummies(data, columns = categorical_variables)

In [12]:
# Preview the feature table after the categorical columns were replaced by dummy columns
data.head()

Unnamed: 0,Var1,Var2,Var3,Var4,Cat1_A,Cat1_B,Cat1_C,Cat2_I,Cat2_II,Cat3_C,Cat3_L,Cat3_R,Cat3_X
0,3,12,9.0,22,1,0,0,0,1,0,0,1,0
1,3,17,10.6,20,0,0,1,1,0,0,1,0,0
2,1,16,11.7,22,0,0,1,0,1,0,0,0,1
3,3,16,8.9,22,1,0,0,1,0,0,0,0,1
4,1,10,12.4,23,1,0,0,0,1,0,0,0,1


In [13]:
# Fit a 100-tree random forest with out-of-bag scoring enabled,
# using all available cores and a fixed seed for repeatability.
model = RandomForestRegressor(
    n_estimators = 100,
    oob_score = True,
    n_jobs = -1,
    random_state = 42,
)
model.fit(data, y)

# Score the out-of-bag predictions against the true outcomes
oob_auc = roc_auc_score(y, model.oob_prediction_)
print("Area under the curve: ", oob_auc)

Area under the curve:  0.897733333333


## Pickle the model

In [15]:
# Serialise the fitted model to a .pkl file.
# "wb" opens the file for writing in binary mode.
# The with-statement closes the file automatically, even if pickle.dump raises,
# so no separate close() call is needed.
with open("Random_forest_regressor_model.pkl", "wb") as decision_tree_model_pkl:
    # Write to the file (dump the model)
    pickle.dump(model, decision_tree_model_pkl)