# Machine Learning with Flask

In [1]:
import pandas as pd
from sklearn.datasets import load_wine

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.ensemble import RandomForestClassifier

import pickle

In [2]:
# Load the wine dataset and wrap the feature matrix in a labelled DataFrame.
data = load_wine()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
y = data['target']  # target array that accompanies the features
df.head()

Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


### Select a Subset of Columns

In [3]:
# Custom column selector that can be inserted into a pipeline like any other sklearn object.
class RawFeats:
    """Transformer that keeps only the requested columns of ``X``.

    Parameters
    ----------
    feats : list
        Column labels used to index ``X`` in :meth:`transform` (``X[feats]``).
    """

    def __init__(self, feats):
        self.feats = feats

    def fit(self, X, y=None):
        """Learn nothing; return ``self`` so calls can be chained.

        Returning the estimator (instead of ``None``) follows the scikit-learn
        ``fit`` convention and makes ``RawFeats(...).fit(X).transform(X)`` work.
        """
        return self

    def transform(self, X, y=None):
        """Return ``X`` restricted to the columns listed in ``self.feats``."""
        return X[self.feats]

    def fit_transform(self, X, y=None):
        """Fit (a no-op) and return the column subset of ``X`` in one call."""
        return self.fit(X, y).transform(X)


# columns that feed the PCA branch
feats = [
    'alcohol',
    'malic_acid',
    'ash',
    'alcalinity_of_ash',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
]
# selector instance holding the column names we want to keep
raw_feats = RawFeats(feats)

### Scaling and PCA

In [4]:
# Building blocks of the PCA branch: a scaler followed by a PCA projection.
# n_components=2 is only the starting value; the grid search below also tries 1, 2 and 3.
sc = StandardScaler()
pca = PCA(n_components=2)

### Select KBest

In [5]:
# Univariate feature selector; k=4 is only the starting value, the grid search below tries k in {1, 2, 3}.
selection = SelectKBest(k=4)

### Random Forest

In [7]:
# Fix the seed so the forest (and therefore the grid-search scores and the
# selected best parameters) is reproducible across Restart & Run All.
rf = RandomForestClassifier(random_state=42)

### Pipeline

In [8]:
# PCA branch: column subset -> scaling -> PCA projection
PCA_pipeline = Pipeline(steps=[
    ("rawFeats", raw_feats),
    ("scaler", sc),
    ("pca", pca),
])

# k-best branch: a single feature-selection step
kbest_pipeline = Pipeline(steps=[("kBest", selection)])

In [9]:
# Combine the outputs of both branches into one feature matrix.
all_features = FeatureUnion(transformer_list=[
    ("pcaPipeline", PCA_pipeline),
    ("kBestPipeline", kbest_pipeline),
])

In [10]:
# Full model: combined features feeding the random forest.
main_pipeline = Pipeline(steps=[
    ("features", all_features),
    ("rf", rf),
])


In [11]:
# Hyper-parameter candidates; each key is a "<step>__<param>" path into main_pipeline.
# 3 values for each of 4 parameters -> 81 combinations.
param_grid = {
    "features__pcaPipeline__pca__n_components": [1, 2, 3],
    "features__kBestPipeline__kBest__k": [1, 2, 3],
    "rf__n_estimators": [2, 5, 10],
    "rf__max_depth": [2, 4, 6],
}

# Grid search over every combination, with refit enabled.
grid_search = GridSearchCV(
    estimator=main_pipeline,
    param_grid=param_grid,
    n_jobs=-1,
    verbose=10,
    refit=True,
)

# Run the search on the full feature frame and target.
grid_search.fit(df, y)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
[CV 1/5; 2/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_depth=2, rf__n_estimators=5
[CV 2/5; 1/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_depth=2, rf__n_estimators=2
[CV 4/5; 1/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_depth=2, rf__n_estimators=2
[CV 1/5; 1/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_depth=2, rf__n_estimators=2
[CV 3/5; 1/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_depth=2, rf__n_estimators=2
[CV 3/5; 2/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_depth=2, rf__n_estimators=5
[CV 5/5; 1/81] START features__kBestPipeline__kBest__k=1, features__pcaPipeline__pca__n_components=1, rf__max_

In [12]:
# Show the hyper-parameter combination selected by the grid search.
print(grid_search.best_params_)

{'features__kBestPipeline__kBest__k': 3, 'features__pcaPipeline__pca__n_components': 2, 'rf__max_depth': 6, 'rf__n_estimators': 5}


In [13]:
# Persist the fitted search object. The context manager guarantees the file
# handle is flushed and closed, even if pickling raises.
# NOTE: pickle stores the custom RawFeats class by reference (module + name), so
# the process that loads "model.p" must define/import RawFeats under the same
# module path. Only unpickle files from a trusted source.
with open("model.p", "wb") as model_file:
    pickle.dump(grid_search, model_file)

# Continue in Flask_ML2