# End-to-End Machine Learning Project 

## Training the model: example 1

In [2]:
from joblib import dump
from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [3]:
# Load the iris dataset as a (features, labels) pair and unpack it.
iris = datasets.load_iris(return_X_y=True)
X, y = iris

In [8]:
# Preview the first five rows of the feature matrix
X[:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [4]:
# Hold out 10% of the samples for testing; the fixed seed keeps the shuffled split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [5]:
# Pipeline steps: min-max scale the features, then classify with a seeded random forest.
clf_pipeline = [
    ('scaling', MinMaxScaler()),
    ('clf', RandomForestClassifier(random_state=42)),
]
pipeline = Pipeline(steps=clf_pipeline)

# Fit the scaler and the classifier together on the training split.
pipeline.fit(X_train, y_train)

In [6]:
# Persist the fitted pipeline (scaler + classifier) to disk with joblib.
# NOTE(review): the file name says `dt`, but the pipeline fitted above wraps a
# RandomForestClassifier; kept as-is because other code may load this exact path.
dump(
    pipeline,
    'data/iris_dt_v1.joblib',
)

['data/iris_dt_v1.joblib']

## Creating the API: example 1

https://fastapi.tiangolo.com/

File: `app.py`

Run: `uvicorn app:app --reload`

Docs: `http://localhost:8000/docs`

## Training the model: example 2

In [9]:
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
import pandas as pd
import joblib
import gzip

In [10]:
# Preselected features: the only columns of the dataset that are used as model inputs
selected_features = [
    'concavity_mean',
    'concave_points_mean',
    'perimeter_se',
    'area_se',
    'texture_worst',
    'area_worst'
]

In [11]:
# Load the dataset and use the `id` column as the row index
data = pd.read_csv('data/breast_cancer.csv').set_index('id')

# Encode y explicitly: B -> 0, M -> 1.
# `.map` with a dict yields a numeric Series directly; `.replace` on an object
# column relies on silent downcasting, which pandas deprecated in 2.2 and which
# can leave `y` with object dtype on newer versions.
y = data.pop('diagnosis').map({'B': 0, 'M': 1})

# Any label other than B/M becomes NaN under `.map`; fail loudly instead of
# training on a corrupted target.
if y.isna().any():
    raise ValueError("Unexpected value in 'diagnosis' column; expected only 'B' or 'M'")

# Keep only the preselected feature columns
X = data[selected_features]

In [12]:
# Preview the first rows of the selected feature columns
X.head()

Unnamed: 0_level_0,concavity_mean,concave_points_mean,perimeter_se,area_se,texture_worst,area_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
842302,0.3001,0.1471,8.589,153.4,17.33,2019.0
842517,0.0869,0.07017,3.398,74.08,23.41,1956.0
84300903,0.1974,0.1279,4.585,94.03,25.53,1709.0
84348301,0.2414,0.1052,3.445,27.23,26.5,567.7
84358402,0.198,0.1043,5.438,94.44,16.67,1575.0


In [13]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Create an ensemble of 3 models.
# The decision tree is seeded so that repeated runs train the same ensemble and
# report the same accuracy (the train/test split is already seeded with 42).
estimators = [
    ('logistic', LogisticRegression()),
    ('cart', DecisionTreeClassifier(random_state=42)),
    ('svm', SVC()),
]

In [15]:
# Combine the three (name, estimator) pairs into a single voting classifier
ensemble = VotingClassifier(estimators=estimators)

# Full pipeline: impute missing values -> min-max scale to (0, 1) -> ensemble model
pipe_steps = [
    ('imputer', SimpleImputer()),
    ('scaler', MinMaxScaler(feature_range=(0, 1))),
    ('model', ensemble),
]
pipe = Pipeline(steps=pipe_steps)

In [16]:
# Train the model: fits every pipeline step (imputer, scaler, ensemble) on the training split
pipe.fit(X_train, y_train)

In [17]:
# Test accuracy on the held-out split
accuracy = pipe.score(X_test, y_test)

# Let the format spec do the rounding: multiplying an already-rounded float by
# 100 can reintroduce binary floating-point noise (e.g. 56.99999999999999).
print("Accuracy: %.1f%%" % (100 * accuracy))

Accuracy: 95.6%


In [18]:
# Export model
# Open the gzip stream in a context manager so it is closed (and the compressed
# data fully flushed to disk) as soon as the dump finishes, instead of leaving
# an open handle behind.
with gzip.open('data/model_binary.dat.gz', "wb") as model_file:
    joblib.dump(pipe, model_file)

## Creating the API: example 2

https://fastapi.tiangolo.com/

File: `cancer.py`

Run: `uvicorn cancer:app --reload`

Docs: `http://localhost:8000/docs`

Example request body (the feature values of the first row shown by `X.head()` above):

    {
      "concavity_mean": 0.3001,
      "concave_points_mean": 0.1471,
      "perimeter_se": 8.589,
      "area_se": 153.4,
      "texture_worst": 17.33,
      "area_worst": 2019.0
    }