# Example: Classification Model

In this example we'll demonstrate:
* Loading data file
* Splitting data sets
* Training a model (RandomForestClassifier)
* Plotting accuracy on the training and test data
* Predicting a result

In [None]:
import numpy as np
import matplotlib.pyplot as plt
% matplotlib inline
plt.rcParams["figure.dpi"] = 300
np.set_printoptions(precision=3)
import pandas as pd
from sklearn.base import clone
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale, StandardScaler

## Data set

The data set has the following columns, in file order (the last one, `has_asthma`, is the prediction target):

- Number of siblings with asthma
- Number of episodes
- Average days between episodes
- Days since last episode
- Last episode symptom severity (PASS score average - Pediatric Asthma Severity Score)
- Lung function measurements (FEV1 - Forced expiratory volume)
- Has Asthma


In [None]:
# header=None tells pandas the file has no header row, so the column names are
# supplied through `names`; index_col=False stops pandas from using the first
# column as the index.
data = pd.read_csv(
    "data/episode_data.csv",
    header=None,
    index_col=False,
    names=[
        'num_siblings_with_asthma',
        'num_episodes',
        'average_days_between',
        'days_since_last_episode',
        'last_episode_pass',
        'last_episode_fev1',
        'has_asthma',
    ],
)
# Preview the first rows to confirm the columns were parsed as expected.
data.head()

In [None]:
# (number of rows, number of columns) of the loaded DataFrame.
data.shape

In [None]:
# Column labels assigned through `names=` when the CSV was read.
data.columns

In [None]:
# Target vector: the underlying array of the has_asthma column.
y = data['has_asthma'].values

In [None]:
# Class balance of the target: how many rows carry each has_asthma value.
data['has_asthma'].value_counts()

In [None]:
# Feature matrix: every column except the target, as a plain array.
X = data.drop(columns='has_asthma').values

In [None]:
# Shape of the feature matrix (the target column has been dropped).
X.shape

In [None]:
# Shape of the target array taken from the has_asthma column.
y.shape

## Splitting data set

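The data set is split randomly into a training set and a test set. Passing a fixed `random_state` seeds the shuffle, so the same split is produced on every run, and `stratify=y` keeps the proportion of each `has_asthma` class the same in both sets.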

In [None]:
# train_test_split is already imported in the imports cell at the top of the
# notebook, so it is not imported again here.
# No test_size is given, so scikit-learn's default hold-out fraction applies.
# stratify=y preserves the has_asthma class proportions in both parts, and
# random_state=0 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=0,
)

# Training

Train a `RandomForestClassifier` over a range of `n_estimators` values, record the accuracy on the training and test sets at each step, and plot both curves.

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
# warm_start=True lets later fit() calls add trees to the existing forest
# instead of rebuilding it from scratch. random_state seeds the forest's
# randomness so scores and feature importances are reproducible between runs.
rf = RandomForestClassifier(warm_start=True, random_state=0)

In [None]:
# Start from an unfitted copy of the configured forest so this cell can be
# re-run safely: with warm_start=True an already fitted forest raises a
# ValueError when n_estimators is set below the number of trees it holds.
rf = clone(rf)

train_scores = []
test_scores = []

# Grow the forest step by step. Because rf was created with warm_start=True,
# each fit() only adds the trees needed to reach the new n_estimators.
estimator_range = range(1, 100, 5)
for n_estimators in estimator_range:
    rf.set_params(n_estimators=n_estimators)
    rf.fit(X_train, y_train)
    # Record accuracy on both sets so the two curves can be compared later.
    train_scores.append(rf.score(X_train, y_train))
    test_scores.append(rf.score(X_test, y_test))

In [None]:
# Accuracy versus forest size, one curve per data set, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.plot(estimator_range, test_scores, label="test scores")
ax.plot(estimator_range, train_scores, label="train scores")
ax.set_ylabel("accuracy")
ax.set_xlabel("n_estimators")
ax.legend()

In [None]:
# Importance scores of the fitted forest, one value per column of X.
rf.feature_importances_

In [None]:
# Feature names are every column except the target. The target is removed by
# name, matching how X was built, rather than by position through NumPy.
features_list = data.columns.drop('has_asthma')
features_list

In [None]:
# Label each importance score with its feature name, then draw the scores as a
# horizontal bar chart.
importance_by_feature = pd.Series(rf.feature_importances_, index=features_list)
importance_by_feature.plot(kind="barh")

# Predict

Predict on the training and test sets (showing the first 10 predictions of each), then report the accuracy on both.

In [None]:
# First 10 predicted labels for the training rows.
rf.predict(X_train)[:10]

In [None]:
# First 10 predicted labels for the held-out test rows.
rf.predict(X_test)[:10]

In [None]:
# Accuracy of the fitted forest on the data it was trained on.
rf.score(X_train, y_train)

In [None]:
# Accuracy of the fitted forest on the held-out test data.
rf.score(X_test, y_test)