# 0. Import libraries

In [1]:
import numpy as np
np.set_printoptions(threshold=10000,suppress=True)
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')

# 1. Read pre-processed data

In [2]:
df = pd.read_csv('data/preprocess-data.csv')
df

Unnamed: 0,date,valeur,qualificatif,temperature,humidity
0,2019-10-01,35.409255,Bon,291.608974,79.324786
1,2019-10-02,23.512584,Bon,289.831760,75.349785
2,2019-10-03,34.669224,Bon,288.229605,72.039474
3,2019-10-04,21.981365,Bon,289.450428,77.387580
4,2019-10-05,21.669476,Bon,290.004077,78.336910
...,...,...,...,...,...
87,2019-12-27,19.411680,Très bon,286.051875,84.543750
88,2019-12-28,15.648780,Très bon,284.314968,82.893843
89,2019-12-29,29.964674,Bon,281.876160,84.588608
90,2019-12-30,59.634476,Moyen,282.037917,83.122917


# 2. Seperate dataset to feature & label set

In [3]:
Y = df.values[:,2]
dfX = df.drop(['qualificatif', 'date'], axis=1)
X = dfX.values
print(X)
print(Y)

[[ 35.40925545 291.60897436  79.32478632]
 [ 23.51258405 289.83175966  75.34978541]
 [ 34.66922426 288.22960526  72.03947368]
 [ 21.98136459 289.45042827  77.3875803 ]
 [ 21.66947578 290.00407725  78.33690987]
 [ 23.43752597 289.65851528  78.27510917]
 [ 26.11668443 288.76184211  76.96052632]
 [ 24.67594973 290.13169935  77.32026144]
 [ 16.96153542 289.00032258  78.98924731]
 [ 22.18934812 289.25343348  75.33261803]
 [ 33.76499921 290.07441113  75.93147752]
 [ 30.51595275 291.33678038  79.78464819]
 [ 37.75002993 292.53400853  76.47334755]
 [ 37.16592586 291.41559829  78.93589744]
 [ 26.57727126 289.1395966   80.06581741]
 [ 21.12705143 288.94658849  78.44776119]
 [ 30.75551601 289.24765458  81.03624733]
 [ 28.30384926 289.23571429  81.60554371]
 [ 40.37571383 289.3517094   83.78632479]
 [ 39.92010781 289.2264454   83.85867238]
 [ 16.90840675 288.10053533  84.19914347]
 [ 12.6931172  287.81072187  85.25053079]
 [ 23.57408326 289.12840173  83.50323974]
 [ 29.53141209 288.94542484  82.44

In [4]:
# le = preprocessing.LabelEncoder()
# Yle = le.fit_transform(Y)
# Yle

# 3. Seperate training & test set

In [5]:
(X_train, X_test, Y_train, Y_test) = train_test_split(X,Y,test_size=0.3, random_state=1)

# 4. Normalize feature sets

In [6]:
ss = StandardScaler()
Xss_train = ss.fit_transform(X_train)
Xss_test = ss.transform(X_test)

# 5. Import classification libraries

In [7]:
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
import time

# 6. Find best parameters for classfiers

In [8]:
tree_param = {'random_state': range(20)}
knn_param = {'n_neighbors': range(1, 20)}

In [9]:
print("# Tuning hyper-parameters for accuracy")
print()

tr = GridSearchCV(tree.DecisionTreeClassifier(), tree_param, cv=5,
                   scoring='accuracy')
tr.fit(Xss_train, Y_train)

print("Best parameters set found on development set:")
print()
print(tr.best_params_)
print()
print("Grid scores on development set:")
print()
means = tr.cv_results_['mean_test_score']
stds = tr.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, tr.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
Y_true, Y_pred = Y_test, tr.predict(Xss_test)
print(metrics.classification_report(Y_true, Y_pred))
print()


# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'random_state': 0}

Grid scores on development set:

1.000 (+/-0.000) for {'random_state': 0}
1.000 (+/-0.000) for {'random_state': 1}
1.000 (+/-0.000) for {'random_state': 2}
0.969 (+/-0.071) for {'random_state': 3}
0.969 (+/-0.071) for {'random_state': 4}
0.969 (+/-0.071) for {'random_state': 5}
1.000 (+/-0.000) for {'random_state': 6}
1.000 (+/-0.000) for {'random_state': 7}
1.000 (+/-0.000) for {'random_state': 8}
0.984 (+/-0.059) for {'random_state': 9}
0.969 (+/-0.071) for {'random_state': 10}
1.000 (+/-0.000) for {'random_state': 11}
0.969 (+/-0.071) for {'random_state': 12}
0.969 (+/-0.071) for {'random_state': 13}
0.969 (+/-0.071) for {'random_state': 14}
0.969 (+/-0.071) for {'random_state': 15}
1.000 (+/-0.000) for {'random_state': 16}
0.984 (+/-0.059) for {'random_state': 17}
0.984 (+/-0.059) for {'random_state': 18}
1.000 (+/-0.000) for {'random_state': 19}

Detailed classification repo

In [10]:
print("# Tuning hyper-parameters for accuracy")
print()

knn = GridSearchCV(KNeighborsClassifier(), knn_param, cv=5,
                   scoring='accuracy')
knn.fit(Xss_train, Y_train)

print("Best parameters set found on development set:")
print()
print(knn.best_params_)
print()
print("Grid scores on development set:")
print()
means = knn.cv_results_['mean_test_score']
stds = knn.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, knn.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
          % (mean, std * 2, params))
print()

print("Detailed classification report:")
print()
print("The model is trained on the full development set.")
print("The scores are computed on the full evaluation set.")
print()
Y_true, Y_pred = Y_test, knn.predict(Xss_test)
print(metrics.classification_report(Y_true, Y_pred))
print()

# Tuning hyper-parameters for accuracy

Best parameters set found on development set:

{'n_neighbors': 5}

Grid scores on development set:

0.734 (+/-0.248) for {'n_neighbors': 1}
0.750 (+/-0.189) for {'n_neighbors': 2}
0.797 (+/-0.303) for {'n_neighbors': 3}
0.781 (+/-0.288) for {'n_neighbors': 4}
0.812 (+/-0.319) for {'n_neighbors': 5}
0.719 (+/-0.247) for {'n_neighbors': 6}
0.812 (+/-0.190) for {'n_neighbors': 7}
0.734 (+/-0.234) for {'n_neighbors': 8}
0.766 (+/-0.229) for {'n_neighbors': 9}
0.750 (+/-0.231) for {'n_neighbors': 10}
0.766 (+/-0.258) for {'n_neighbors': 11}
0.766 (+/-0.186) for {'n_neighbors': 12}
0.781 (+/-0.178) for {'n_neighbors': 13}
0.766 (+/-0.137) for {'n_neighbors': 14}
0.766 (+/-0.137) for {'n_neighbors': 15}
0.719 (+/-0.165) for {'n_neighbors': 16}
0.734 (+/-0.146) for {'n_neighbors': 17}
0.703 (+/-0.131) for {'n_neighbors': 18}
0.703 (+/-0.131) for {'n_neighbors': 19}

Detailed classification report:

The model is trained on the full development set.
The sc

## 7. Compare the classifiers

In [11]:
clfs = {
    'GaussianNB': GaussianNB(),
    'tree CART': tree.DecisionTreeClassifier(random_state=0),
    'MLP':MLPClassifier(solver='lbfgs', alpha=1e-5,hidden_layer_sizes=(20, 10), random_state=1),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'Bagging': BaggingClassifier(n_estimators=50, random_state=1),
    'AdaBoost': AdaBoostClassifier(n_estimators=50, random_state=1),
    'KNN': KNeighborsClassifier(n_neighbors=5),
    'RF': RandomForestClassifier(n_estimators=50, random_state=1),
}

In [12]:
def run_classifiers(clfs, X, Y):
    kf = KFold(n_splits=10, shuffle=True, random_state=0)
    result = {}

    for i in clfs:
        time_start = time.time()
        clf = clfs[i]
        
        # calculer l'Accuracy par cross_val_score
        cv_acc = cross_val_score(clf, X, Y, cv=kf)
        
        result.update({i: {'Accuracy': (np.mean(cv_acc), np.std(cv_acc)), 
                           'Execution time': (time.time() - time_start, 0.0)}
                      })
        
    return result

def print_results(results, ls_index):
    for i in results.keys():
        print('------------------------------------------------------------------------')
        print(i)
        print('------------------------------------------------------------------------')
        for index in ls_index:
            try:
                print("{0}: {1:.3f} +/- {2:.3f}".format(index, results[i][index][0], results[i][index][1]))
            except:
                print('<<< {0} >>> n\'est pas dans liste: Accuracy, Execution time'.format(index))
    #     print("AUC: {0:.3f} +/- {1:.3f}".format(np.mean(aucs), np.std(aucs)))
    #     print("Precision: {0:.3f} +/- {1:.3f}".format(np.mean(cv_acc_2), np.std(cv_acc_2)))
    #     print("Execution time: {0:.3f}s".format(time.time() - time_start))
        print('------------------------------------------------------------------------')

In [13]:
results = run_classifiers(clfs, X, Y)
print_results(results, ['Accuracy', 'Execution time'])

------------------------------------------------------------------------
GaussianNB
------------------------------------------------------------------------
Accuracy: 0.890 +/- 0.070
Execution time: 0.039 +/- 0.000
------------------------------------------------------------------------
------------------------------------------------------------------------
tree CART
------------------------------------------------------------------------
Accuracy: 0.922 +/- 0.071
Execution time: 0.021 +/- 0.000
------------------------------------------------------------------------
------------------------------------------------------------------------
MLP
------------------------------------------------------------------------
Accuracy: 0.944 +/- 0.075
Execution time: 0.947 +/- 0.000
------------------------------------------------------------------------
------------------------------------------------------------------------
KNN
-------------------------------------------------------------------

We choose `Decision Tree` algorithm based on its accuracy and execution time

# 8. Create a pipeline

In [14]:
from sklearn.pipeline import Pipeline

pip = Pipeline([('ss', StandardScaler()),
                ('clf', tree.DecisionTreeClassifier(random_state=0))])
pip.fit(X_train, Y_train)

tree_score =  pip.score(X_test, Y_test)

print('CART\'s accuracy score: ',tree_score)

CART's accuracy score:  0.8571428571428571


# 9. Demo a prediction with entered values of air quality index, temperature and humidity

In [15]:
arr = np.array([35, 291,  79])

pip.predict(arr.reshape(1,-1))


array(['Bon'], dtype=object)