# -0- Import Libraries

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix 
from sklearn.neural_network import MLPClassifier

from scipy import stats
import numpy as np
import warnings
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

###### A Possible way to deprecate warnings. We might not use that.

In [None]:

#--------------------------------#
# To use later
def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()
# To use later
#--------------------------------#

# -1- Data Preparing & Cleaning
#### - We encode the labels/targets to 0, n-1.
#### - Read the Excel
#### - If testing some selected phases only, we drop the right columns and define our features
#### - Data Cleaning with Aggelos' Rules

In [2]:
# Data Preparing
# Encode labels with value 0-> n_classes -1
le = preprocessing.LabelEncoder()

# Read excel with Pandas library
path_to_dataset = "../datasets/appliances_combination.xls"
#path_to_dataset= "../datasets/one_appliance.xls"

# when you run it through azure notebooks
#path_to_dataset= "one_appliance.xls"
#path_to_dataset= "appliances_combination.xls"

df = pd.read_excel(path_to_dataset)

# Data Cleaning with Aggelos Rules for 50, 150, 250 phases
df = df[(df.I50 > 0.1) & (df.I150 > 0.01) & (df.I250 > 0.01)] # Clean useless current features


#df['Φ50'] = df['Φ50'].apply(pd.to_numeric)

# For angle between (90, 180):
# Modify by +180 degrees
rows_with_rads_to_decrease = df.loc[(df['Φ50'] > 90) & (df['Φ50'] < 180)]
rows_with_rads_to_decrease['Φ50'] -= 180
df.update(rows_with_rads_to_decrease)

# For angle between (-180, -90):
# Modify by -180 degrees
rows_with_rads_to_increase = (df.loc[(df['Φ50'] < -90) & (df['Φ50'] > -180)])
rows_with_rads_to_increase['Φ50'] += 180
df.update(rows_with_rads_to_increase)

print("\n")
print(df.shape)

#print(df.shape)

# These columns will be our features - X
#features = ['I50', 'Φ50',]# 'I150', 'Φ150', 'I250', 'Φ250'] # # 50 Version
#features = ['I150', 'Φ150',] # 150
#features = ['I50', 'Φ50', 'I150', 'Φ150'] # 50-150
#features = ['I250', 'Φ250'] #250
#features = ['I50', 'Φ50', 'I150', 'Φ150',]# 'I250', 'Φ250'] # 50-150 Version
features = ['I50', 'Φ50', 'I150', 'Φ150', 'I250', 'Φ250'] # 50-150-250 Version
df.head()



(26514, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,I50,Φ50,I150,Φ150,I250,Φ250,appliance
0,1.619,2.606,0.3099,-178.8,0.2033,5.325,PLAYR-TV+Tostiera
1,4.572,0.6399,0.2669,-177.2,0.2273,2.255,PLAYR-TV+Tostiera
2,4.573,1.0,0.2675,2.851,0.2269,-174.2,PLAYR-TV+Tostiera
3,4.573,0.8874,0.2661,-178.2,0.2292,2.44,PLAYR-TV+Tostiera
4,4.572,0.9411,0.2664,-178.1,0.2284,3.35,PLAYR-TV+Tostiera


In [None]:
# If you are going to test some selected phases, then you are going to remove some columns

#df = df.drop(df.columns[cols],axis=1,inplace=True)
#removeColumns = ['I150', 'Φ150', 'I250', 'Φ250'] # 50 
#removeColumns = ['I50', 'Φ50', 'I250', 'Φ250'] # 150
#removeColumns = ['I50', 'Φ50', 'I150', 'Φ150'] # 250
#removeColumns = [ 'I250', 'Φ250'] # 50-150
#columns = [ 'I250', 'Φ250'] 
#df.drop(removeColumns, inplace=True, axis=1)

# -2- Data Scaling-Normalizing 
#### There is standard scaling, min-max scaling, robust scaling (preferred one) and normalizing

In [3]:
# Features - X
X = df[features]

# Calculate Z-score in order to find outliers
z = np.abs(stats.zscore(X))
print(z) # Visualize
threshold = 2.5 # Change the threshold arbitrarily
print(np.where(z > threshold))
df = df[(z < threshold).all(axis=1)] # Remove outliers that exceed the threshold given from dataset

# Now get as X the 'clean' features
X = df[features]

# Scale the inputs (4 options; put comment in the ones you are not using)

# -1- Standard Scaling
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

# -2- Min-max scaling 
#scaler = preprocessing.MinMaxScaler()
#X = scaler.fit_transform(X)

# -3- Robust scaling
scaler = preprocessing.RobustScaler()
X = scaler.fit_transform(X)

# -4- Normalizing
#scaler = preprocessing.Normalizer()
#X = scaler.fit_transform(X)

# Label - Y
y = df['appliance']

# Convert y to integer 
le.fit(y) # Fit label encoder
y = le.transform(y) # Transform labels to normalized encoding

[[0.73248368 0.32116261 0.00890449 0.91041454 0.28352734 0.14491286]
 [0.41940973 0.2027718  0.04913007 0.90068722 0.10816284 0.09461722]
 [0.41979981 0.22445561 0.04856879 0.19394722 0.11108559 2.79623488]
 ...
 [0.10305838 0.63946472 0.15040756 1.20771107 1.40874006 0.54965345]
 [0.14635678 0.93982244 0.18324286 0.87819277 1.35759208 0.40777717]
 [0.23061312 0.82179896 0.14030439 0.85205058 1.41531622 0.42776436]]
(array([    2,    73,    94, ..., 26482, 26497, 26506], dtype=int64), array([5, 5, 5, ..., 5, 1, 1], dtype=int64))


# -3- Classification (Method 1: Cross-validation)
## WARNING: DOES NOT WORK YET IF YOU WANT ALL THE METRICS

In [None]:
# a function that runs cross validation by giving as parameter the classification method
# X,y, the score type (i.e 'accuracy', 'f1_score') and number of folds
# Returns the score.
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

def cross_val(method, X, y, kfold_num):
    return cross_val_score(method, X, y, scoring='f1_score', cv=kfold_num)

clf = DecisionTreeClassifier(random_state = 42)
#clf = LogisticRegression()
print(cross_val(clf, X, y, 10))

clf = LogisticRegression()
print(cross_val(clf, X, y, 10))

# -3- Classification (Method 2: Train/Test Splitting)

#### printMetrics method

In [4]:
def printMetrics(clf, X_test, y_test):
    # Uncomment if you want confusion matrix to be shown
    #cm = confusion_matrix(y_test, y_pred) 
    #print("Confusion matrix")
    #print(cm)
    
    
    # This will print precision, recall, f1-score, support for all the categories
    #target_names = ['class 0', 'class 1', 'class 2']
    print("Classification report for classifier \n%s:\n%s" % (clf, metrics.classification_report(y_test, y_pred)))
    print("Accuracy: %1.3f" % clf.score(X_test, y_test))
    print("-----------------\n")
    
    

# -3.1- Classification Methods

In [5]:
# Split to train & test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform Logistic Regression
clf = LogisticRegression().fit(X_train, y_train)
y_pred = clf.predict(X_test)

print("Logistic Regression\n\n")
printMetrics(clf, X_test, y_test)

Logistic Regression


Classification report for classifier 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False):
             precision    recall  f1-score   support

          0       0.39      0.95      0.55       118
          1       0.41      0.22      0.29       144
          2       0.69      0.71      0.70       132
          3       0.00      0.00      0.00       130
          4       0.25      0.34      0.29        53
          5       0.30      0.16      0.21        93
          6       0.23      0.06      0.10        96
          7       0.49      0.29      0.37       141
          8       0.00      0.00      0.00        51
          9       0.40      0.90      0.56        86
         10       0.00      0.00      0.00        39
         11       0.00      0.00     

  'precision', 'predicted', average, warn_for)


In [7]:
# Decision Trees
clf = DecisionTreeClassifier(random_state = 42) # Feel free to change 'min_samples_split' 
clf.fit(X_train, y_train)

print("Decision Trees:")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

Decision Trees:
Classification report for classifier 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'):
             precision    recall  f1-score   support

          0       0.99      0.97      0.98       118
          1       0.85      0.81      0.83       144
          2       0.94      0.92      0.93       132
          3       0.82      0.83      0.82       130
          4       0.91      0.94      0.93        53
          5       0.97      0.92      0.95        93
          6       0.93      0.94      0.93        96
          7       0.98      0.96      0.97       141
          8       0.76      0.76      0.76        51
          9       0.92      0.91      0.91        86
         10  

In [18]:
# Multi-layer Perceptron
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(18,), random_state=42)
clf.fit(X_train, y_train)

print("Multi-layer Perceptron:")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

Multi-layer Perceptron:
Classification report for classifier 
MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(18,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=42, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False):
             precision    recall  f1-score   support

          0       0.89      0.97      0.93       118
          1       0.61      0.58      0.59       144
          2       0.79      0.70      0.75       132
          3       0.46      0.49      0.48       130
          4       0.49      0.66      0.56        53
          5       0.71      0.65      0.68        93
          6       0.72      0.94      0.81        96
          7       0.93      0.91      0.92       141
          8       0.53      

In [28]:
# k-Nearest Neighbors
from sklearn.neighbors import KNeighborsClassifier
clf = KNeighborsClassifier(3)
clf.fit(X_train, y_train)

print("k-Nearest Neighbors")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

k-Nearest Neighbors
Classification report for classifier 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform'):
             precision    recall  f1-score   support

          0       0.88      0.95      0.91       118
          1       0.74      0.76      0.75       144
          2       0.85      0.91      0.88       132
          3       0.81      0.85      0.83       130
          4       0.76      0.91      0.83        53
          5       0.89      0.87      0.88        93
          6       0.81      0.88      0.84        96
          7       0.96      0.97      0.96       141
          8       0.64      0.75      0.69        51
          9       0.75      0.74      0.75        86
         10       0.95      1.00      0.97        39
         11       0.92      0.96      0.94        46
         12       0.98      1.00      0.99        40
         13       0.86      0.97  

In [33]:
# Support Vector Machines (C-Support Vector)
from sklearn.svm import SVC

clf = SVC(gamma=2, C=1)
clf.fit(X_train, y_train)

print("Support Vector Machines (C-Support Vector)")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

Support Vector Machines (C-Support Vector)
Classification report for classifier 
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=2, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False):
             precision    recall  f1-score   support

          0       0.91      0.97      0.94       118
          1       0.84      0.76      0.80       144
          2       0.81      0.91      0.85       132
          3       0.78      0.78      0.78       130
          4       0.74      0.92      0.82        53
          5       0.95      0.84      0.89        93
          6       0.76      0.94      0.84        96
          7       0.98      0.96      0.97       141
          8       0.90      0.69      0.78        51
          9       0.88      0.58      0.70        86
         10       0.95      1.00      0.97        39
         11       0.94      0.96      0.95        46
         1

In [40]:
# Random Forests
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(max_depth=5, n_estimators=100, max_features="auto")
clf.fit(X_train, y_train)

print("Random Forests")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

Random Forests


  'precision', 'predicted', average, warn_for)


Classification report for classifier 
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False):
             precision    recall  f1-score   support

          0       0.58      0.92      0.71       118
          1       0.53      0.43      0.47       144
          2       0.79      0.73      0.76       132
          3       0.78      0.29      0.42       130
          4       0.26      0.60      0.37        53
          5       0.67      0.91      0.78        93
          6       0.60      0.64      0.62        96
          7       0.81      0.95      0.87       141
          8       1.00      0.08      0.15        51
          9

In [43]:
# QuadraticDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

clf = QuadraticDiscriminantAnalysis()
clf.fit(X_train, y_train)

print("QuadraticDiscriminantAnalysis Classifier")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

QuadraticDiscriminantAnalysis Classifier
Classification report for classifier 
QuadraticDiscriminantAnalysis(priors=None, reg_param=0.0,
               store_covariance=False, store_covariances=None, tol=0.0001):
             precision    recall  f1-score   support

          0       0.90      0.95      0.93       118
          1       0.48      0.21      0.29       144
          2       0.71      0.70      0.71       132
          3       0.60      0.43      0.50       130
          4       0.61      0.96      0.75        53
          5       0.77      0.66      0.71        93
          6       0.76      0.93      0.84        96
          7       0.90      0.87      0.88       141
          8       0.39      0.55      0.46        51
          9       0.75      0.64      0.69        86
         10       1.00      1.00      1.00        39
         11       0.93      0.89      0.91        46
         12       0.91      1.00      0.95        40
         13       1.00      0.97      0.98  

## Proposal: An iterating way of evaluating classifiers

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

names = ["Nearest Neighbors", "Linear SVM", "RBF SVM", "Gaussian Process",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]




In [None]:
for clf in classifiers:
    clf.fit(X_train, y_train)
    print(clf)
    y_pred = clf.predict(X_test)
    
    printMetrics(clf, X_test, y_test)