# -0- Import Libraries

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn import metrics
from sklearn.naive_bayes import BernoulliNB
from sklearn.model_selection import cross_val_score

from scipy import stats
import numpy as np
import warnings
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

###### A Possible way to deprecate warnings. We might not use that.

In [11]:

#--------------------------------#
# To use later
def fxn():
    warnings.warn("deprecated", DeprecationWarning)

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    fxn()
# To use later
#--------------------------------#

# -1- Data Preparing & Cleaning
#### - We encode the labels/targets to 0, n-1.
#### - Read the Excel
#### - If testing some selected phases only, we drop the right columns and define our features
#### - Data Cleaning with Aggelos' Rules

In [12]:
# Data Preparing
# Encode labels with value 0-> n_classes -1
le = preprocessing.LabelEncoder()

# Read excel with Pandas library
path_to_dataset = "../datasets/appliances_combination.xls"
#path_to_dataset= "../datasets/one_appliance.xls"

# when you run it through azure notebooks
#path_to_dataset= "one_appliance.xls"
#path_to_dataset= "appliances_combination.xls"

df = pd.read_excel(path_to_dataset)

# Data Cleaning with Aggelos Rules for 50, 150, 250 phases
df = df[(df.I50 > 0.1) & (df.I150 > 0.01) & (df.I250 > 0.01)]
#print(df.shape)

# These columns will be our features - X
#features = ['I50', 'Φ50',]# 'I150', 'Φ150', 'I250', 'Φ250'] # # 50 Version
#features = ['I150', 'Φ150',] # 150
#features = ['I50', 'Φ50', 'I150', 'Φ150'] # 50-150
#features = ['I250', 'Φ250'] #250
#features = ['I50', 'Φ50', 'I150', 'Φ150',]# 'I250', 'Φ250'] # 50-150 Version
features = ['I50', 'Φ50', 'I150', 'Φ150', 'I250', 'Φ250'] # 50-150-250 Version
df.head()

Unnamed: 0,I50,Φ50,I150,Φ150,I250,Φ250,appliance
0,1.619,2.606,0.3099,-178.8,0.2033,5.325,PLAYR-TV+Tostiera
1,4.572,0.6399,0.2669,-177.2,0.2273,2.255,PLAYR-TV+Tostiera
2,4.573,-179.0,0.2675,2.851,0.2269,-174.2,PLAYR-TV+Tostiera
3,4.573,0.8874,0.2661,-178.2,0.2292,2.44,PLAYR-TV+Tostiera
4,4.572,0.9411,0.2664,-178.1,0.2284,3.35,PLAYR-TV+Tostiera


In [13]:
# If you are going to test some selected phases, then you are going to remove some columns

#df = df.drop(df.columns[cols],axis=1,inplace=True)
#removeColumns = ['I150', 'Φ150', 'I250', 'Φ250'] # 50 
#removeColumns = ['I50', 'Φ50', 'I250', 'Φ250'] # 150
#removeColumns = ['I50', 'Φ50', 'I150', 'Φ150'] # 250
#removeColumns = [ 'I250', 'Φ250'] # 50-150
#columns = [ 'I250', 'Φ250'] 
#df.drop(removeColumns, inplace=True, axis=1)

# -2- Data Scaling-Normalizing 
#### There is standard scaling, min-max scaling, robust scaling (preferred one) and normalizing

In [14]:
# Features - X
X = df[features]

# Calculate Z-score in order to find outliers
z = np.abs(stats.zscore(X))
print(z) # Visualize
threshold = 2.5 # Change the threshold arbitrarily
print(np.where(z > threshold))
df = df[(z < threshold).all(axis=1)] # Remove outliers that exceed the threshold given from dataset

# Now get as X the 'clean' features
X = df[features]

# Scale the inputs (4 options; put comment in the ones you are not using)

# -1- Standard Scaling
#scaler = StandardScaler()
#X = scaler.fit_transform(X)

# -2- Min-max scaling 
#scaler = preprocessing.MinMaxScaler()
#X = scaler.fit_transform(X)

# -3- Robust scaling
scaler = preprocessing.RobustScaler()
X = scaler.fit_transform(X)

# -4- Normalizing
#scaler = preprocessing.Normalizer()
#X = scaler.fit_transform(X)

# Label - Y
y = df['appliance']

# Convert y to integer 
le.fit(y) # Fit label encoder
y = le.transform(y) # Transform labels to normalized encoding

[[0.73248368 0.23790926 0.00890449 0.91041454 0.28352734 0.14491286]
 [0.41940973 0.18281675 0.04913007 0.90068722 0.10816284 0.09461722]
 [0.41979981 4.8509113  0.04856879 0.19394722 0.11108559 2.79623488]
 ...
 [0.10305838 0.38602939 0.15040756 1.20771107 1.40874006 0.54965345]
 [0.14635678 0.52579921 0.18324286 0.87819277 1.35759208 0.40777717]
 [0.23061312 0.47087763 0.14030439 0.85205058 1.41531622 0.42776436]]
(array([    2,     2,    73, ..., 26482, 26482, 26499], dtype=int64), array([1, 5, 1, ..., 1, 5, 1], dtype=int64))


# -3- Classifying (Method 1: Cross-validation)

In [15]:
# a function that runs cross validation by giving as parameter the classification method
# X,y, the score type (i.e 'accuracy', 'f1_score') and number of folds
# Returns the score.
scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

def cross_val(method, X, y, kfold_num):
    return cross_val_score(method, X, y, scoring='f1_score', cv=kfold_num)

clf = DecisionTreeClassifier(random_state = 42)
#clf = LogisticRegression()
print(cross_val(clf, X, y, 10))

clf = LogisticRegression()
print(cross_val(clf, X, y, 10))

ValueError: 'f1_score' is not a valid scoring value. Valid options are ['accuracy', 'adjusted_mutual_info_score', 'adjusted_rand_score', 'average_precision', 'completeness_score', 'explained_variance', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'fowlkes_mallows_score', 'homogeneity_score', 'mutual_info_score', 'neg_log_loss', 'neg_mean_absolute_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_median_absolute_error', 'normalized_mutual_info_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'r2', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'roc_auc', 'v_measure_score']

# -3- Classifying (Method 2: Train/Test Splitting)

In [9]:
# Split to train & test set 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform Logistic Regression
reg = LogisticRegression().fit(X_train, y_train)

y_pred = reg.predict(X_test)
print("Logistic Regression:")
print("\nAccuracy: %1.3f" % reg.score(X_test, y_test))
print("Precision: %1.3f" % metrics.precision_score(y_test, y_pred, average ='weighted'))
print("Recall: %1.3f" % metrics.recall_score(y_test, y_pred, average ='weighted'))
print("F1-Score: %1.3f" % metrics.f1_score(y_test, y_pred, average ='weighted'))
print("-----------------\n")

# Decision Trees
clf = DecisionTreeClassifier(random_state = 42) # Feel free to change 'min_samples_split' 
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)
print("Decision Trees:")
print("\nAccuracy: %1.3f" % clf.score(X_test, y_test))
print("Precision: %1.3f" % metrics.precision_score(y_test, y_pred, average ='weighted'))
print("Recall: %1.3f" % metrics.recall_score(y_test, y_pred, average ='weighted'))
print("F1-Score: %1.3f" % metrics.f1_score(y_test, y_pred, average ='weighted'))
print("-----------------\n")

Logistic Regression:

Accuracy: 0.431
Precision: 0.365
Recall: 0.431
F1-Score: 0.356
-----------------



  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


Decision Trees:

Accuracy: 0.898
Precision: 0.898
Recall: 0.898
F1-Score: 0.898
-----------------

