In [23]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics

le = preprocessing.LabelEncoder()

In [24]:
def preparexy(df):
    # Data Preparing
    # Encode labels with value 0-> n_classes -1
    
    features = ['I50', 'Φ50', 'I150', 'Φ150', 'I250', 'Φ250']
    
    # Features - X
    X = df[features]

    # Calculate Z-score in order to find outliers
    z = np.abs(stats.zscore(X))
    #print(z) # Visualize
    threshold = 2.5 # Change the threshold arbitrarily
    #print(np.where(z > threshold))
    df = df[(z < threshold).all(axis=1)] # Remove outliers that exceed the threshold given from dataset

    # Now get as X the 'clean' features
    X = df[features]

    # Scale the inputs (4 options; put comment in the ones you are not using)

    # -1- Standard Scaling
    #scaler = StandardScaler()
    #X = scaler.fit_transform(X)

    # -2- Min-max scaling 
    #scaler = preprocessing.MinMaxScaler()
    #X = scaler.fit_transform(X)

    # -3- Robust scaling
    scaler = preprocessing.RobustScaler()
    X = scaler.fit_transform(X)

    # -4- Normalizing
    #scaler = preprocessing.Normalizer()
    #X = scaler.fit_transform(X)

    # Label - Y
    y = df['appliance']

    # Convert y to integer 
    le.fit(y) # Fit label encoder
    y = le.transform(y) # Transform labels to normalized encoding
    
    return(X,y)

def data_process(dataset_path):
    df = pd.read_excel(dataset_path)

    # Data Cleaning with Aggelos Rules for 50, 150, 250 phases
    df = df[(df.I50 > 0.1) & (df.I150 > 0.01) & (df.I250 > 0.01)] # Clean useless current features


    #df['Φ50'] = df['Φ50'].apply(pd.to_numeric)

    # For angle between (90, 180):
    # Modify by +180 degrees
    rows_with_rads_to_decrease = df.loc[(df['Φ50'] > 90) & (df['Φ50'] < 180)]
    rows_with_rads_to_decrease['Φ50'] -= 180
    df.update(rows_with_rads_to_decrease)

    # For angle between (-180, -90):
    # Modify by -180 degrees
    rows_with_rads_to_increase = (df.loc[(df['Φ50'] < -90) & (df['Φ50'] > -180)])
    rows_with_rads_to_increase['Φ50'] += 180
    df.update(rows_with_rads_to_increase)
    
    return df

### Load Datasets 
Split them if you have to.

In [30]:
# Uncomment the following if you want to train on house1 and test on house2:
#X_train, y_train = preparexy(data_process("../datasets/appliances_combination_daskio.xls"))
#X_test, y_test = preparexy(data_process("../datasets/appliances_combination_veroia.xls"))

X, y = preparexy(data_process("../datasets/appliances_combination_daskio.xls"))

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


## printMetrics method (Used when splitting train/test so you have a $X_{test}$ and a $y_{test}$)
Takes as input the $classifier$, the $X_{test}$ and the $y_{test}$.

Then it prints a detailed report with all the known metrics. 

In [31]:
def printMetrics(clf, X_test, y_test):
    # Uncomment if you want confusion matrix to be shown
    #cm = confusion_matrix(y_test, y_pred) 
    #print("Confusion matrix")
    #print(cm)
    
    
    # This will print precision, recall, f1-score, support for all the categories
    #target_names = ['class 0', 'class 1', 'class 2']
    print("Classification report for classifier \n%s:\n%s" % (clf, metrics.classification_report(y_test, y_pred)))
    print("Accuracy: %1.3f" % clf.score(X_test, y_test))
    print("-----------------\n")   

## Decision Trees

In [27]:
# Decision Trees
clf = DecisionTreeClassifier(random_state = 42) # Feel free to change 'min_samples_split' 
clf.fit(X_train, y_train)

print("Decision Trees:")
y_pred = clf.predict(X_test)

printMetrics(clf, X_test, y_test)

Decision Trees:
Classification report for classifier 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'):
             precision    recall  f1-score   support

          0       0.99      0.97      0.98       118
          1       0.85      0.81      0.83       144
          2       0.94      0.92      0.93       132
          3       0.82      0.83      0.82       130
          4       0.91      0.94      0.93        53
          5       0.97      0.92      0.95        93
          6       0.93      0.94      0.93        96
          7       0.98      0.96      0.97       141
          8       0.76      0.76      0.76        51
          9       0.92      0.91      0.91        86
         10  

## Multi-layer Perceptron with 18 neurons in 1 hidden layer

In [None]:
from sklearn.neural_network import MLPClassifier

print(X_test.shape, y_test.shape)
print(le.inverse_transform(y_test[1])) # Check one TRUE class 

clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(18,), random_state=42)
clf.fit(X_train, y_train)

print("Multi-layer Perceptron:")
y_pred = clf.predict(X_test)
print(le.inverse_transform(y_pred[1])) # check one PREDICTED class

#printMetrics(clf, X_test, y_test)

(4525, 6) (4525,)
DVD-TV+Pistolaki+Mati


  if diff:
