In [20]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier

In [48]:
data = pd.read_csv("Sample.csv")
data.columns
for i in data.columns:
    if data[i].dtype == 'object':
        le.fit(data[i])
        data[i] = le.transform(data[i])

In [49]:
X = data[["current","powerfactor","activepower","reactivepower","apparentpower","activefundamentalpower","activeharmonicpower","meanphaseangle"]]
y = data.combinations
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2, random_state=0)

In [50]:
# max_depth can be controlled for pruning
model = DecisionTreeClassifier(criterion = 'entropy',min_samples_split=20, random_state=99, max_depth=10)
model = model.fit(X_train,y_train)
print(model)
Y_test = model.predict(X_test)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=20, min_weight_fraction_leaf=0.0,
            presort=False, random_state=99, splitter='best')


In [31]:
import pickle
with open('DecisionTrees.pickle','wb') as f:
    pickle.dump(model,f)

In [34]:
pickle_in = open('DecisionTrees.pickle','rb')
model = pickle.load(pickle_in)

In [35]:
print(pd.DataFrame(model.feature_importances_,columns = ['Importance'],index = X_train.columns))

                        Importance
current                   0.184855
powerfactor               0.016205
activepower               0.114206
reactivepower             0.215876
apparentpower             0.058358
activefundamentalpower    0.169705
activeharmonicpower       0.008748
meanphaseangle            0.232048


In [36]:
# max_features: max num of features Random Forest is allowed to try in individual tree
# n_estimators: Max num of trees to build, Higher num of trees give you better performance but makes slower
# n_jobs: parameter tells the engine how many processors is it allowed to use, -1 no restriction
model= RandomForestClassifier(n_estimators=1000)
model = model.fit(X_train,y_train)
print(model)
print("----------------------------------------------")
print(pd.DataFrame(model.feature_importances_,columns = ['Importance'],index = X_train.columns))
Y_test = model.predict(X_test)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
----------------------------------------------
                        Importance
current                   0.155075
powerfactor               0.106332
activepower               0.140573
reactivepower             0.137823
apparentpower             0.134682
activefundamentalpower    0.137424
activeharmonicpower       0.066822
meanphaseangle            0.121270


In [37]:
model3 = AdaBoostClassifier(n_estimators=100)
model3 = model3.fit(X_train,y_train)
print(model3)
print("----------------------------------------------")
print(pd.DataFrame(model3.feature_importances_,columns = ['Importance'],index = X_train.columns))
Y_test = model.predict(X_test)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=1.0, n_estimators=100, random_state=None)
----------------------------------------------
                        Importance
current                       0.06
powerfactor                   0.09
activepower                   0.04
reactivepower                 0.09
apparentpower                 0.08
activefundamentalpower        0.12
activeharmonicpower           0.00
meanphaseangle                0.52


In [40]:
model4= GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0).fit(X_train, y_train)
model4 = model4.fit(X_train,y_train)
print(model4)
print("----------------------------------------------")
print(pd.DataFrame(model4.feature_importances_,columns = ['Importance'],index = X_train.columns))
Y_test = model.predict(X_test)

GradientBoostingClassifier(init=None, learning_rate=1.0, loss='deviance',
              max_depth=1, max_features=None, max_leaf_nodes=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              presort='auto', random_state=0, subsample=1.0, verbose=0,
              warm_start=False)
----------------------------------------------
                        Importance
current                   0.119286
powerfactor               0.032857
activepower               0.044762
reactivepower             0.309048
apparentpower             0.064048
activefundamentalpower    0.049048
activeharmonicpower       0.119286
meanphaseangle            0.238095


In [51]:
from sklearn import metrics
# Confusion Matrix
print(metrics.confusion_matrix(y_test, Y_test))
print(metrics.classification_report(y_test, Y_test))
print(metrics.accuracy_score(y_test, Y_test))

[[73  0  0 ...,  0  0  0]
 [ 0 37  0 ...,  0  0  0]
 [ 0  0 39 ...,  0  0  0]
 ..., 
 [ 0  0  0 ..., 27  0  0]
 [ 0  0  0 ...,  0 36  0]
 [ 0  0  0 ...,  0  0 29]]
             precision    recall  f1-score   support

          0       1.00      1.00      1.00        73
          1       1.00      1.00      1.00        37
          2       1.00      1.00      1.00        39
          3       0.98      1.00      0.99        44
          4       1.00      1.00      1.00        35
          5       1.00      1.00      1.00        62
          6       1.00      1.00      1.00        32
          7       1.00      1.00      1.00        37
          8       1.00      1.00      1.00        27
          9       1.00      1.00      1.00        37
         10       1.00      1.00      1.00        36
         11       1.00      1.00      1.00        35
         12       1.00      1.00      1.00        44
         13       1.00      1.00      1.00        33
         14       1.00      1.00      1.

In [52]:
from sklearn.metrics import explained_variance_score, mean_absolute_error, mean_squared_error, r2_score, jaccard_similarity_score, roc_auc_score

#ev = explained_variance_score(y_test, Y_test, multioutput='uniform_average')
# Best possible score is 1.0, lower values are worse.
#print("Explained Variance Score: {}". format(ev))

mae = mean_absolute_error(y_test, Y_test, multioutput='uniform_average')
# MAE output is non-negative floating point. The best value is 0.0.
print("Mean Absolute Error: {}".format(mae))

mse = mean_squared_error(y_test, Y_test, multioutput='uniform_average')
# MAE output is non-negative floating point. The best value is 0.0.
print("Mean Squared Error: {}".format(mse))

r2 = r2_score(y_test, Y_test)
# R^2 (coefficient of determination) regression score function.
# Best possible score is 1.0 and it can be negative (because the model can be arbitrarily worse). A constant model that always 
# predicts the expected value of y, disregarding the input features, would get a R^2 score of 0.0.
print("R - Squared value: {}".format(r2))

print('What percent of predictions are same: {}'.format(jaccard_similarity_score(y_test, Y_test)))

Mean Absolute Error: 0.0091683038637852
Mean Squared Error: 0.12835625409299278
R - Squared value: 0.9991588209138685
What percent of predictions are same: 0.9993451211525868
