# Split the project into steps

1. Read the file
2. Remove Outliers  
   1. Quartiles
   2. Deviation **(method used)**
   3. By hand in special cases 
3. Normalization and standardization of the data
   1. Z-Score
   2. Min Max **(method used)**
4. Split the data into train, test and validation sets
5. Train the NN 
   1. KFold **(method used)**
   2. Regular train with train set, validation and test set
6. Adjust parameters

# Import libs that we'll need for the project

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from numpy import percentile
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import datetime
import matplotlib.dates as mdates
import operator
from sklearn.datasets import make_hastie_10_2
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
import joblib

#file created by us that has functions 
from funcs import * 


#just to remove the warnings
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn

## Read the file and analyze data:

In [2]:
data_df = pd.read_csv("Proj1_Dataset.csv", sep=",", decimal=".")

data_df=parse_date_time(data_df)

data_df.to_csv("Proj1_Dataset_changed.csv")


#run to view the data
#plot_data(data_df, temperature=True, CO2=True, PIR=True, light=True)
#scatter_plot(data_df, temperature=True, C02_PIR=False, light=False)

# Remove Outliers and Interpolate data


In [3]:
print(data_df.describe())

             S1Temp        S2Temp        S3Temp       S1Light       S2Light  \
count  10128.000000  10127.000000  10129.000000  10129.000000  10129.000000   
mean      20.424883     20.553337     20.003294     62.218185     58.250864   
std        0.415856      0.663691      0.543616    131.357611    142.220513   
min        0.000000     19.660000    -12.320000      0.000000      0.000000   
25%       20.130000     20.140000     19.650000      0.000000      0.000000   
50%       20.330000     20.340000     19.910000      0.000000      0.000000   
75%       20.672500     20.700000     20.310000     28.000000     30.000000   
max       21.380000     24.000000     21.180000   5500.000000    516.000000   

            S3Light           CO2          PIR1          PIR2       Persons  
count  10129.000000  10128.000000  10129.000000  10129.000000  10129.000000  
mean      80.774706    474.081754      0.107612      0.094382      0.452068  
std      661.501771    204.196690      0.309905      0

While using "describe()" we realized the following:

- The dataframe is 10129 rows long and some values return a count of, for example, 10127 rows, therefore we need to fill in the data where this values are missing 
- If we look closely to the mean and max values of each collumn we realize that there are some outliers due to the discrepancy of the values. Due to the ammount of data we'll simply drop the entire row where the value is found. 



### Drop Outliners

In [4]:
data_df = drop_outliners(data_df, threshold=6,
                         collumn_to_remove_outliers=
                         ["S1Temp", "S2Temp","S3Temp",
                          "CO2","PIR1", "PIR2","S1Light",
                           "S2Light","S3Light"])


Removed outlined from index  56 from  S1Temp with value of: 0.0
Removed outlined from index  1188 from  S3Temp with value of: -12.32
Removed outlined from index  3760 from  S1Light with value of: 5500
Removed outlined from index  2800 from  S3Light with value of: 65536


### Interpolate NaN

In [5]:
# interpolate NaN values
data_df = data_df.interpolate(method='linear', limit_direction='forward', axis=0)


#run to view the data with outliners removed
#plot_data(data_df, temperature=True, CO2=True, PIR=True, light=True)
#scatter_plot(data_df, temperature=True, C02_PIR=False, light=False)

# Normalize data (min max method)

In [6]:
# split into train and test sets
from sklearn.model_selection import train_test_split

# split into train and test sets
train_val , test = train_test_split(data_df, test_size=0.15, shuffle=True, random_state=24)

# split into input and output 
y_train_val = train_val['Persons']
x_train_val = train_val.drop(['Persons'], axis=1)

y_test = test['Persons']
x_test = test.drop(['Persons'], axis=1)      


# Normalize data with Min Max 
x_train_val, x_train_val_min, x_train_val_max = normalize_train_set(x_train_val)
x_test = normalize_test_set(x_test, x_train_val_min, x_train_val_max)


#drop columns we don't need
x_train_val = x_train_val.drop(['DateTime'], axis=1)
x_test = x_test.drop(['DateTime'], axis=1)


# Train the NN (K-Fold method)

In [7]:
# Train scikit NN
from unittest import result
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, r2_score, f1_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import KFold, cross_val_score



#Implementing cross validation
k = 5
kf = KFold(n_splits=k, random_state=None)
model = MLPClassifier(solver='sgd', alpha=1e-5, hidden_layer_sizes=([7,5]), random_state=21, learning_rate_init=0.04)


acc_score = []
press_score = []
rec_score = []

x = x_train_val
y = y_train_val

x_test = x_test
y_test = y_test

for train_index , test_index in kf.split(x):
    x_train, x_val = x.iloc[train_index,:],x.iloc[test_index,:]
    y_train , y_val = y.iloc[train_index] , y.iloc[test_index]
     
    model.fit(x_train,y_train)
    pred_values = model.predict(x_val)
     
    acc = accuracy_score(pred_values , y_val)
    press = precision_score(pred_values,  y_val, average='macro')
    recall = recall_score(pred_values,  y_val, average='macro')
    
    acc_score.append(acc)
    press_score.append(press)
    rec_score.append(recall)
     
avg_acc_score = sum(acc_score)/k
avg_press_score = sum(press_score)/k
avg_recall_score = sum(rec_score)/k

 
print('accuracy of each fold - {}'.format(acc_score))
print('precision of each fold - {}'.format(press_score))
print('recall of each fold - {}'.format(rec_score))


print('Avg accuracy : {}'.format(avg_acc_score))
print('Avg precision : {}'.format(avg_press_score))
print('Avg recall : {}'.format(avg_recall_score))


#test data 
y_pred = model.predict(x_test)

print("\n")
print("Accuracy:",accuracy_score(y_test, y_pred))
print("Precision:",precision_score(y_test, y_pred, average='macro'))
print("Recall:",recall_score(y_test, y_pred, average='macro'))
print("Confusion:\n",confusion_matrix(y_test, y_pred))    
print("\n")


accuracy of each fold - [0.9866434378629501, 0.9825682742591516, 0.9924462521789658, 0.990122022080186, 0.9889599070307961]
precision of each fold - [0.9602620145932838, 0.9530357489599159, 0.9759864435792402, 0.9634234812806242, 0.9650149552119234]
recall of each fold - [0.9600200014484893, 0.958604992167138, 0.9738773970923826, 0.9756049727726689, 0.9649533009096987]
Avg accuracy : 0.98814797868241
Avg precision : 0.9635445287249975
Avg recall : 0.9666121328780756


Accuracy: 0.9874917709019092
Precision: 0.9591486241258729
Recall: 0.9653408976707714
Confusion:
 [[1189    3    0    0]
 [   2   59    0    0]
 [   0    0  150    9]
 [   0    0    5  102]]




# Export the model and min and max values (normalization)

In [8]:
#exports max and min vector to file 
np.savetxt("min.txt", x_train_val_min)
np.savetxt("max.txt", x_train_val_max)

#exports model to file
model_filename = 'model.sav'
joblib.dump(model, model_filename)


['model.sav']

# Test the model with a certain file 

In [5]:
from TestMe import *

In [6]:
TestMe("proffunc.csv")

Removed outlined from index  56 from  S1Temp with value of: 0.0
Removed outlined from index  1188 from  S3Temp with value of: -12.32
Removed outlined from index  3760 from  S1Light with value of: 5500
Removed outlined from index  2800 from  S3Light with value of: 65536


Here's the confunsion matrix obtained
[[7983    8    0    3]
 [   7  455    5    0]
 [   0    1  833   49]
 [   3    0   31  747]]


Macro-Precision:   96.8211018593876 
Macro-Recall:      96.81922410194345 
Macro-F1:          96.81532260028611 


Precision ( 0 persons):   99.8623967975982 %
Precision ( 1 persons):   97.4304068522484 %
Precision ( 2 persons):   94.3374858437146 %
Precision ( 3 persons):   95.64660691421255 %


Recall ( 0 persons):   99.87489052921306 %
Recall ( 1 persons):   98.0603448275862 %
Recall ( 2 persons):   95.8573072497123 %
Recall ( 3 persons):   93.4918648310388 %
