In [1]:
from pandas import read_csv, DataFrame, to_numeric, concat
from numpy import nan
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from yellowbrick.classifier import ConfusionMatrix
from os.path import exists
import scoreValidator #I create this function for managing the score of each model

In [2]:
# Checking for existing files
fileAddress ="data_address.txt"
try:
    if exists(fileAddress):
        with open(fileAddress,'r') as text_file:
            Addrs = text_file.readlines()
            Addrs = [addrs.rstrip('\n') for addrs in Addrs]
            Addrs = [addrs.rstrip(' ') for addrs in Addrs]
        text_file.close()
    else:
        print("The file 'data_address.txt' must exists in the main directory. It contains all the address of pre-processed data to reutilize in each model. Please creat it and run this program again.")
        FileNotFoundError("The file 'data_address.txt' must exists in the main directory. It contains all the address of pre-processed data to reutilize in each model. Please creat it and run this program again.")
except:
    print("An error occurred when reading the file: 'data_address.csv'")
    RuntimeError("An error occurred when reading the file: 'data_address.csv'")

In [3]:
# Reading the three pre-processed files for the model
cleaned2 = DataFrame()
significance_filtered = DataFrame()
var_filtered = DataFrame()
try:
    cleaned2 = read_csv(Addrs[0],sep=',')#This is the data without the low variance, significance filter and outliers expulsion
    significance_filtered = read_csv(Addrs[1],sep=',')#This is the model with the significance filter
    var_filtered = read_csv(Addrs[2],sep=',')#This is the model with the low variance filter
except:
    print("Fatal Error: the pre-processed files was not found.")
    FileNotFoundError("Fatal Error: the pre-processed files was not found.")

In [4]:

predictors_cleaned2 = cleaned2.iloc[:,1:]
class_cleaned2 = cleaned2.iloc[:,0]
#-----------------------------------------------
predictors_significance = significance_filtered.iloc[:,1:]
predictors_variance = var_filtered.iloc[:,1:]
#This class can be the same for both predictor_significance and predictor_variance because the number of rows didn't changed on those two filters
class_filtered = var_filtered.iloc[:,0]

In [5]:
#The machine will be trained with the previous years partial data and them whene model works just fine it will be trained with the whole previous years data set and tested with the presente year data set
X_train_cleaned2, X_test_cleaned2, y_train_cleaned2, y_test_cleaned2 = train_test_split(predictors_cleaned2,class_cleaned2,test_size=0.3,random_state=0)
#Splitting significance variables
X_train_significance, X_test_significance, y_train_significance, y_test_significance = train_test_split(predictors_significance,class_filtered,test_size=0.3,random_state=0)
#splitting variance variables
X_train_variance, X_test_variance, y_train_variance, y_test_variance = train_test_split(predictors_variance,class_filtered,test_size=0.3,random_state=0)


In [6]:
#Instantiating an object for each data set
log_regression_cleaned2 = LogisticRegression()
log_regression_significance = LogisticRegression()
log_regression_variance = LogisticRegression()

In [7]:
scaler = StandardScaler()
X_train_cleaned2 = scaler.fit_transform(X_train_cleaned2)
X_test_cleaned2 = scaler.fit_transform(X_test_cleaned2)
X_train_variance = scaler.fit_transform(X_train_variance)
X_test_variance = scaler.fit_transform(X_test_variance)
X_train_significance = scaler.fit_transform(X_train_significance)
X_test_significance = scaler.fit_transform(X_test_significance)

In [8]:
log_regression_cleaned2.fit(X_train_cleaned2,y_train_cleaned2)
log_regression_significance.fit(X_train_significance,y_train_significance)
log_regression_variance.fit(X_train_variance,y_train_variance)

In [9]:
prediction_cleaned2 = log_regression_cleaned2.predict(X_test_cleaned2)
prediction_significance = log_regression_significance.predict(X_test_significance)
prediction_variance = log_regression_variance.predict(X_test_variance)

In [10]:
#Confusion matrix without filtering
confusion_matrix_cleaned2 = confusion_matrix(y_test_cleaned2,prediction_cleaned2)
confusion_matrix_cleaned2

array([[7299,   25],
       [  26,   26]], dtype=int64)

In [11]:
#Confusion matrix Significance
confusion_matrix_significance = confusion_matrix(y_test_significance,prediction_significance)
confusion_matrix_significance

array([[7296,    9],
       [  38,   14]], dtype=int64)

In [12]:
#Confusion matrix Variance
confusion_matrix_variance = confusion_matrix(y_test_variance,prediction_variance)
confusion_matrix_variance

array([[7292,   13],
       [  42,   10]], dtype=int64)

In [13]:
#As espectected the most filtered one is the most precise
print("Score of the model Logistic Regression without dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_cleaned2))
print("----------------------------------------------------------------\n")
print('\n')
print("Score of the model Logistic Regression with Trees Classifier dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_significance))
print("----------------------------------------------------------------\n")
print('\n')
print("Score of the model Logistic Regression with low variance filter dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_variance))
print("----------------------------------------------------------------\n")

Score of the model Logistic Regression without dimensionality reduction:

(The lower the best)


13900.0
----------------------------------------------------------------



Score of the model Logistic Regression with Trees Classifier dimensionality reduction:

(The lower the best)


19440.0
----------------------------------------------------------------



Score of the model Logistic Regression with low variance filter dimensionality reduction:

(The lower the best)


21380.0
----------------------------------------------------------------



In [14]:
#Based on the previous result we choose to use the low variance dimensionality reduction for the next step
# Checking for existing files
try:
    present_year_raw = read_csv("Data/air_system_present_year.csv")    
except:
    print("An error occurred when reading the file: 'data_address.csv'")
    RuntimeError("An error occurred when reading the file: 'data_address.csv'")
#Cleaning input sample
X_present_year = DataFrame(present_year_raw[var_filtered.columns]).iloc[:,1:].replace("na",nan).apply(to_numeric)
y_present_year = DataFrame(present_year_raw[var_filtered.columns]).iloc[:,0]
new_present_year_raw = concat([y_present_year,X_present_year],axis=1).dropna() 
X_present_year = new_present_year_raw.iloc[:,1:]
y_present_year = new_present_year_raw.iloc[:,0]
log_regression_variance = LogisticRegression()
scaler = StandardScaler()
X_present_year = scaler.fit_transform(X_present_year)
predictors_variance = scaler.fit_transform(predictors_variance)
#Training the model with the whole previous years data
log_regression_variance.fit(predictors_variance,class_filtered)
prediction = log_regression_variance.predict(X_present_year)
confusion_matrix_var = confusion_matrix(y_present_year,prediction)
print(confusion_matrix_var)
print("Score of the model Logistic Regression with low variance filter dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var))
print("----------------------------------------------------------------\n")
print("Score rate of the model Logistic Regression with low variance filter dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var)/len(y_present_year))
print("----------------------------------------------------------------\n")

[[11778    17]
 [   58    21]]
Score of the model Logistic Regression with low variance filter dimensionality reduction:

(The lower the best)


29695.0
----------------------------------------------------------------

Score rate of the model Logistic Regression with low variance filter dimensionality reduction:

(The lower the best)


2.5008421761832573
----------------------------------------------------------------



In [17]:
#Optimizing the best model
#Naive Bayes happens to be the best model based in our accuracy definition, so now is the momento for optimizing the best this model and get the best results
from accuracyMeassure import accuracyMeassure
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from numpy import ndarray

def accuracyMeassure(y_true:ndarray,y_predict:ndarray)->float:
    confusion_matrix_var = confusion_matrix(y_true,y_predict)
    return scoreValidator.score(confution_matrix_var=confusion_matrix_var)
#Defining a custome score
custom_score = make_scorer(accuracyMeassure)
#Defining parameters to be tested, we let it in blanck because we want it to auto-change them
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear', 'saga']
    }

grid_searcher = GridSearchCV(estimator=LogisticRegression(),param_grid=param_grid,cv=10,scoring=custom_score)
grid_searcher.fit(predictors_variance,class_filtered)

print("Best parameters found:\n")
print(grid_searcher.best_params_)
print("\n")
print("Best scores found:\n")
print(grid_searcher.best_score_)
print("\n")

prediction = grid_searcher.best_estimator_.predict(X_present_year)
confusion_matrix_var = confusion_matrix(y_present_year,prediction)
print(confusion_matrix_var)
print("Score of the model Logistic Regression with significance dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var))
print("----------------------------------------------------------------\n")
print("Score rate of the model Logistic Regression with significans dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var)/len(y_present_year))
print("----------------------------------------------------------------\n")



Best parameters found:

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}


Best scores found:

7734.5


[[11779    16]
 [   58    21]]
Score of the model Logistic Regression with significance dimensionality reduction:

(The lower the best)


29685.0
----------------------------------------------------------------

Score rate of the model Logistic Regression with significans dimensionality reduction:

(The lower the best)


2.5
----------------------------------------------------------------



In [18]:
#Based on the previous result we choose to use the low variance dimensionality reduction for the next step
# Checking for existing files
try:
    present_year_raw = read_csv("Data/air_system_present_year.csv")    
except:
    print("An error occurred when reading the file: 'data_address.csv'")
    RuntimeError("An error occurred when reading the file: 'data_address.csv'")
#Cleaning input sample
X_present_year = DataFrame(present_year_raw[significance_filtered.columns]).iloc[:,1:].replace("na",nan).apply(to_numeric)
y_present_year = DataFrame(present_year_raw[significance_filtered.columns]).iloc[:,0]
new_present_year_raw = concat([y_present_year,X_present_year],axis=1).dropna() 
X_present_year = new_present_year_raw.iloc[:,1:]
y_present_year = new_present_year_raw.iloc[:,0]
log_regression_significance = LogisticRegression()
scaler = StandardScaler()
X_present_year = scaler.fit_transform(X_present_year)
predictors_significance = scaler.fit_transform(predictors_significance)
#Training the model with the whole previous years data
log_regression_significance.fit(predictors_significance,class_filtered)
prediction = log_regression_significance.predict(X_present_year)
confusion_matrix_var = confusion_matrix(y_present_year,prediction)
print(confusion_matrix_var)
print("Score of the model Logistic Regression with low variance filter dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var))
print("----------------------------------------------------------------\n")
print("Score rate of the model Logistic Regression with low variance filter dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var)/len(y_present_year))
print("----------------------------------------------------------------\n")

[[6516   12]
 [  55   11]]
Score of the model Logistic Regression with low variance filter dimensionality reduction:

(The lower the best)


27895.0
----------------------------------------------------------------

Score rate of the model Logistic Regression with low variance filter dimensionality reduction:

(The lower the best)


4.23036093418259
----------------------------------------------------------------



In [19]:
#Optimizing the best model
#Naive Bayes happens to be the best model based in our accuracy definition, so now is the momento for optimizing the best this model and get the best results
from accuracyMeassure import accuracyMeassure
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from numpy import ndarray

def accuracyMeassure(y_true:ndarray,y_predict:ndarray)->float:
    confusion_matrix_var = confusion_matrix(y_true,y_predict)
    return scoreValidator.score(confution_matrix_var=confusion_matrix_var)
#Defining a custome score
custom_score = make_scorer(accuracyMeassure)
#Defining parameters to be tested, we let it in blanck because we want it to auto-change them
param_grid = {
    'C': [0.1, 1, 10, 100], 
    'penalty': ['l1', 'l2'], 
    'solver': ['liblinear', 'saga']
    }

grid_searcher = GridSearchCV(estimator=LogisticRegression(),param_grid=param_grid,cv=10,scoring=custom_score)
grid_searcher.fit(predictors_significance,class_filtered)

print("Best parameters found:\n")
print(grid_searcher.best_params_)
print("\n")
print("Best scores found:\n")
print(grid_searcher.best_score_)
print("\n")

prediction = grid_searcher.best_estimator_.predict(X_present_year)
confusion_matrix_var = confusion_matrix(y_present_year,prediction)
print(confusion_matrix_var)
print("Score of the model Logistic Regression with significance dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var))
print("----------------------------------------------------------------\n")
print("Score rate of the model Logistic Regression with significans dimensionality reduction:\n")
print("(The lower the best)")
print("\n")
print(scoreValidator.score(confusion_matrix_var)/len(y_present_year))
print("----------------------------------------------------------------\n")



Best parameters found:

{'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}


Best scores found:

7674.0


[[6516   12]
 [  56   10]]
Score of the model Logistic Regression with significance dimensionality reduction:

(The lower the best)


28370.0
----------------------------------------------------------------

Score rate of the model Logistic Regression with significans dimensionality reduction:

(The lower the best)


4.302396117682742
----------------------------------------------------------------

