In [100]:
import pandas as pd
import random as rnd
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline

In [101]:
#ML tools:
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [98]:
prepared_const = pd.read_csv("./Data/prepared_const_CSV")
scaled_const = pd.read_csv("./Data/scaled_const_CSV")

In [108]:
#Let's split the dataset and train the model:
y = scaled_const['DELAYED']
X = scaled_const.loc[:,'built_area':'OTHERS']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.15,random_state=1,stratify=y)

train = pd.concat([X_train,y_train],axis=1)
test = pd.concat([X_test,y_test],axis=1)
train.to_csv('./prototrain_csv')
test.to_csv('./prototest_csv')

In [105]:
train_file = "./Data/prototrain_CSV"
test_file = "./Data/prototest_CSV"

train = pd.read_csv("./Data/prototrain_CSV") 
test = pd.read_csv("./Data/prototest_CSV") 

In [111]:
train

Unnamed: 0,built_area,modul_price,weeks_duration,DETACHED,COLLECTIVE,COMMERCIAL,OTHERS,DELAYED
198,0.426125,-1.357797,-0.929886,0,1,0,0,1
457,-0.999290,-0.011105,0.364784,0,1,0,0,1
2112,1.409302,0.772159,-0.120717,0,0,1,0,1
2739,2.147799,-1.180747,0.243409,0,1,0,0,0
2601,0.123445,1.232673,0.243409,0,1,0,0,1
199,-0.344412,1.356397,1.254869,0,0,0,1,1
2464,-0.087555,-0.877116,0.931202,0,0,0,1,0
2060,-0.398011,0.058296,-1.051261,0,1,0,0,1
2191,-1.120664,0.368674,0.364784,0,1,0,0,0
1850,-1.196256,-1.495552,0.607534,0,1,0,0,1


In [10]:
list(prepared_const)

['Unnamed: 0',
 'built_area',
 'modul_price',
 'weeks_duration',
 'DETACHED',
 'COLLECTIVE',
 'COMMERCIAL',
 'OTHERS',
 'DELAYED']

In [11]:
y[y_test.index.values].head(5)

275     1
344     1
2355    1
10      1
1582    1
Name: DELAYED, dtype: int64

## Test and combination of ML models:
Due to the impossibility for the client (construction managers) most of the times, to manipulate every parameter in the construction project, it is useless to recomend global optimal parameters. So the nature of this problem force us to focus in local optima rather than on globals. So, we have to refine the models to avoid overfitting, but taking care of not "softening" the models too much. With the confussion matrix in mind, so we are open to have a model with many False Positives (of potentially delayed construction projects but finally not delayed). This way we won't take the risk of regularizing the models too much nd loosing valuable information.

In [106]:
#K-NEIGHBORS:
knn =KNeighborsClassifier(n_neighbors=10)
knn.fit(X,y)
y_pred_kn = knn.predict(X_test)
print("KNEIGHBORS REGRESSION MODEL: ")
print("Basic scoring: " +str(knn.score(X_test,y_test)))
print("Normalized accuracy: " +str(accuracy_score(y_test,y_pred_kn)))
print("Net accuracy: " +str(accuracy_score(y_test,y_pred_kn, normalize = False)) + 
      " over " + str(y_test.size) + " samples.")

ValueError: Found input variables with inconsistent numbers of samples: [2403, 2402]

In [52]:
#LOGISTIC:
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred_lg = logreg.predict(X_test)
print("LOGISTIC REGRESSION MODEL: ")
print("Basic scoring: " +str(logreg.score(X_test,y_test)))
print("Normalized accuracy: " +str(accuracy_score(y_test,y_pred_lg)))
print("Net accuracy: " +str(accuracy_score(y_test,y_pred_lg, normalize = False)) + 
      " over " + str(y_test.size) + " samples.")


LOGISTIC REGRESSION MODEL: 
Basic scoring: 0.787081339713
Normalized accuracy: 0.787081339713
Net accuracy: 329 over 418 samples.


In [53]:
#RANDOM FOREST CLASSIF:
ranfor = RandomForestClassifier(max_depth=2, random_state=0)
ranfor.fit(X, y)
y_pred_rf = ranfor.predict(X_test)
print("RANDOM FOREST MODEL: ")
print("Basic scoring: " +str(ranfor.score(X_test,y_test)))
print("Normalized accuracy: " +str(accuracy_score(y_test,y_pred_rf)))
print("Net accuracy: " +str(accuracy_score(y_test,y_pred_rf, normalize = False)) + 
      " over " + str(y_test.size) + " samples.")

RANDOM FOREST MODEL: 
Basic scoring: 0.801435406699
Normalized accuracy: 0.801435406699
Net accuracy: 335 over 418 samples.


# Evaluating ML tools performance and combination:
As we will see, the performance and predictions of random forest and logistic regression tools are similar, probably because of the scarcity of samples. We will combine these tools only to score the confidence of the prediction, as they will predict the same labels most of the times.

In [96]:
#Coincidence between models:
print("LOGIST VS RANDFOR: " +str(accuracy_score(y_pred_lg,y_pred_rf)))
print("LOGIST VS K-NEIGH: " +str(accuracy_score(y_pred_lg,y_pred_kn)))
print("RANDFOR VS K-NEIGH: " +str(accuracy_score(y_pred_rf,y_pred_kn)))

LOGIST VS RANDFOR: 0.990453460621
LOGIST VS K-NEIGH: 0.935560859189
RANDFOR VS K-NEIGH: 0.935560859189


# Observations disagreement between algorithms.

In [32]:
X

Unnamed: 0,built_area,modul_price,weeks_duration,DETACHED,COLLECTIVE,COMMERCIAL,OTHERS
0,-1.436540,2.205662,-0.627210,0,0,0,1
1,-1.420652,-0.636344,-1.237346,0,0,0,1
2,-1.445182,-0.893928,-2.050862,1,0,0,0
3,-1.469212,2.691423,-0.830589,1,0,0,0
4,-1.499916,-0.819468,-1.684780,0,0,0,1
5,-1.494325,-0.623508,-1.115319,1,0,0,0
6,-1.447921,0.405212,-1.522077,1,0,0,0
7,-1.457428,-0.903396,-0.423831,1,0,0,0
8,-1.388011,-1.092141,-1.278022,0,0,0,1
9,-1.458532,-0.672322,-1.969510,1,0,0,0


In [58]:
to_predict = prepared_const.iloc[159,1:-1].values.reshape(1, -1)

#Single prediction:
prediction = knn.predict(to_predict)

#Triple prediction:
print(to_predict)
print([knn.predict(to_predict),logreg.predict(to_predict),ranfor.predict(to_predict)])

[[-0.27671178  0.76841355  1.36590222  0.          1.          0.          0.        ]]
[array([1], dtype=int64), array([1], dtype=int64), array([1], dtype=int64)]


In [64]:
mask.loc[26]

index    26
kn        0
lg        1
rf        1
Name: 26, dtype: int64

In [75]:
ykn = pd.DataFrame(y_pred_kn).rename(columns={0:"kn"}).reset_index()
ylg= pd.DataFrame(y_pred_lg).rename(columns={0:"lg"})
yrf= pd.DataFrame(y_pred_rf).rename(columns={0:"rf"})
y_concat = pd.concat([ykn,ylg,yrf],axis=1)

diff1 = y_concat['kn'] != y_concat['lg'] 
diff2 = y_concat['rf'] != y_concat['lg'] 

mask = y_concat[diff1 | diff2]

In [77]:
scaled_voting_data = pd.concat([prepared_const,mask],axis=1)
scaled_voting_data[voting_data['index'].notnull()].head(20)

Unnamed: 0.1,Unnamed: 0,built_area,modul_price,weeks_duration,DETACHED,COLLECTIVE,COMMERCIAL,OTHERS,DELAYED,index,kn,lg,rf
0,0,-1.43654,2.205662,-0.62721,0,0,0,1,1,0.0,0.0,1.0,1.0
24,24,-1.481138,-1.638691,-0.667886,1,0,0,0,1,24.0,0.0,1.0,1.0
26,26,-1.490713,0.647004,-1.481401,0,0,0,1,0,26.0,0.0,1.0,1.0
47,47,-1.443567,1.639408,-1.481401,1,0,0,0,1,47.0,0.0,1.0,1.0
53,53,-1.45705,-0.134819,-1.440725,1,0,0,0,1,53.0,0.0,1.0,1.0
56,56,-1.458233,0.491059,-1.766131,0,0,0,1,1,56.0,1.0,0.0,1.0
59,59,-1.440519,-1.982527,-0.62721,0,0,0,1,1,59.0,0.0,1.0,1.0
97,97,-1.428983,1.038266,-1.644104,0,1,0,0,1,97.0,1.0,0.0,1.0
99,99,-1.415967,0.183116,-1.359374,0,1,0,0,1,99.0,0.0,1.0,1.0
121,121,-1.411292,2.497859,-2.050862,0,1,0,0,1,121.0,0.0,1.0,1.0


In [78]:
nonscaled_voting_data = pd.concat([nonscaled_const,mask],axis=1)
nonscaled_voting_data[nonscaled_voting_data['index'].notnull()].head(20)

Unnamed: 0.1,Unnamed: 0,built_area,modul_price,weeks_duration,DETACHED,COLLECTIVE,COMMERCIAL,OTHERS,DELAYED,index,kn,lg,rf
0,0,404.575885,858.830343,34,1,0,0,0,1,0.0,0.0,1.0,1.0
24,24,128.46524,499.061978,16,1,0,0,0,1,24.0,0.0,1.0,1.0
26,26,53.713501,1210.618283,32,1,0,0,0,1,26.0,0.0,1.0,1.0
47,47,362.103873,730.090618,45,1,0,0,0,0,47.0,0.0,1.0,1.0
53,53,612.067189,1097.471158,53,0,1,0,0,1,53.0,0.0,1.0,1.0
56,56,326.157574,1466.504586,29,1,0,0,0,1,56.0,1.0,0.0,1.0
59,59,953.922208,677.046696,31,0,0,0,1,0,59.0,0.0,1.0,1.0
97,97,566.760119,1286.818434,20,0,0,0,1,0,97.0,1.0,0.0,1.0
99,99,156.277128,832.459239,18,1,0,0,0,1,99.0,0.0,1.0,1.0
121,121,172.121629,1589.806896,26,1,0,0,0,1,121.0,0.0,1.0,1.0
