## Import

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import (KFold, cross_val_score, train_test_split, cross_validate)
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import (Pipeline, make_pipeline)
from sklearn import preprocessing
from sklearn import linear_model
from sklearn import datasets
from sklearn.model_selection import cross_val_score
import time
from sklearn import preprocessing
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import (LinearRegression, Ridge, Lasso,LogisticRegression, LassoCV)

## Data preprocessing

In [2]:
# Dataset downloaded from https://www.kaggle.com/jsphyg/weather-dataset-rattle-package
train = pd.read_csv('./weatherAUS.csv')

# Dataset looks like this
print(train.head(5))
print(train.shape)

# Target is whether it will rain tomorrow (column RainTomorrow).

# The dataset page linked above recommends excluding column RISK_MM.
# The goal is to predict rain tomorrow anywhere in Australia, so the specific
# city is not used as a feature: drop column Location.
# Date only acts as a sample index in this example, so drop column Date too.
train = train.drop(['Location', 'RISK_MM', 'Date'], axis=1)

# Sunshine, Evaporation, Cloud3pm and Cloud9am are NaN in too many samples.
# Drop these four columns so that as many samples as possible survive the
# NaN filtering below.
print(train.isnull().sum().sort_values(ascending=False))
train = train.drop(['Sunshine', 'Evaporation', 'Cloud3pm', 'Cloud9am'], axis=1)

# Delete every sample that has NaN in any remaining column.
# The explicit copy makes the column assignment below operate on an
# independent frame rather than on the result of a filtering operation.
train = train.dropna().copy()

# Encode RainToday as an integer flag (No -> 0, Yes -> 1).
train['RainTodayBool'] = train['RainToday'].map({'No': 0, 'Yes': 1}).astype(int)

# RainTomorrow is our target, encoded the same way.
y = train['RainTomorrow'].map({'No': 0, 'Yes': 1}).astype(int)
train = train.drop(['RainTomorrow', 'RainToday'], axis=1)

# One-hot encode the three categorical wind-direction columns.
train = pd.get_dummies(train, columns=['WindGustDir', 'WindDir9am', 'WindDir3pm'])

# Since some columns have relatively huge values, scale all columns to [0,1] range.
# NOTE(review): the scaler is fitted on every sample here, before the
# cross-validation splits are made in later cells, so the held-out folds
# contribute to the min/max used for scaling. Consider scaling inside a Pipeline.
scaler = preprocessing.MinMaxScaler()
train = pd.DataFrame(scaler.fit_transform(train), index=train.index, columns=train.columns)

# Keep a second reference to the prepared training set for later usage.
ready = train
print(ready.shape)

         Date Location  MinTemp  MaxTemp  Rainfall  Evaporation  Sunshine  \
0  2008-12-01   Albury     13.4     22.9       0.6          NaN       NaN   
1  2008-12-02   Albury      7.4     25.1       0.0          NaN       NaN   
2  2008-12-03   Albury     12.9     25.7       0.0          NaN       NaN   
3  2008-12-04   Albury      9.2     28.0       0.0          NaN       NaN   
4  2008-12-05   Albury     17.5     32.3       1.0          NaN       NaN   

  WindGustDir  WindGustSpeed WindDir9am  ... Humidity3pm  Pressure9am  \
0           W           44.0          W  ...        22.0       1007.7   
1         WNW           44.0        NNW  ...        25.0       1010.6   
2         WSW           46.0          W  ...        30.0       1007.6   
3          NE           24.0         SE  ...        16.0       1017.6   
4           W           41.0        ENE  ...        33.0       1010.8   

   Pressure3pm  Cloud9am  Cloud3pm  Temp9am  Temp3pm  RainToday  RISK_MM  \
0       1007.1       8

## Find proper max_depth

In [21]:
## Search for a good max_depth (candidates 1..6).
## The learning rate stays at its default (0.1) for now; it is tuned later.

for depth in range(1, 7):
    started = time.time()
    gbrt = GradientBoostingClassifier(max_depth=depth, random_state=0)
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    elapsed = time.time() - started
    print('max_depth = ', depth, 'Time taken : ', elapsed)
    print('Mean : ', scores.mean(), ' | Max : ', scores.max(),
          ' | Min : ', scores.min(), ' | Std : ', scores.std())


max_depth =  1 Time taken :  46.6064088344574
Mean :  0.8443748290340446  | Max :  0.8541316092463024  | Min :  0.8337908438855929  | Std :  0.006443277139344468
max_depth =  2 Time taken :  108.06675839424133
Mean :  0.8469252641257705  | Max :  0.8630889125044279  | Min :  0.8320198352962012  | Std :  0.009421325282729232
max_depth =  3 Time taken :  193.5566053390503
Mean :  0.8479613629672512  | Max :  0.8662770102727595  | Min :  0.8339679447445321  | Std :  0.009591014223238216
max_depth =  4 Time taken :  315.1361620426178
Mean :  0.8499715557443455  | Max :  0.8681367339709529  | Min :  0.8360046046223324  | Std :  0.009876666623623097
max_depth =  5 Time taken :  485.64620900154114
Mean :  0.8495110731218173  | Max :  0.8696422245837762  | Min :  0.8362702559107412  | Std :  0.010319239625865995
max_depth =  6 Time taken :  732.0262486934662
Mean :  0.8496438862144409  | Max :  0.8727417640807651  | Min :  0.8340564951740016  | Std :  0.011245182528709251


## Find proper min_samples_split

In [None]:
## Search for a good min_samples_split.
## The learning rate stays at its default (0.1) for now; it is tuned later.
## max_depth is fixed to 4, the value with the best mean cross-validation score.

min_samples_splits = [2, 4, 8, 16, 32]
for split_size in min_samples_splits:
    started = time.time()
    gbrt = GradientBoostingClassifier(max_depth=4, min_samples_split=split_size,
                                      random_state=0)
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    elapsed = time.time() - started
    print('min_samples_split = ', split_size, 'Time taken : ', elapsed)
    print('Mean : ', scores.mean(), ' | Max : ', scores.max(),
          ' | Min : ', scores.min(), ' | Std : ', scores.std())

## Find proper min_samples_leaf

In [None]:
## Search for a good min_samples_leaf.
## The learning rate stays at its default (0.1) for now; it is tuned later.
## max_depth=4 and min_samples_split=8 are kept fixed, as selected by the
## earlier searches.

min_samples_leaves = [1, 2, 3, 4, 5]
for leaf_size in min_samples_leaves:
    started = time.time()
    gbrt = GradientBoostingClassifier(max_depth=4, min_samples_split=8,
                                      min_samples_leaf=leaf_size, random_state=0)
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    elapsed = time.time() - started
    print('min_samples_leaf = ', leaf_size, 'Time taken : ', elapsed)
    print('Mean : ', scores.mean(), ' | Max : ', scores.max(),
          ' | Min : ', scores.min(), ' | Std : ', scores.std())

## Find proper max_features

In [3]:
## Select proper max_features.
## I used default learning rate 0.1 this time, but I will change it later.
## I also used max_depth=4, min_samples_split=8, min_samples_leaf=1
## because these maximize cross_validation score.

# Candidates must be the None object / integers, not their string spellings:
# the only strings max_features accepts are named strategies such as 'sqrt'
# and 'log2'. 'auto' is omitted because for a classifier it is an alias of
# 'sqrt' and newer scikit-learn releases no longer accept it.
max_features_num = [None, 7, 8, 9, 'sqrt', 'log2']
for i in max_features_num:
    t0 = time.time()
    gbrt = GradientBoostingClassifier(random_state=0, max_depth=4, min_samples_split=8,
                                      min_samples_leaf=1, max_features=i)
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    print('max_features = ', i, 'Time taken : ', time.time() - t0)
    print('Mean : ', np.mean(scores), ' | Max : ', np.max(scores), ' | Min : ', np.min(scores), ' | Std : ', np.std(scores))

max_features =  sqrt Time taken :  56.50385499000549
Mean :  0.8467923977011876  | Max :  0.8662770102727595  | Min :  0.832905339590897  | Std :  0.009545501647487839
max_features =  log2 Time taken :  45.690943479537964
Mean :  0.8451098571999326  | Max :  0.8629117959617428  | Min :  0.8328167891614274  | Std :  0.00889301742825981


In [8]:
## Search over subsample to decide whether stochastic gradient boosting helps.
## The learning rate stays at its default (0.1) for now; it is tuned later.
## max_depth=4, min_samples_split=8 and min_samples_leaf=1 are kept fixed;
## max_features is not passed, so the estimator's default is used.

subsample = [1, 0.95, 0.9]
for fraction in subsample:
    started = time.time()
    gbrt = GradientBoostingClassifier(
        max_depth=4,
        min_samples_split=8,
        min_samples_leaf=1,
        subsample=fraction,
        random_state=0,
    )
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    elapsed = time.time() - started
    print('subsample = ', fraction, 'Time taken : ', elapsed)
    print('Mean : ', scores.mean(), ' | Max : ', scores.max(),
          ' | Min : ', scores.min(), ' | Std : ', scores.std())

subsample =  0.9 Time taken :  320.0865890979767
Mean :  0.849519942280691  | Max :  0.8686680835990082  | Min :  0.8369786593464978  | Std :  0.00979312844167559


In [3]:
## Select proper learning_rate together with a matching n_estimators.
## Each smaller learning rate is paired with proportionally more trees, so that
## learning_rate * n_estimators is the same (10) for every candidate.
## max_depth=4, min_samples_split=8 and min_samples_leaf=1 are kept fixed.

learning_rates = [0.05, 0.08, 0.1]
estimators = [200, 125, 100]
for rate, n_trees in zip(learning_rates, estimators):
    t0 = time.time()
    gbrt = GradientBoostingClassifier(random_state=0, max_depth=4, min_samples_split=8,
                                      min_samples_leaf=1, learning_rate=rate, n_estimators=n_trees)
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    # Report the learning rate itself, not its position in the list.
    print('learning_rate = ', rate, 'Time taken : ', time.time() - t0)
    print('Mean : ', np.mean(scores), ' | Max : ', np.max(scores), ' | Min : ', np.min(scores), ' | Std : ', np.std(scores))

learning_rate =  0.05 Time taken :  634.7777450084686
Mean :  0.8494225030862761  | Max :  0.8676939426142402  | Min :  0.8362702559107412  | Std :  0.009618764510497075


In [13]:
## Evaluate a single configuration: learning_rate=0.08 with n_estimators=125.
## max_depth=4, min_samples_split=8 and min_samples_leaf=1 are kept fixed;
## max_features is not passed, so the estimator's default is used.

learning_rates = [0.08]
for rate in learning_rates:
    started = time.time()
    gbrt = GradientBoostingClassifier(
        max_depth=4,
        min_samples_split=8,
        min_samples_leaf=1,
        learning_rate=rate,
        n_estimators=125,
        random_state=0,
    )
    scores = cross_val_score(gbrt, train, y, cv=10)  # 10-fold cross-validation
    elapsed = time.time() - started
    print('learning_rate = ', rate, 'Time taken : ', elapsed)
    print('Mean : ', scores.mean(), ' | Max : ', scores.max(),
          ' | Min : ', scores.min(), ' | Std : ', scores.std())

learning_rate =  0.08 Time taken :  388.978951215744
Mean :  0.8496350272533357  | Max :  0.8676053843428976  | Min :  0.8359160541928629  | Std :  0.009674743505402346


In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Compare a few baseline classifiers on the same features/target using the
# same 10-fold cross-validation protocol as the gradient boosting searches.
ready = train
models = []
models_name = []


def create_models(alg, name):
    """Register an estimator and its display name for the comparison loop.

    Parameters
    ----------
    alg : estimator object
        Appended to the module-level ``models`` list.
    name : str
        Label printed next to the scores; appended to ``models_name``.
    """
    models.append(alg)
    models_name.append(name)


create_models(LogisticRegression(random_state=0, max_iter=1000, solver="liblinear"), 'Logistic Regression')
create_models(DecisionTreeClassifier(random_state=0, max_depth=4), 'Decision Tree')
# NOTE(review): max_features=1 lets each split consider only one feature.
# Confirm this is intended; it is a very restrictive setting for a forest of
# 10 shallow trees.
create_models(RandomForestClassifier(random_state=0, max_depth=4, n_estimators=10, max_features=1), 'Random Forest')
# Seeded like every other model so the comparison is reproducible.
create_models(AdaBoostClassifier(random_state=0), 'AdaBoost')

for name, model in zip(models_name, models):
    t0 = time.time()
    scores = cross_val_score(model, train, y, cv=10)
    print(name, 'Time taken : ', time.time() - t0)
    print('Mean : ', np.mean(scores), ' | Max : ', np.max(scores), ' | Min : ', np.min(scores), ' | Std : ', np.std(scores))

Logistic Regression Time taken :  8.855034351348877
Mean :  0.8467394235042788  | Max :  0.8672511512575275  | Min :  0.817851766581068  | Std :  0.013450573858038795
Decision Tree Time taken :  3.3853774070739746
Mean :  0.8381937769367948  | Max :  0.853600212558675  | Min :  0.8207739307535642  | Std :  0.010725376318694215
Random Forest Time taken :  1.6386339664459229
Mean :  0.7784458715479505  | Max :  0.7784961473740147  | Min :  0.7784272051009564  | Std :  1.891673118456956e-05
AdaBoost Time taken :  58.5219464302063
Mean :  0.8431263511068219  | Max :  0.8633545873184555  | Min :  0.8223678384840166  | Std :  0.01203407899902826
