In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from xgboost import  XGBClassifier
from sklearn.svm import SVC
import sklearn.metrics as metrics

%matplotlib inline

In [2]:
#loading combined dataset

data = pd.read_csv('../../data/combined_kev.csv', parse_dates=['date'])

In [3]:
#checking for null values
data.isnull().any().any()

False

In [4]:
#checking for duplicates
data.duplicated().any()

False

In [5]:
# convert sunrise & sunset to datetime
sunrise = pd.to_datetime(data['sunrise'], format='%H%M')

# fix entries - round off 60min to the next hour
sunset = pd.to_datetime(data['sunset'].map(lambda x: 1800 if x == 1760 else x), format='%H%M')

# convert to seconds from start of day to be consistent input into model
data['sunrise'] = (sunrise - sunrise.dt.normalize()).dt.seconds
data['sunset'] = (sunset - sunset.dt.normalize()).dt.seconds

# add total sunlight time feature
data['total_sunlight_time'] = (sunset - sunrise).dt.seconds

In [6]:
#checking finalised dataset before modelling
data.head()

Unnamed: 0,date,species,wnvpresent,trap,latitude,longitude,species_ord,tmax,tmin,tavg,...,rel_humid_lag14,rel_humid_lag28,avgspeed_lag5,avgspeed_lag14,avgspeed_lag28,preciptotal_lag5,preciptotal_lag14,preciptotal_lag28,week_number,total_sunlight_time
0,2007-05-29,CULEX PIPIENS,0,T096,41.731922,-87.677512,2.0,88,62,75,...,40.692805,38.885597,7.51,9.682143,10.2,0.15,0.069286,0.055357,22,53760
1,2007-05-29,CULEX PIPIENS/RESTUANS,0,T086,41.688324,-87.676709,2.0,88,62,75,...,40.692805,38.885597,7.51,9.682143,10.2,0.15,0.069286,0.055357,22,53760
2,2007-05-29,CULEX PIPIENS/RESTUANS,0,T048,41.867108,-87.654224,2.0,88,62,75,...,40.692805,38.885597,7.51,9.682143,10.2,0.15,0.069286,0.055357,22,53760
3,2007-05-29,CULEX PIPIENS/RESTUANS,0,T129,41.891126,-87.61156,2.0,88,62,75,...,40.692805,38.885597,7.51,9.682143,10.2,0.15,0.069286,0.055357,22,53760
4,2007-05-29,CULEX PIPIENS/RESTUANS,0,T050,41.919343,-87.694259,2.0,88,62,75,...,40.692805,38.885597,7.51,9.682143,10.2,0.15,0.069286,0.055357,22,53760


### Feature selection and Preprocessing Workflow
As we have decided to optimise for accuracy (ROC-AUC) instead of opting for more interpretability, we will use:

**Preprocessing**

1) Polynomial features (degree of 2) to increase the number of features. <br>
2) Train-test-split <br>
3) Standard Scaler <br>

**Feature Selection**

We will use Pearson's correlation with 4 different cutoff points for correlation (0, 0.01, 0.05, 0.1). This will help us gradually reduce the number of features until we find a feature set that gives us the best test ROC-AUC score.

### Preprocessing

In [7]:
#declaring X and y variables
X = data.drop(columns=['date', 'species', 'trap', 'wnvpresent'])
y = data['wnvpresent']

In [8]:
#polynomial features the independent variables
poly = PolynomialFeatures(include_bias=False, degree=2)
X_poly = poly.fit_transform(X)
X_poly.shape

(8475, 1274)

In [9]:
# making polynomial features into a dataframe
X_poly = pd.DataFrame(X_poly, columns=poly.get_feature_names(X.columns))

In [10]:
#train-test-split
X_train, X_test, y_train, y_test = train_test_split(X_poly,
                                                    y, 
                                                    random_state=42,
                                                    test_size=0.3, 
                                                    stratify=y)

In [11]:
#Standard scale the X_train and X_test variables
ss = StandardScaler()
X_train_sc = ss.fit_transform(X_train)
X_test_sc = ss.transform(X_test)

### Modelling

We will find the best parameters using GridSearch on these models:

1) **Support Vector Classifier**: Effective in high dimensional spaces. Uses a subset of training points in the decision function.<br>
2) **Logistic Regression**: Effective in binary classification. <br>
3) **Gradient boost**: Produces a prediction model in the form of an ensemble of weak prediction models, typically decision trees. Uses gradient descent algorithm.<br>
4) **Xg boost**: Similar to gradient boost, but xgboost uses a more regularized model formalization to control over-fitting.<br>
5) **Random Forest**: Constructs decision trees at training time and outputting the class that is the mode of the classes.<br> 
6) **Extra Trees**: Similar to Random Forest. However, the splits of the trees in the Random Forest are deterministic. It is random for extratrees. <br>

In [12]:
# parameters for logistic regression

logreg = LogisticRegression(
    solver='liblinear',
    max_iter=1000, 
    random_state=42,
)

logreg_params = {
    'clf__penalty': ['l1', 'l2'],
    'clf__C': [0.1, 1, 1.5, 2.5]
}

In [13]:
# Gradient Boost
gb = GradientBoostingClassifier(random_state=42)

gb_params = {
    'clf__learning_rate': [0.05, 0.1],
    'clf__max_depth': [2, 3]
}

In [14]:
# Support Vector Machine
svc = SVC(probability=True, random_state=42)

svc_params = {
    'clf__kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'clf__C': [2, 5, 10]
}

In [15]:
# Xg boost

xgb = XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='auc', random_state=42)

xgb_params = {
    'clf__max_depth': [3, 4, 5],
    'clf__gamma' : [ 0.0, 0.2 , 0.4 ],
    'clf__eta' : [0.05, 0.15, 0.25]
}


In [16]:
#random forest

rf = RandomForestClassifier(random_state=42, class_weight='balanced')


rf_params = {
    'clf__max_depth': [3, 4, 5],
    'clf__n_estimators': [80, 100, 150],
    'clf__min_samples_split': [2, 4]
}

In [17]:
#extra trees
et = ExtraTreesClassifier(random_state=42, class_weight='balanced')

et_params = {
    'clf__max_depth': [3, 4, 5],
    'clf__n_estimators': [80, 100, 150],
    'clf__min_samples_split': [2, 4]
}


In [18]:
# Sorting by correlation to y
corr_df = X_poly.join(y).corr()[['wnvpresent']].abs().sort_values(by='wnvpresent', ascending=False)

# Instantiate result list
results = []

#create a function to gridsearch all models

def model(clf, clf_params, cutoff):
    
    # Feature list with correlation to y > cutoff value
    features = corr_df[corr_df['wnvpresent'] > cutoff].index[1:]
    
    # Instantiate pipeline
    pipe = Pipeline([
        ('ss', StandardScaler()),
        ('clf', clf)
    ])
    
    # Gridsearch for best estimator
    grid = GridSearchCV(
        pipe,
        param_grid=clf_params,
        scoring='roc_auc',
        verbose=2,
        n_jobs=-1
    )

    grid.fit(X_train[features], y_train)
    
    print(f'Classifier: {clf}, Cutoff value: {cutoff}')
    print('Best Parameters:')
    print(grid.best_params_)
    
    # Scoring metrics
    scores = {'Classifier': clf, 'Cutoff': cutoff}
    y_preds = grid.predict(X_test[features])
    y_pred_probas = grid.predict_proba(X_test[features])[:, 1]
    y_train_pred_probas = grid.predict_proba(X_train[features])[:, 1]
    scores['Train ROC-AUC Score'] = metrics.roc_auc_score(y_train, y_train_pred_probas)
    scores['Test ROC-AUC Score'] = metrics.roc_auc_score(y_test, y_pred_probas)
    scores['F1'] = metrics.f1_score(y_test, y_preds)
    scores['Precision'] = metrics.precision_score(y_test, y_preds)
    scores['Recall'] = metrics.recall_score(y_test, y_preds)
    scores['Accuracy'] = metrics.accuracy_score(y_test, y_preds)
    
    # Storing results
    results.append(scores)
    
    return pipe

In [19]:
#create list of classifiers and params to fit into function

classifiers = [
    (logreg, logreg_params),
    (gb, gb_params),
    (svc, svc_params),
    (xgb, xgb_params),
    (rf, rf_params),
    (et, et_params),
]

In [20]:
#iterate function through each cutoff point

for cutoff in [0, 0.01, 0.05, 0.1]:
    for (clf, clf_params) in classifiers:
        model(clf, clf_params, cutoff)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
Classifier: LogisticRegression(max_iter=1000, random_state=42, solver='liblinear'), Cutoff value: 0
Best Parameters:
{'clf__C': 1.5, 'clf__penalty': 'l1'}
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Classifier: GradientBoostingClassifier(random_state=42), Cutoff value: 0
Best Parameters:
{'clf__learning_rate': 0.1, 'clf__max_depth': 2}
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Classifier: SVC(probability=True, random_state=42), Cutoff value: 0
Best Parameters:
{'clf__C': 5, 'clf__kernel': 'linear'}
Fitting 5 folds for each of 27 candidates, totalling 135 fits
Classifier: XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, eval_metric='auc',
              gamma=None, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=None,
              max_delta_step=None, max_depth=None, m

  _warn_prf(average, modifier, msg_start, len(result))


Classifier: GradientBoostingClassifier(random_state=42), Cutoff value: 0.1
Best Parameters:
{'clf__learning_rate': 0.1, 'clf__max_depth': 3}
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Classifier: SVC(probability=True, random_state=42), Cutoff value: 0.1
Best Parameters:
{'clf__C': 5, 'clf__kernel': 'rbf'}


  _warn_prf(average, modifier, msg_start, len(result))


Fitting 5 folds for each of 27 candidates, totalling 135 fits
Classifier: XGBClassifier(base_score=None, booster=None, colsample_bylevel=None,
              colsample_bynode=None, colsample_bytree=None, eval_metric='auc',
              gamma=None, gpu_id=None, importance_type='gain',
              interaction_constraints=None, learning_rate=None,
              max_delta_step=None, max_depth=None, min_child_weight=None,
              missing=nan, monotone_constraints=None, n_estimators=100,
              n_jobs=None, num_parallel_tree=None, random_state=42,
              reg_alpha=None, reg_lambda=None, scale_pos_weight=None,
              subsample=None, tree_method=None, use_label_encoder=False,
              validate_parameters=None, verbosity=None), Cutoff value: 0.1
Best Parameters:
{'clf__eta': 0.15, 'clf__gamma': 0.2, 'clf__max_depth': 3}
Fitting 5 folds for each of 18 candidates, totalling 90 fits
Classifier: RandomForestClassifier(class_weight='balanced', random_state=42), Cuto

In [21]:
#display results by Test ROC-AUC score

pd.DataFrame(results).sort_values(by='Test ROC-AUC Score', ascending=False).reset_index(drop=True)

Unnamed: 0,Classifier,Cutoff,Train ROC-AUC Score,Test ROC-AUC Score,F1,Precision,Recall,Accuracy
0,"XGBClassifier(base_score=None, booster=None, c...",0.01,0.93952,0.85659,0.121622,0.818182,0.065693,0.948879
1,GradientBoostingClassifier(random_state=42),0.05,0.884891,0.855815,0.040541,0.272727,0.021898,0.94416
2,"LogisticRegression(max_iter=1000, random_state...",0.01,0.880634,0.854353,0.041379,0.375,0.021898,0.94534
3,"LogisticRegression(max_iter=1000, random_state...",0.0,0.878991,0.854213,0.041667,0.428571,0.021898,0.945733
4,GradientBoostingClassifier(random_state=42),0.01,0.889833,0.852942,0.054054,0.363636,0.029197,0.944947
5,GradientBoostingClassifier(random_state=42),0.0,0.891547,0.851727,0.066667,0.384615,0.036496,0.944947
6,"XGBClassifier(base_score=None, booster=None, c...",0.0,0.939217,0.851315,0.123288,1.0,0.065693,0.949666
7,"XGBClassifier(base_score=None, booster=None, c...",0.05,0.935214,0.850708,0.122449,0.9,0.065693,0.949273
8,"LogisticRegression(max_iter=1000, random_state...",0.05,0.875913,0.84935,0.013986,0.166667,0.007299,0.944554
9,"XGBClassifier(base_score=None, booster=None, c...",0.1,0.92868,0.845004,0.144737,0.733333,0.080292,0.948879


We see that in terms of test ROC-AUC score, the best model is xgboost classifier at a correlation cutoff of 0.01 with the following best parameters:
{'clf__eta': 0.15, 'clf__gamma': 0.0, 'clf__max_depth': 3}. 

The next best model in terms of test ROC-AUC score would be gradient boosting classifier at a correlation cutoff of 0.05 with the following best parameters:
{'clf__learning_rate': 0.1, 'clf__max_depth': 2}.

We will now proceed to save our top 2 models to file in case we need to use them in our evaluation later. 

### Saving top 2 best models from feature selection using correlation

In [34]:
best_xgb_model = XGBClassifier(use_label_encoder=False, objective='binary:logistic', eval_metric='auc', random_state=42, eta=0.15, gamma=0.0, max_depth=3)

In [37]:
# Save the model to file

Pkl_Filename = "best_xgb_model_corr.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(best_xgb_model, file)

In [38]:
best_gb_model = GradientBoostingClassifier(learning_rate=0.1, max_depth=2, random_state=42)

In [39]:
# Save the model to file

Pkl_Filename = "best_gb_model_corr.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(best_gb_model, file)