## Modeling

In [1]:
import pandas as pd
import numpy as np
from sklearn import ensemble, preprocessing
import datetime as dt
from sklearn import model_selection
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score,confusion_matrix
%matplotlib inline

In [2]:
path = '../train_test_weather/train_merged.csv'
merged = pd.read_csv(path)

In [3]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 43 columns):
Unnamed: 0         10506 non-null int64
date               10506 non-null object
species            10506 non-null int64
block              10506 non-null int64
street             10506 non-null int64
trap               10506 non-null int64
latitude           10506 non-null float64
longitude          10506 non-null float64
addressaccuracy    10506 non-null int64
wnvpresent         10506 non-null int64
year               10506 non-null int64
month              10506 non-null int64
day                10506 non-null int64
tmax_x             10506 non-null int64
tmin_x             10506 non-null int64
tavg_x             10506 non-null float64
dewpoint_x         10506 non-null int64
wetbulb_x          10506 non-null float64
heat_x             10506 non-null float64
cool_x             10506 non-null float64
sunrise_x          10506 non-null object
sunset_x           10506 non-null obj

#### Scale the Data

In [4]:
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

features =merged.columns.drop(['date','trap','wnvpresent','sunrise_x','sunrise_y','sunset_x','sunset_y'])

# create a new df for each scaling procedure
merged_mm =  pd.DataFrame(MinMaxScaler().fit_transform(merged[features]),columns=features)
merged_r =  pd.DataFrame(StandardScaler().fit_transform(merged[features]),columns=features)
merged_s =  pd.DataFrame(RobustScaler().fit_transform(merged[features]),columns=features)

In [5]:
merged_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 36 columns):
Unnamed: 0         10506 non-null float64
species            10506 non-null float64
block              10506 non-null float64
street             10506 non-null float64
latitude           10506 non-null float64
longitude          10506 non-null float64
addressaccuracy    10506 non-null float64
year               10506 non-null float64
month              10506 non-null float64
day                10506 non-null float64
tmax_x             10506 non-null float64
tmin_x             10506 non-null float64
tavg_x             10506 non-null float64
dewpoint_x         10506 non-null float64
wetbulb_x          10506 non-null float64
heat_x             10506 non-null float64
cool_x             10506 non-null float64
preciptotal_x      10506 non-null float64
stnpressure_x      10506 non-null float64
sealevel_x         10506 non-null float64
resultspeed_x      10506 non-null float64
resultdir

#### Train-Test Split with SMOTE

In [6]:
X = merged_r[features]
y = merged.wnvpresent

In [7]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.3, stratify=y)

In [9]:
print(X_train.shape, X_test.shape, y_test.shape, y_train.shape)

(7354, 36) (3152, 36) (3152,) (7354,)


In [10]:
y_train.value_counts()

0    6968
1     386
Name: wnvpresent, dtype: int64

In [11]:
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)

In [12]:
y_train.value_counts()

1    6968
0    6968
Name: wnvpresent, dtype: int64

In [13]:
def score_model(model,X_test,y_test):
    preds = model.predict_proba(X_test)
    pred_list =[]
    
    for x in preds:
        pred_list.append(x[1])
        
    roc_score = roc_auc_score(y_test, pred_list)
    return roc_score

### 1. Random Forest

In [14]:
from sklearn.ensemble import RandomForestClassifier
#from sklearn.model_selection import GridSearchCV

In [15]:
#rfc=RandomForestClassifier(random_state=42)

In [16]:
#param_grid = { 
#    'n_estimators': [100,300,500],
#    'max_features': ['auto', 'none'],
#    'max_depth' : [4,5,6,7,8],
#    'criterion' :['gini', 'entropy']
#}

In [17]:
#CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
#CV_rfc.fit(X_train, y_train) bootstrap=True,min_samples_leaf= 1, min_samples_split= 2,

In [18]:
#CV_rfc.best_params_

In [19]:
RF = RandomForestClassifier(n_estimators = 500, max_depth=8,max_features='auto',criterion='gini')

In [20]:
RF_model= RF.fit(X_train, y_train)

In [21]:
preds=RF_model.predict(X_test)

In [22]:
accuracy_score(y_test,preds)

0.7848984771573604

In [23]:
score_model(RF_model,X_test,y_test)

0.8376266853334144

In [24]:
tn, fp, fn, tp = confusion_matrix(y_test,
                                  preds).ravel()
cm=confusion_matrix(y_test,preds)
cm_df=pd.DataFrame(cm,columns=['p_wnv','p_nownv'],
                   index=['a_wnv','a_nownv'])
cm_df

Unnamed: 0,p_wnv,p_nownv
a_wnv,2355,632
a_nownv,46,119


### 2.Support Vector Machine

In [25]:
from sklearn.svm import SVC

In [26]:
svmc= SVC(probability=True)

In [27]:
svm_model = svmc.fit(X_train,y_train)

In [28]:
score_model(svm_model,X_train,y_train)

0.9339918251023872

In [29]:
preds=svm_model.predict(X_test)

In [30]:
accuracy_score(y_test, preds)

0.7652284263959391

### 3.Logitic Regression

In [31]:
from sklearn.linear_model import LogisticRegression

In [32]:
y_train.value_counts()

1    6968
0    6968
Name: wnvpresent, dtype: int64

In [33]:
smt = SMOTE()
X_train, y_train = smt.fit_sample(X_train, y_train)

In [34]:
y_train.value_counts()

1    6968
0    6968
Name: wnvpresent, dtype: int64

In [35]:
logmodel = LogisticRegression()
logmodel.fit(X_train, y_train)
 
predictions = logmodel.predict(X_test)
print(classification_report(y_test, predictions))
print(confusion_matrix(y_test, predictions))
print(accuracy_score(y_test, predictions))

              precision    recall  f1-score   support

           0       0.99      0.68      0.81      2987
           1       0.13      0.82      0.22       165

    accuracy                           0.69      3152
   macro avg       0.56      0.75      0.51      3152
weighted avg       0.94      0.69      0.78      3152

[[2043  944]
 [  29  136]]
0.6913071065989848


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html.
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### 4.ADA Boost

In [36]:
from sklearn.ensemble import AdaBoostClassifier

In [37]:
adaboost= AdaBoostClassifier()
ada_model=adaboost.fit(X,y)
score_model(ada_model,X_test,y_test)

0.8746243824248512

### 5.Gradient Boost

In [38]:
from sklearn.ensemble import GradientBoostingClassifier

def GradBoostClass(X_train,y_train,X_test,y_test):
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)
    clf.score(X_test, y_test)   
    score_model(clf,X_test,y_test)

GradBoostClass(X_train,y_train,X_test,y_test)

In [39]:
gradboost = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,max_depth=1, random_state=0).fit(X_train, y_train)
gradboost.score(X_test, y_test)

0.8534263959390863

### 6.XG Boost

In [40]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier  
import scipy.stats as st

model = XGBClassifier()
xgm = model.fit(X_train, y_train, eval_metric=roc_auc_score)
score_model(xgm,X_test,y_test)

0.8397104625092573

## Import and Transform Test Data

In [41]:
path = '../train_test_weather/test_merged.csv'

In [42]:
test_w = pd.read_csv(path)

In [43]:
test_X =  pd.DataFrame(StandardScaler().fit_transform(test_w[features]),columns=features)

In [44]:
test_X.shape

(116293, 36)

## Exporting to test

In [45]:
import time 
import math
def model_and_export(model, model_name):
    pred_list = []
#     predictions = ["%.1f" % (math.ceil(x[1] * 100) / 100) for x in model.predict_proba(test_X)]
    predictions =  model.predict_proba(test_X)
    for x in predictions:
        pred_list.append(x[1])
    indexes=np.arange(1, len(predictions)+1, 1)
    preds_df = pd.DataFrame(data=[indexes, pred_list]).T
    preds_df.columns =['Id','WnvPresent']
    preds_df['Id'] = preds_df.Id.astype(int)
    location = '../submissions/{}_{}.csv'.format(model_name, time.strftime("%d_%m_%Y"))
    preds_df.to_csv(location, index=False)
    return

In [46]:
model_and_export(RF_model, 'RF_test')

In [47]:
model_and_export(svm_model, 'SVM')

In [48]:
model_and_export(logmodel, 'LR')
#0.74611, best score from kaggle test

In [49]:
model_and_export(ada_model, 'ADABoost_JD*')

In [50]:
model_and_export(gradboost, 'GRADBOOST_*')

In [51]:
model_and_export(xgm, 'XG_JD*')