In [36]:
import pandas as pd
import numpy as np
import seaborn as sns
import statistics
import matplotlib.pyplot as plt
import traceback
import re
import pandas.core.algorithms as algos
from pandas import Series
from datetime import datetime
from scipy import stats
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = pd.read_csv('FinalWNVData v.1')

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 69 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              10506 non-null  int64  
 1   Date                    10506 non-null  object 
 2   Address                 10506 non-null  object 
 3   Species                 10506 non-null  object 
 4   Block                   10506 non-null  int64  
 5   Street                  10506 non-null  object 
 6   Trap                    10506 non-null  object 
 7   AddressNumberAndStreet  10506 non-null  object 
 8   Latitude                10506 non-null  float64
 9   Longitude               10506 non-null  float64
 10  AddressAccuracy         10506 non-null  int64  
 11  NumMosquitos            10506 non-null  int64  
 12  WnvPresent              10506 non-null  int64  
 13  Tmax                    10506 non-null  float64
 14  Tmin                    10506 non-null

# Weight of Evidence/Information Value

In [5]:
#First let's split our data into training and testing sets.
X = df.drop(columns=['WnvPresent'])
y = df['WnvPresent']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

In [22]:
max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
            
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = algos.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    print(np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    for i in x:
        print(i)
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [23]:
final_iv, IV = data_vars(X_train,y_train)

Unnamed: 0
0   -0.145444
1    0.128637
dtype: float64
Date
Address
Species
Block
0   -0.010610
1    0.010781
dtype: float64
Street
Trap
AddressNumberAndStreet
Latitude
0   -0.172017
1    0.150968
dtype: float64
Longitude
0    0.171855
1   -0.207580
dtype: float64
AddressAccuracy
0   -0.030495
1    0.047348
dtype: float64
NumMosquitos
0   -2.019922
1   -0.819896
2    0.207651
3    0.969849
dtype: float64
Tmax
0    -1.070241
1         -inf
2    -0.277010
3     0.534074
4     0.866835
5     0.052734
6    -0.271733
7    -0.848649
8    -0.499696
9     0.387525
10         NaN
11    0.273494
12   -2.422580
13    0.616968
14   -0.304723
15    0.498741
dtype: float64
Tmin
0   -0.494275
1    0.372340
dtype: float64
Tavg
0   -0.092108
1    0.095455
dtype: float64
Depart
0   -0.317327
1    0.018193
2    0.281099
dtype: float64
DewPoint
0   -1.001869
1   -0.237601
2   -0.087141
3    0.249352
4    0.332664
5    0.341030
dtype: float64
WetBulb
0   -0.328114
1    0.271126
2        -inf
dtype: float64




Thursday
Tuesday
Wednesday
August
July
June
May
October
September
TimeSinceLastBR
0    0.035126
1    0.013693
2        -inf
dtype: float64
TimeSinceLastDZ
0    0.411571
1   -0.302471
2   -0.079292
dtype: float64
TimeSinceLastFG
0    0.253389
1    0.556856
2   -0.435779
dtype: float64
TimeSinceLastHZ
0    0.180928
1   -0.076325
2        -inf
dtype: float64
TimeSinceLastRA
0   -0.036068
1    0.099028
2        -inf
dtype: float64
TimeSinceLastTS
0    0.168506
1    0.021839
2   -2.110012
dtype: float64
TimeSinceLastTSRA
0   -0.055207
1    0.158735
2        -inf
dtype: float64
TimeSinceLastVCTS
0    0.105830
1   -0.114657
2   -0.007367
dtype: float64
Hyde Park Township
Jefferson Township
Lake Township
Lake View Township
Leyden Township
Norwood Park Township
Rogers Park Township
Stickney Township
Thornton Township
Worth Township


In [32]:
IV.sort_values(by='IV',ascending=False)

Unnamed: 0,VAR_NAME,IV
43,Sunset,1.130986
29,NumMosquitos,1.080304
10,Date,0.881463
42,Sunrise,0.870534
20,June,0.658177
...,...,...
26,May,0.000077
39,Stickney Township,0.000040
22,Lake View Township,0.000014
15,HZ,0.000012


In [33]:
features = list(IV[(IV['IV'] >= 0.01) & (IV['IV'] <= 0.8)]['VAR_NAME'])
X2 = X_train[features]
display(X2.shape)
X2.head()

(7354, 39)

Unnamed: 0.1,Address,AddressNumberAndStreet,August,AvgSpeed,BR,CodeSum,DZ,Depart,DewPoint,FG,...,TimeSinceLastTS,TimeSinceLastTSRA,Tmax,Tmin,Trap,Unnamed: 0,VCTS,Wednesday,WetBulb,Worth Township
9134,"4000 East 130th Street, Chicago, IL 60633, USA","4000 E 130TH ST, Chicago, IL",0,4.5,0,,0,-5.0,53.0,0,...,6.0,17.0,81.0,57.5,T221,9134,0,0,60.5,0
3096,"2100 North Lawler Avenue, Chicago, IL 60639, USA","2100 N LAWLER AVE, Chicago, IL",0,5.8,0,,0,-8.0,42.5,0,...,19.0,6.0,66.5,46.0,T154,3096,0,1,49.5,0
136,"4000 North Tripp Avenue, Chicago, IL 60641, USA","4000 N TRIPP AVE, Chicago, IL",0,7.6,1,TSRA BR HZ VCTS,0,10.0,69.0,0,...,28.0,0.0,91.5,71.5,T001,136,1,0,72.0,0
3551,"South Doty Avenue, Chicago, IL, USA","1200 S DOTY AVE, Chicago, IL",0,11.2,0,,0,18.0,62.5,0,...,31.0,18.0,90.5,69.5,T115,3551,0,0,68.5,0
3805,"East 91st Street, Chicago, IL, USA","8100 E 91ST ST, Chicago, IL",0,13.05,0,,0,8.0,44.0,0,...,14.0,14.0,76.5,49.5,T107,3805,0,0,53.0,0


In [47]:
IV[IV['VAR_NAME']=='Time']

Unnamed: 0,VAR_NAME,IV
49,Time,0.018906


In [41]:
X2.isna().sum()

Address                      0
AddressNumberAndStreet       0
August                       0
AvgSpeed                     0
BR                           0
CodeSum                      0
DZ                           0
Depart                       0
DewPoint                     0
FG                           0
Friday                       0
Heat                         0
Jefferson Township           0
July                         0
June                         0
Latitude                     0
Longitude                    0
Monday                       0
October                      0
PrecipTotal                  0
ResultSpeed                  0
Species                      0
Street                       0
TS                           0
Thursday                     0
Time                      7275
TimeSinceLastDZ           4582
TimeSinceLastFG           4271
TimeSinceLastHZ            429
TimeSinceLastTS            764
TimeSinceLastTSRA          372
Tmax                         0
Tmin    

In [37]:
def iterate_vif(df, vif_threshold=5, max_vif=6):
    count = 0
    while max_vif > vif_threshold:
        count += 1
        print("Iteration # "+str(count))
        vif = pd.DataFrame()
        vif["VIFactor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
        vif["features"] = df.columns
        
        if vif['VIFactor'].max() > vif_threshold:
            print('Removing %s with VIF of %f' % (vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], vif['VIFactor'].max()))
            df = df.drop(vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], axis=1)
            max_vif = vif['VIFactor'].max()
        else:
            print('Complete')
            return df, vif.sort_values('VIFactor')  

X1 = X2._get_numeric_data()
final_df, final_vif = iterate_vif(X1)

Iteration # 1


MissingDataError: exog contains inf or nans