In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import statistics
import matplotlib.pyplot as plt
import traceback
import re
import pandas.core.algorithms as algos
from pandas import Series
from datetime import datetime
from scipy import stats
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

df = pd.read_csv('FinalWNVData v.1')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10506 entries, 0 to 10505
Data columns (total 69 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Unnamed: 0              10506 non-null  int64  
 1   Date                    10506 non-null  object 
 2   Address                 10506 non-null  object 
 3   Species                 10506 non-null  object 
 4   Block                   10506 non-null  int64  
 5   Street                  10506 non-null  object 
 6   Trap                    10506 non-null  object 
 7   AddressNumberAndStreet  10506 non-null  object 
 8   Latitude                10506 non-null  float64
 9   Longitude               10506 non-null  float64
 10  AddressAccuracy         10506 non-null  int64  
 11  NumMosquitos            10506 non-null  int64  
 12  WnvPresent              10506 non-null  int64  
 13  Tmax                    10506 non-null  float64
 14  Tmin                    10506 non-null

# Weight of Evidence/Information Value

In [21]:
df.isna().sum()

Unnamed: 0               0
Date                     0
Address                  0
Species                  0
Block                    0
                        ..
Norwood Park Township    0
Rogers Park Township     0
Stickney Township        0
Thornton Township        0
Worth Township           0
Length: 69, dtype: int64

In [4]:
#First let's split our data into training and testing sets.
X = df.drop(columns=['WnvPresent'])
y = df['WnvPresent']
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3,random_state = 42)

In [7]:
max_bin = 20
force_bin = 3

# define a binning function
def mono_bin(Y, X, n = max_bin):    
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]
    r = 0
    while np.abs(r) < 1:
        try:
            d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.qcut(notmiss.X, n)})
            d2 = d1.groupby('Bucket', as_index=True)
            r, p = stats.spearmanr(d2.mean().X, d2.mean().Y)
            n = n - 1 
            
        except Exception as e:
            n = n - 1

    if len(d2) == 1:
        n = force_bin         
        bins = np.quantile(notmiss.X, np.linspace(0, 1, n))
        if len(np.unique(bins)) == 2:
            bins = np.insert(bins, 0, 1)
            bins[1] = bins[1]-(bins[1]/2)
        d1 = pd.DataFrame({"X": notmiss.X, "Y": notmiss.Y, "Bucket": pd.cut(notmiss.X, np.unique(bins),include_lowest=True)}) 
        d2 = d1.groupby('Bucket', as_index=True)
    
    d3 = pd.DataFrame({},index=[])
    d3["MIN_VALUE"] = d2.min().X
    d3["MAX_VALUE"] = d2.max().X
    d3["COUNT"] = d2.count().Y
    d3["EVENT"] = d2.sum().Y
    d3["NONEVENT"] = d2.count().Y - d2.sum().Y
    d3=d3.reset_index(drop=True)
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    print(np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT))
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]       
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    
    return(d3)

def char_bin(Y, X):
        
    df1 = pd.DataFrame({"X": X, "Y": Y})
    justmiss = df1[['X','Y']][df1.X.isnull()]
    notmiss = df1[['X','Y']][df1.X.notnull()]    
    df2 = notmiss.groupby('X',as_index=True)
    d3 = pd.DataFrame({},index=[])
    d3["COUNT"] = df2.count().Y
    d3["MIN_VALUE"] = df2.sum().Y.index
    d3["MAX_VALUE"] = d3["MIN_VALUE"]
    d3["EVENT"] = df2.sum().Y
    d3["NONEVENT"] = df2.count().Y - df2.sum().Y
    
    if len(justmiss.index) > 0:
        d4 = pd.DataFrame({'MIN_VALUE':np.nan},index=[0])
        d4["MAX_VALUE"] = np.nan
        d4["COUNT"] = justmiss.count().Y
        d4["EVENT"] = justmiss.sum().Y
        d4["NONEVENT"] = justmiss.count().Y - justmiss.sum().Y
        d3 = d3.append(d4,ignore_index=True)
    
    d3["EVENT_RATE"] = d3.EVENT/d3.COUNT
    d3["NON_EVENT_RATE"] = d3.NONEVENT/d3.COUNT
    d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
    d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
    d3["WOE"] = np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["IV"] = (d3.DIST_EVENT-d3.DIST_NON_EVENT)*np.log(d3.DIST_EVENT/d3.DIST_NON_EVENT)
    d3["VAR_NAME"] = "VAR"
    d3 = d3[['VAR_NAME','MIN_VALUE', 'MAX_VALUE', 'COUNT', 'EVENT', 'EVENT_RATE', 'NONEVENT', 'NON_EVENT_RATE', 'DIST_EVENT','DIST_NON_EVENT','WOE', 'IV']]      
    d3 = d3.replace([np.inf, -np.inf], 0)
    d3.IV = d3.IV.sum()
    d3 = d3.reset_index(drop=True)
    
    return(d3)

def data_vars(df1, target):
    
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]
    
    x = df1.dtypes.index
    count = -1
    for i in x:
        print(i)
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i            
                count = count + 1
                
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv,ignore_index=True)
    
    iv = pd.DataFrame({'IV':iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return(iv_df,iv)

In [8]:
final_iv, IV = data_vars(X_train,y_train)

Unnamed: 0
0   -0.149394
1    0.131744
dtype: float64
Date
Address
Species
Block
0   -0.017225
1    0.017073
dtype: float64
Street


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_

Trap
AddressNumberAndStreet
Latitude
0     0.230493
1    -1.257677
2    -0.035210
3    -0.035210
4     0.134824
5    -0.488407
6    -0.438769
7     0.225982
8    -0.058275
9    -0.808400
10   -0.364856
11   -0.969995
12   -0.195059
13   -0.080113
14    0.192707
15    0.114064
16    0.637201
17         NaN
18    0.271315
19    0.096962
dtype: float64
Longitude
0    0.192852
1   -0.233564
dtype: float64
AddressAccuracy
0    0.001668
1   -0.002725
dtype: float64
NumMosquitos
0   -2.097645
1   -0.890403
2    0.180770
3    1.006982
dtype: float64
Tmax


  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)


0   -0.005326
1    0.005612
dtype: float64
Tmin
0   -0.379902
1    0.287091
dtype: float64
Tavg
0   -0.093865
1    0.089392
dtype: float64
Depart
0   -0.342075
1    0.112600
2    0.186850
dtype: float64
DewPoint


  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)


0   -0.522542
1   -0.206358
2    0.494512
dtype: float64
WetBulb
0   -0.240654
1    0.213435
2        -inf
dtype: float64
Heat
0    0.116984
1   -2.076142
dtype: float64
Cool
0   -0.093865
1    0.089392
dtype: float64
Sunrise
Sunset
CodeSum
SnowFall


  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)


PrecipTotal
0    0.025189
1   -2.356290
dtype: float64
StnPressure
0    0.050265
1   -0.034340
2        -inf
dtype: float64
SeaLevel
0    0.051397
1   -0.062065
dtype: float64


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)


ResultSpeed
0    0.166230
1   -0.073762
2   -0.128787
dtype: float64
ResultDir
0   -0.059085
1    0.065904
dtype: float64
AvgSpeed


  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  d3["DIST_EVENT"] = d3.EVENT/d3.sum().EVENT
  d3["DIST_NON_EVENT"] = d3.NONEVENT/d3.sum().NONEVENT
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)


0    0.074931
1   -0.090975
dtype: float64
BR
DZ
FG
HZ
RA
TS
TSRA
VCTS
Time
Friday
Monday
Thursday
Tuesday
Wednesday
August
July


  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)


June
May
October
September
TimeSinceLastBR
0    0.022251
1    0.030198
2        -inf
dtype: float64
TimeSinceLastDZ
0    0.357762
1   -0.226685
2   -0.073044
dtype: float64
TimeSinceLastFG
0    0.252176
1    0.584022
2   -0.466688
dtype: float64


  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)


TimeSinceLastHZ
0    0.174568
1   -0.064401
2        -inf
dtype: float64
TimeSinceLastRA
0   -0.017248
1    0.067783
2        -inf
dtype: float64
TimeSinceLastTS
0    0.203201
1   -0.020947
2   -2.328418
dtype: float64
TimeSinceLastTSRA
0   -0.028670
1    0.138447
2        -inf
dtype: float64
TimeSinceLastVCTS
0    0.058096
1   -0.138638
2    0.036102
dtype: float64
Hyde Park Township
Jefferson Township
Lake Township
Lake View Township
Leyden Township
Norwood Park Township
Rogers Park Township
Stickney Township
Thornton Township
Worth Township


  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  d3 = d3.append(d4,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  iv_df = iv_df.append(conv,ignore_index=True)
  iv_df = iv_df.append(conv,ignore_index=True)


In [17]:
IV['IV'] = IV['IV'].apply(lambda x: '%.5f' % x)
IV.sort_values(by='IV',ascending=False)

Unnamed: 0,VAR_NAME,IV
29,NumMosquitos,1.16296
43,Sunset,0.86734
10,Date,0.86313
42,Sunrise,0.75105
3,August,0.46158
...,...,...
58,Tmax,0.00003
21,Lake Township,0.00001
47,Thornton Township,0.00000
1,AddressAccuracy,0.00000


In [19]:
IV[IV['VAR_NAME']=='Time']

Unnamed: 0,VAR_NAME,IV
49,Time,0.02213


In [10]:
features = list(IV[(IV['IV'] >= 0.01) & (IV['IV'] <= 0.8)]['VAR_NAME'])
X2 = X_train[features]
display(X2.shape)
X2.head()

(7354, 35)

Unnamed: 0.1,Address,AddressNumberAndStreet,August,BR,CodeSum,DZ,Depart,DewPoint,FG,Friday,...,Time,TimeSinceLastDZ,TimeSinceLastFG,TimeSinceLastHZ,TimeSinceLastTS,Tmin,Trap,Unnamed: 0,VCTS,WetBulb
4115,"1000 North Central Park Avenue, Chicago, IL 60...","1000 N CENTRAL PARK DR, Chicago, IL",0,0,RA,0,-6.0,49.0,0,1,...,,,,,,49.5,T030,4115,0,55.0
9004,"1700 West Addison Street, Chicago, IL 60613, USA","1700 W ADDISON ST, Chicago, IL",0,1,TS BR,0,10.0,69.5,0,1,...,,,,22.0,0.0,73.0,T224,9004,0,75.0
3334,"4700 South Cornell Avenue, Chicago, IL 60615, USA","4700 S CORNELL AVE, Chicago, IL",0,0,,0,12.0,59.0,0,0,...,,42.0,113.0,13.0,26.0,66.5,T075,3334,0,65.0
8333,"4200 West 127th Street, Alsip, IL 60803, USA","4200 W 127TH PL, Chicago, IL",0,1,RA BR,0,5.0,64.0,0,1,...,,,,,,69.0,T135,8333,0,67.5
9021,"1700 North Ashland Avenue, Chicago, IL 60622, USA","1700 N ASHLAND AVE, Chicago, IL",0,1,TS BR,0,10.0,69.5,0,1,...,,,,22.0,0.0,73.0,T232,9021,0,75.0


In [18]:
X2.isna().sum()

Address                      0
AddressNumberAndStreet       0
August                       0
BR                           0
CodeSum                      0
DZ                           0
Depart                       0
DewPoint                     0
FG                           0
Friday                       0
Heat                         0
Jefferson Township           0
July                         0
June                         0
Latitude                     0
Longitude                    0
Monday                       0
October                      0
PrecipTotal                  0
ResultSpeed                  0
Species                      0
Street                       0
Sunrise                      0
TS                           0
Thursday                     0
Time                      7274
TimeSinceLastDZ           4545
TimeSinceLastFG           4241
TimeSinceLastHZ            431
TimeSinceLastTS            747
Tmin                         0
Trap                         0
Unnamed:

In [20]:
X2

Unnamed: 0.1,Address,AddressNumberAndStreet,August,BR,CodeSum,DZ,Depart,DewPoint,FG,Friday,...,Time,TimeSinceLastDZ,TimeSinceLastFG,TimeSinceLastHZ,TimeSinceLastTS,Tmin,Trap,Unnamed: 0,VCTS,WetBulb
4115,"1000 North Central Park Avenue, Chicago, IL 60...","1000 N CENTRAL PARK DR, Chicago, IL",0,0,RA,0,-6.0,49.0,0,1,...,,,,,,49.5,T030,4115,0,55.0
9004,"1700 West Addison Street, Chicago, IL 60613, USA","1700 W ADDISON ST, Chicago, IL",0,1,TS BR,0,10.0,69.5,0,1,...,,,,22.0,0.0,73.0,T224,9004,0,75.0
3334,"4700 South Cornell Avenue, Chicago, IL 60615, USA","4700 S CORNELL AVE, Chicago, IL",0,0,,0,12.0,59.0,0,0,...,,42.0,113.0,13.0,26.0,66.5,T075,3334,0,65.0
8333,"4200 West 127th Street, Alsip, IL 60803, USA","4200 W 127TH PL, Chicago, IL",0,1,RA BR,0,5.0,64.0,0,1,...,,,,,,69.0,T135,8333,0,67.5
9021,"1700 North Ashland Avenue, Chicago, IL 60622, USA","1700 N ASHLAND AVE, Chicago, IL",0,1,TS BR,0,10.0,69.5,0,1,...,,,,22.0,0.0,73.0,T232,9021,0,75.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5734,"8200 South Kostner Avenue, Chicago, IL 60652, USA","8200 S KOSTNER AVE, Chicago, IL",0,0,,0,7.0,55.5,0,0,...,,,,66.0,87.0,58.5,T225,5734,0,61.5
5191,"8200 South Kostner Avenue, Chicago, IL 60652, USA","8200 S KOSTNER AVE, Chicago, IL",0,0,,0,-4.0,54.5,0,1,...,,,,21.0,42.0,60.0,T225,5191,0,61.5
5390,"1000 East 67th Street, Chicago, IL 60637, USA","1000 E 67TH ST, Chicago, IL",1,0,,0,0.0,55.0,0,0,...,,,,34.0,55.0,61.5,T073,5390,0,63.0
860,"1500 North Long Avenue, Chicago, IL 60651, USA","1500 N LONG AVE, Chicago, IL",1,0,HZ,0,8.0,62.5,0,0,...,,13.0,64.0,0.0,14.0,69.0,T153,860,0,69.5


In [None]:
#Since there are frankly too many missing values from the "Time" column that makes it very difficult

In [25]:
def iterate_vif(df, vif_threshold=5, max_vif=6):
    count = 0
    while max_vif > vif_threshold:
        count += 1
        print("Iteration # "+str(count))
        vif = pd.DataFrame()
        df = add_constant(df)
        vif["VIFactor"] = [variance_inflation_factor(df.values, i) for i in range(df.shape[1])]
        vif["features"] = df.columns
        
        if vif['VIFactor'].max() > vif_threshold:
            print('Removing %s with VIF of %f' % (vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], vif['VIFactor'].max()))
            df = df.drop(vif[vif['VIFactor'] == vif['VIFactor'].max()]['features'].values[0], axis=1)
            max_vif = vif['VIFactor'].max()
        else:
            print('Complete')
            return df, vif.sort_values('VIFactor')  

X1 = X2._get_numeric_data()
final_df, final_vif = iterate_vif(X1)

Iteration # 1


MissingDataError: exog contains inf or nans