In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

# local
from logistic_regression import LogReg
from naive_bayes import GaussNB
from sarima import SARIMA
from svm import SuppVectMach

In [2]:
# File paths

dis_path = './data/us_disaster_declarations.csv'
temp_path = './data/GlobalLandTemperaturesByState.csv'
states_path = './data/states.csv'

In [3]:
# Create state name to abbreviation mapping

states = {}
with open(states_path) as f:
    next(f)

    for line in f:
        l = line.split(',')
        states[l[0].strip()] = l[1].strip()

In [4]:
# Preprocess disaster data

dis_data = pd.read_csv(dis_path)[['state', 'declaration_date', 'incident_type']].rename({'declaration_date': 'date'}, axis=1)
dis_data['date'] = dis_data['date'].astype('datetime64[ns]').dt.strftime('%Y-%m')
dis_data = dis_data.groupby(['state', 'date']).count()
dis_data = dis_data.rename({'incident_type': 'disaster_occurrence'}, axis=1)
dis_data['disaster_occurrence'] = np.ones(dis_data['disaster_occurrence'].shape)
dis_data.reset_index(inplace=True)

print(dis_data)

dis_data.to_csv('./data/test_disasters_state_month.csv', index=False)

     state     date  disaster_occurrence
0       AK  1953-10                  1.0
1       AK  1954-11                  1.0
2       AK  1955-12                  1.0
3       AK  1964-03                  1.0
4       AK  1967-08                  1.0
...    ...      ...                  ...
3415    WY  2018-06                  1.0
3416    WY  2018-09                  1.0
3417    WY  2020-03                  1.0
3418    WY  2020-04                  1.0
3419    WY  2020-09                  1.0

[3420 rows x 3 columns]


In [5]:
# Preprocess temperature data

temp_data = pd.read_csv(temp_path)
temp_data = temp_data[temp_data['Country'] == 'United States'].dropna()
temp_data['date'] = temp_data['dt'].astype('datetime64[ns]').dt.strftime('%Y-%m')
temp_data['state'] = temp_data['State'].apply(lambda x: states[x] if x in states else None)
temp_data = temp_data.dropna().rename({'AverageTemperature': 'ave_temp', 'AverageTemperatureUncertainty': 'ave_temp_uncertainty'}, axis=1)
temp_data = temp_data.groupby(['date', 'state'])
temp_data = temp_data[['ave_temp', 'ave_temp_uncertainty']].mean().reset_index()

print(temp_data)

temp_data.to_csv('./data/test_temp_state_month.csv', index=False)

           date state  ave_temp  ave_temp_uncertainty
0       1743-11    AL    10.722                 2.898
1       1743-11    CT     3.818                 1.727
2       1743-11    DE     6.127                 2.050
3       1743-11    FL    17.694                 2.514
4       1743-11    IA     1.250                 3.802
...         ...   ...       ...                   ...
135687  2013-09    VT    14.312                 1.224
135688  2013-09    WA    16.823                 1.128
135689  2013-09    WI    16.567                 0.982
135690  2013-09    WV    18.708                 0.909
135691  2013-09    WY    15.811                 1.101

[135692 rows x 4 columns]


In [6]:
# Join disaster and temperature datasets

df = pd.merge(temp_data, dis_data, on=['date', 'state'], how='left').set_index(['date', 'state'], drop=True)
df = df.fillna(0).reset_index()
df['month'] = df['date'].astype('datetime64[ns]').dt.strftime('%m')

print(df)

df.to_csv('./data/test_disasters_temp_state_month.csv', index=False)

           date state  ave_temp  ave_temp_uncertainty  disaster_occurrence  \
0       1743-11    AL    10.722                 2.898                  0.0   
1       1743-11    CT     3.818                 1.727                  0.0   
2       1743-11    DE     6.127                 2.050                  0.0   
3       1743-11    FL    17.694                 2.514                  0.0   
4       1743-11    IA     1.250                 3.802                  0.0   
...         ...   ...       ...                   ...                  ...   
135687  2013-09    VT    14.312                 1.224                  0.0   
135688  2013-09    WA    16.823                 1.128                  0.0   
135689  2013-09    WI    16.567                 0.982                  0.0   
135690  2013-09    WV    18.708                 0.909                  0.0   
135691  2013-09    WY    15.811                 1.101                  0.0   

       month  
0         11  
1         11  
2         11  
3  

In [7]:
data_fp = './data/test_disasters_temp_state_month.csv'

In [8]:
# Gaussian Naive Bayes

gnb = GaussNB()
gnb.load_data(data_fp)
gnb.train()

roc_gnb = []
threshold = 0

for i in range(0, 10):
    gnb.eval(threshold + i * 0.005)

    tn = gnb.confusion['tn']
    fp = gnb.confusion['fp']
    fn = gnb.confusion['fn']
    tp = gnb.confusion['tp']

    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    roc_gnb.append([threshold, tn, fp, fn, tp, fpr, tpr, precision, recall])

roc_gnb_df = pd.DataFrame(roc_gnb, columns=['Threshold', 'tn', 'fp', 'fn', 'tp','fpr', 'tpr', 'precision', 'recall'])
print(roc_gnb_df)
roc_gnb_df.to_csv('./output/roc_gnb_df.csv', index=False)

  return f(*args, **kwargs)


   Threshold     tn     fp   fn   tp       fpr       tpr  precision    recall
0          0      0  33300    0  623  1.000000  1.000000   0.018365  1.000000
1          0    721  32579   15  608  0.978348  0.975923   0.018320  0.975923
2          0   4860  28440   49  574  0.854054  0.921348   0.019784  0.921348
3          0  10506  22794  104  519  0.684505  0.833066   0.022262  0.833066
4          0  16169  17131  186  437  0.514444  0.701445   0.024875  0.701445
5          0  22063  11237  297  326  0.337447  0.523274   0.028193  0.523274
6          0  30822   2478  543   80  0.074414  0.128411   0.031274  0.128411
7          0  33300      0  623    0  0.000000  0.000000        NaN  0.000000
8          0  33300      0  623    0  0.000000  0.000000        NaN  0.000000
9          0  33300      0  623    0  0.000000  0.000000        NaN  0.000000




In [9]:
# Support Vector Machine

svc = SuppVectMach()
svc.load_data(data_fp)
svc.train()

roc_svc = []
threshold = 0

for i in range(0, 5):
    svc.eval(threshold + i * 0.005)

    tn = svc.confusion['tn']
    fp = svc.confusion['fp']
    fn = svc.confusion['fn']
    tp = svc.confusion['tp']

    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    roc_svc.append([threshold, tn, fp, fn, tp, fpr, tpr, precision, recall])

roc_svc_df = pd.DataFrame(roc_svc, columns=['Threshold', 'tn', 'fp', 'fn', 'tp','fpr', 'tpr', 'precision', 'recall'])
print(roc_svc_df)
roc_svc_df.to_csv('./output/roc_svc_df.csv', index=False)

  return f(*args, **kwargs)


   Threshold     tn     fp   fn   tp      fpr       tpr  precision    recall
0          0      0  33300    0  623  1.00000  1.000000   0.018365  1.000000
1          0      0  33300    0  623  1.00000  1.000000   0.018365  1.000000
2          0      0  33300    0  623  1.00000  1.000000   0.018365  1.000000
3          0      0  33300    0  623  1.00000  1.000000   0.018365  1.000000
4          0  20994  12306  378  245  0.36955  0.393258   0.019520  0.393258


In [10]:
# Logistic Regression

lr = LogReg()
lr.load_data(data_fp)
lr.train()

roc_lr = []
threshold = 0

for i in range(0, 10):
    lr.eval(threshold + i * 0.005)

    tn = lr.confusion['tn']
    fp = lr.confusion['fp']
    fn = lr.confusion['fn']
    tp = lr.confusion['tp']

    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    roc_lr.append([threshold, tn, fp, fn, tp, fpr, tpr, precision, recall])

roc_lr_df = pd.DataFrame(roc_lr, columns=['Threshold', 'tn', 'fp', 'fn', 'tp','fpr', 'tpr', 'precision', 'recall'])
print(roc_lr_df)
roc_lr_df.to_csv('./output/roc_lr_df.csv', index=False)

  return f(*args, **kwargs)


   Threshold     tn     fp   fn   tp       fpr       tpr  precision    recall
0          0      0  33300    0  623  1.000000  1.000000   0.018365  1.000000
1          0     25  33275    1  622  0.999249  0.998395   0.018350  0.998395
2          0   3016  30284   35  588  0.909429  0.943820   0.019046  0.943820
3          0  11424  21876  116  507  0.656937  0.813804   0.022651  0.813804
4          0  18510  14790  230  393  0.444144  0.630819   0.025884  0.630819
5          0  24571   8729  358  265  0.262132  0.425361   0.029464  0.425361
6          0  29859   3441  515  108  0.103333  0.173355   0.030431  0.173355
7          0  32772    528  604   19  0.015856  0.030498   0.034735  0.030498
8          0  33295      5  621    2  0.000150  0.003210   0.285714  0.003210
9          0  33300      0  623    0  0.000000  0.000000        NaN  0.000000




In [11]:
# SARIMA

out_df = None
conf_df = pd.DataFrame()

states = SARIMA().load_data(data_fp, return_df=True)['state'].unique()

for state in states:
    sarima = SARIMA()
    model_df = sarima.load_data(data_fp, return_df=True)
    model_df = model_df[model_df['state'] == state].sort_values('date').reset_index(drop=True)

    sarima.train(model_df)
    sarima.eval()

    sarima.confusion['state'] = state
    conf_df = conf_df.append(sarima.confusion, ignore_index=True)
    if out_df is None:
        out_df = model_df
    else:
        out_df = out_df.append(model_df)

print(conf_df)

out_df.to_csv('./output/arima_state_month.csv', index=False)
conf_df.to_csv('./output/arima_state_month_confusion.csv', index=False)

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting seasonal autoregressive'

  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'
  warn('Non-stationary starting seasonal autoregressive'


       fn     fp state      tn    tp
0    56.0  413.0    AL  2641.0   9.0
1    22.0  390.0    CT  2700.0   7.0
2    15.0  390.0    DE  2710.0   4.0
3    64.0  413.0    FL  2604.0  12.0
4    46.0  402.0    IA  2688.0   5.0
5    52.0  443.0    IL  2640.0   6.0
6    42.0  403.0    IN  2670.0   4.0
7    62.0    8.0    KY  3049.0   0.0
8    33.0  403.0    MA  2672.0  11.0
9    24.0  413.0    MD  2678.0   4.0
10   44.0  367.0    ME  2701.0   7.0
11   29.0  405.0    MI  2703.0   4.0
12   45.0  383.0    MN  2703.0  10.0
13   55.0  445.0    MO  2613.0   6.0
14   44.0  381.0    NC  2690.0   4.0
15   36.0  367.0    NH  2708.0   8.0
16   37.0  393.0    NJ  2681.0   8.0
17   73.0  376.0    NY  2660.0  10.0
18   48.0  450.0    OH  2615.0   6.0
19   45.0  408.0    PA  2662.0   4.0
20   14.0  383.0    RI  2717.0   5.0
21   14.0  388.0    SC  2712.0   5.0
22   47.0  398.0    TN  2668.0   6.0
23   44.0  391.0    VA  2675.0   9.0
24   33.0  384.0    VT  2696.0   6.0
25   40.0  406.0    WI  2692.0   3.0
2