In [19]:
import re

import numpy as np
import pandas as pd
import datetime as dt
import gc
import random
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer

def main():
    """Build a per-state, per-month table of temperatures and disaster declarations.

    Steps:
      1. Read ``states.csv`` into a mapping from its first column to its second
         column (used to translate the temperature file's ``State`` values into
         the ``state`` codes used by the disaster file).
      2. Reduce the disaster declarations to one row per
         (incident type, title, month-year, state) and flag each with
         ``disaster_occurrence = 1``.
      3. Average the US temperature readings per (month-year, state).
      4. Left-join disasters onto temperatures, fill missing values with 0,
         add a ``month`` column and rename the target to ``y_data``.
      5. Split into train/test sets (70/30, fixed seed).

    Side effects: writes three intermediate CSV files under ``./data`` and
    prints the merged frame, the features and the target.

    Returns:
        None.
    """
    dis_path = './data/us_disaster_declarations.csv'
    temp_path = './data/GlobalLandTemperaturesByState.csv'
    states_path = './data/states.csv'

    # load list of states: first CSV column -> second CSV column
    states = {}
    with open(states_path) as f:
        next(f, None)  # skip the header row (tolerates an empty file)

        for line in f:
            fields = line.split(',')
            if len(fields) < 2:
                # blank or malformed line (e.g. trailing newline at end of file)
                continue
            states[fields[0].strip()] = fields[1].strip()

    # filter disaster dataset
    dis_data = pd.read_csv(dis_path)[['state', 'declaration_date', 'incident_type','declaration_title']].rename({'declaration_date': 'date'}, axis=1)
    # collapse the declaration timestamp to a 'MM-YYYY' month key
    dis_data['date'] = dis_data['date'].astype('datetime64[ns]').dt.strftime('%m-%Y')
    # keep one row per distinct disaster per state and month
    dis_data = dis_data.drop_duplicates(subset=['incident_type', 'declaration_title', 'date', 'state'], keep='first')
    dis_data['disaster_type'] = dis_data['incident_type']
    dis_data = dis_data.rename({'incident_type': 'disaster_occurrence'}, axis=1)
    # every remaining row is a disaster, so the occurrence flag is 1 everywhere
    dis_data['disaster_occurrence'] = np.ones(dis_data['disaster_occurrence'].shape)

    dis_data.to_csv('./data/test_disasters_state_month.csv', index=False)

    # filter temperature dataset
    temp_data = pd.read_csv(temp_path)
    temp_data = temp_data[temp_data['Country'] == 'United States'].dropna()  # filter by United States, remove NaNs
    temp_data['date'] = temp_data['dt'].astype('datetime64[ns]').dt.strftime('%m-%Y')  # convert string to date, then to a 'MM-YYYY' month key
    temp_data['state'] = temp_data['State'].apply(lambda x: states[x] if x in states else None)  # map state names; unknown names become None
    temp_data = temp_data.dropna()  # drop rows whose state was not found in the mapping
    temp_data = temp_data.groupby(['date', 'state'])  # group by month-year then state
    temp_data = temp_data[['AverageTemperature', 'AverageTemperatureUncertainty']].mean().reset_index()  # take average over groups

    temp_data.to_csv('./data/test_temp_state_month.csv', index=False)

    # join on `date` (month-year) and `state`; the left join keeps every temperature row
    df = pd.merge(temp_data, dis_data, on=['date', 'state'], how='left').set_index(['date', 'state'], drop=True)
    df = df.rename({'AverageTemperature': 'ave_temp', 'AverageTemperatureUncertainty': 'ave_temp_uncertainty'}, axis=1)
    # months without a declaration have NaN in every disaster column -> 0
    df = df.fillna(0).reset_index()
    # `date` is already a zero-padded 'MM-YYYY' string, so the month is its first
    # two characters; slicing avoids re-parsing the string through date inference
    df['month'] = df['date'].str[:2]

    df.to_csv('./data/test_disasters_temp_state_month.csv', index=False)

    # move the target to the last column under the name `y_data`
    df['y_data'] = df['disaster_occurrence']
    df = df.drop(['disaster_occurrence'], axis=1)
    print(df)

    x_data = df.iloc[:,0:-1]  # every column except the target
    y_data = df.iloc[:,-1]  # target column
    # NOTE(review): the split below is computed but neither used nor returned yet
    x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=.7, random_state=614, shuffle=True)

    print(x_data)
    print(y_data)
    
    

# Run the pipeline only when this code is executed as the main module, not when imported.
if __name__ == '__main__':
    main()
    


           date state  ave_temp  ave_temp_uncertainty declaration_title  \
0       01-1745    AL     6.931                 2.838                 0   
1       01-1745    CT    -1.734                 1.543                 0   
2       01-1745    DE     1.175                 1.921                 0   
3       01-1745    FL    14.640                 2.447                 0   
4       01-1745    IA    -8.066                 3.766                 0   
...         ...   ...       ...                   ...               ...   
136388  12-2012    VT    -2.191                 0.343                 0   
136389  12-2012    WA     0.290                 0.307                 0   
136390  12-2012    WI    -3.367                 0.301                 0   
136391  12-2012    WV     4.280                 0.240                 0   
136392  12-2012    WY    -5.374                 0.375                 0   

       disaster_type month  y_data  
0                  0    01     0.0  
1                  0    0

# 1. Setting up a confusion matrix

In [None]:
#Load the data

import csv
import pandas as pd
import numpy as np
from sklearn import metrics


# Read the evaluation file once into a DataFrame
df = pd.read_csv('data_test.csv')

# Preview the first rows rather than echoing every row of the file
print(df.head())

# Create the confusion matrix

# Predicted values: last column of the file
y_pred = df[df.columns[-1]]
# Actual values: third column of the file
y_act = df[df.columns[2]]

print(y_pred)
print(y_act)

# Printing the confusion matrix
# The columns will show the instances predicted for each label,
# and the rows will show the actual number of instances for each label.
print(metrics.confusion_matrix(y_act, y_pred))


# 2. Finding other useful model metrics

In [None]:
# Text summary of the per-label metrics (precision, recall, among others)
print(metrics.classification_report(y_true=y_act, y_pred=y_pred))


# 3. Create an ROC curve

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# Computing the ROC curve
#
# y_act and y_pred are 1-D label columns, so they cannot be indexed as
# [:, i]. They are one-hot encoded here into (n_samples, n_classes) indicator
# matrices, and one one-vs-rest ROC curve is computed per class.
# NOTE(review): y_pred holds hard labels rather than scores, so each curve has
# a single operating point; pass probabilities instead if the model has them.
y_true = np.asarray(y_act)
y_hat = np.asarray(y_pred)
classes = np.unique(np.concatenate([y_true, y_hat]))  # every label seen in either column
n_classes = classes.shape[0]
y_true_bin = (y_true[:, None] == classes[None, :]).astype(int)
y_hat_bin = (y_hat[:, None] == classes[None, :]).astype(int)

fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    # a class that never occurs in y_act has no positives; roc_curve then
    # warns and the resulting AUC is NaN
    fpr[i], tpr[i], _ = roc_curve(y_true_bin[:, i], y_hat_bin[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all classes pooled together)
fpr["micro"], tpr["micro"], _ = roc_curve(y_true_bin.ravel(), y_hat_bin.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

# Plotting the ROC curve: one line per class plus the micro-average

plt.figure()
lw = 2
for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], lw=lw,
             label='ROC curve, class %s (area = %0.2f)' % (classes[i], roc_auc[i]))
plt.plot(fpr["micro"], tpr["micro"], color='darkorange', lw=lw, linestyle=':',
         label='micro-average ROC curve (area = %0.2f)' % roc_auc["micro"])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')  # chance line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()