In [179]:
import re
import numpy as np
import pandas as pd
import datetime as dt
import gc
import random
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report, average_precision_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
import matplotlib.pyplot as plt
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import plot_precision_recall_curve

from sklearn.naive_bayes import GaussianNB    


# Input files, relative to the notebook's working directory.
dis_path = './data/us_disaster_declarations.csv'
temp_path = './data/GlobalLandTemperaturesByState.csv'
states_path = './data/states.csv'

# Build a lookup from the first comma-separated field of each row to the
# second one (both whitespace-stripped). The first line of the file is
# skipped as a header.
states = {}
with open(states_path) as states_file:
    next(states_file)
    states.update(
        (fields[0].strip(), fields[1].strip())
        for fields in (row.split(',') for row in states_file)
    )

# print(states)

# Disaster declarations, reduced to one row per distinct
# (state, month-year, incident type, declaration title).
#
# Resulting columns, in order:
#   state, date ('%m-%Y' string), disaster_occurrence (constant 1.0),
#   declaration_title, disaster_type (copy of the original incident_type).
dis_data = (
    pd.read_csv(dis_path)
    .loc[:, ['state', 'declaration_date', 'incident_type', 'declaration_title']]
    .rename(columns={'declaration_date': 'date'})
    # Collapse the declaration timestamp to a month-year key.
    .assign(date=lambda d: d['date'].astype('datetime64[ns]').dt.strftime('%m-%Y'))
    .drop_duplicates(subset=['incident_type', 'declaration_title', 'date', 'state'], keep='first')
    # Keep the incident type under a new name, then turn the original column
    # into an indicator that is 1.0 on every declaration row.
    .assign(disaster_type=lambda d: d['incident_type'])
    .rename(columns={'incident_type': 'disaster_occurrence'})
    .assign(disaster_occurrence=lambda d: np.ones(d['disaster_occurrence'].shape))
)

dis_data.to_csv('./data/test_disasters_state_month.csv', index=False)

# Temperature readings, averaged per (month-year, state).
#
# Steps: keep United States rows, drop rows with any missing value, derive a
# '%m-%Y' `date` key from `dt`, translate `State` through the `states` lookup
# (names absent from the lookup become None and are dropped), then average the
# two temperature columns within each (date, state) group.
temp_data = (
    pd.read_csv(temp_path)
    .loc[lambda t: t['Country'] == 'United States']
    .dropna()
    .assign(
        date=lambda t: t['dt'].astype('datetime64[ns]').dt.strftime('%m-%Y'),
        state=lambda t: t['State'].map(states.get),
    )
    .dropna()
    .groupby(['date', 'state'])[['AverageTemperature', 'AverageTemperatureUncertainty']]
    .mean()
    .reset_index()
)

temp_data.to_csv('./data/test_temp_state_month.csv', index=False)

# Join temperatures with disaster declarations on (`date`, `state`).
# A left join keeps every temperature row; months with no declaration get
# NaN in the disaster columns, which fillna(0) turns into 0 (so
# `disaster_occurrence` is 1.0 where a declaration matched and 0 otherwise).
df = (
    pd.merge(temp_data, dis_data, on=['date', 'state'], how='left')
    .rename(columns={'AverageTemperature': 'ave_temp',
                     'AverageTemperatureUncertainty': 'ave_temp_uncertainty'})
    .fillna(0)
)

# `date` holds '%m-%Y' strings at this point. Parse them once with the explicit
# format rather than relying on per-call format inference.
parsed_date = pd.to_datetime(df['date'], format='%m-%Y')
df['month'] = parsed_date.dt.strftime('%m')
df['year'] = parsed_date.dt.strftime('%Y')
df['date'] = parsed_date

df.to_csv('./data/test_disasters_temp_state_month.csv', index=False)

# Move the label to the last column under the name `y_data`.
df['y_data'] = df.pop('disaster_occurrence')

# Keep only rows after 1960 (numeric comparison on the parsed year).
df = df[df['date'].dt.year > 1960]
df = df.drop(['year'], axis=1)

# Chronological order; later cells split train/test by row position.
df = df.sort_values(by=['date'])

# NOTE: x_data is captured before the drop below, so it still carries
# `declaration_title` and `disaster_type`; `df` itself does not.
x_data = df.iloc[:, 0:-1]
y_data = df.iloc[:, -1]

df = df.drop(['declaration_title', 'disaster_type'], axis=1)




In [180]:
# Logistic-regression baseline on a chronological 60/40 split.
# `df` is already sorted by date, so the first 60% of rows are used for
# training and the remaining (most recent) 40% are held out for testing.
train_rows = int(0.6 * x_data.shape[0])
test_rows = x_data.shape[0] - train_rows

# Last column of `df` is the label (`y_data`); everything before it is input.
x_train = df.iloc[:train_rows, 0:-1]
y_train = df.iloc[:train_rows, -1]
x_test = df.iloc[train_rows:, 0:-1]
y_test = df.iloc[train_rows:, -1]

X = x_data.iloc[:, 2:]
y = y_data

x_test_plot = x_test.iloc[:, 2:]

# The first two columns (`date`, `state`) are identifiers, not features.
logR = LogisticRegression()
logR.fit(x_train.iloc[:, 2:], y_train)
y_predict_test = logR.predict_proba(x_test.iloc[:, 2:])
y_predict_train = logR.predict_proba(x_train.iloc[:, 2:])

# Keep only the second probability column, as an (n, 1) array.
y_predict_test = y_predict_test[:, 1:]

# Attach predictions and labels to a *new* frame. `x_test` is left untouched
# so that `x_test.iloc[:, 2:]` keeps meaning "the feature columns" for any
# model fitted afterwards.
test_results = x_test.assign(Disaster_Prob=y_predict_test.ravel(), y_data=y_test)

test_df = pd.DataFrame(y_predict_test, columns=['Y_predict'])

# Actual labels for the held-out rows. The slice must start at `train_rows`
# (the first test row); `test_rows` is the *size* of the test set.
test_actual = df.iloc[train_rows:, :]
test_actual = test_actual[['date', 'state', 'y_data']]

LogisticRegression_Test_Per = test_results[['date', 'state', 'Disaster_Prob']]
logResgressionPerTestCombined = test_results[['date', 'state', 'Disaster_Prob', 'y_data']]

logResgressionPerTestCombined.to_csv('./data/logResgressionPerTestCombined.csv', index=False)
test_actual.to_csv('./data/test_actual.csv', index=False)
LogisticRegression_Test_Per.to_csv('./data/logRPredictTest.csv', index=False)


In [195]:
from sklearn.metrics import confusion_matrix

# Sweep the decision threshold over 0.000, 0.005, ..., 0.120 and record the
# confusion-matrix counts plus false/true positive rates at each step.
y_test = y_test.reset_index(drop=True)

prc = []
for step in range(0, 25):
    # Derive the threshold from the step index instead of accumulating
    # `+ .005`, which drifts away from the intended grid in floating point.
    threshold = round(step * 0.005, 3)
    y_predict_test_no_prob = np.where(y_predict_test >= threshold, 1, 0)

    # labels=[0, 1] guarantees a 2x2 matrix, so the 4-way unpack cannot fail
    # when a class is missing from both the labels and the predictions.
    tn, fp, fn, tp = confusion_matrix(
        y_test, y_predict_test_no_prob, labels=[0, 1]
    ).ravel()
    fpr = fp / (fp + tn)
    tpr = tp / (tp + fn)

    prc.append([threshold, tn, fp, fn, tp, fpr, tpr])

prc_df = pd.DataFrame(prc, columns=['Threshold', 'tn', 'fp', 'fn', 'tp', 'fpr', 'tpr'])
print(prc_df)
prc_df.to_csv('./data/prc_df.csv', index=False)

# Accuracy of the hard labels at the last (largest) swept threshold.
test_accuracy = accuracy_score(y_test, y_predict_test_no_prob)
# Average precision summarises the whole precision-recall curve, so it needs
# the continuous scores rather than labels cut at a single threshold.
test_avg_pers = average_precision_score(y_test, y_predict_test.ravel())





    Threshold     tn     fp    fn    tp       fpr       tpr
0       0.000      0  10450     0  2237  1.000000  1.000000
1       0.005      0  10450     0  2237  1.000000  1.000000
2       0.010      5  10445     0  2237  0.999522  1.000000
3       0.015     46  10404     7  2230  0.995598  0.996871
4       0.020    287  10163    19  2218  0.972536  0.991506
5       0.025    773   9677    54  2183  0.926029  0.975861
6       0.030   1403   9047   101  2136  0.865742  0.954850
7       0.035   2145   8305   169  2068  0.794737  0.924452
8       0.040   3008   7442   259  1978  0.712153  0.884220
9       0.045   3893   6557   358  1879  0.627464  0.839964
10      0.050   4862   5588   519  1718  0.534737  0.767993
11      0.055   5750   4700   653  1584  0.449761  0.708091
12      0.060   6690   3760   864  1373  0.359809  0.613768
13      0.065   7604   2846  1141  1096  0.272344  0.489942
14      0.070   8376   2074  1447   790  0.198469  0.353152
15      0.075   9032   1418  1690   547 

In [75]:

# Gaussian Naive Bayes baseline on the same split as the logistic regression.
#
# Features are selected by name from `x_train` (everything after the two
# identifier columns). The logistic-regression cell appends `Disaster_Prob`
# and `y_data` to `x_test`, so slicing `x_test` by position would feed the
# label and a previous model's output into predict_proba.
feature_cols = list(x_train.columns[2:])

GNBclf = GaussianNB()
classfier = GNBclf.fit(x_train[feature_cols], y_train)
y_Train_pred_Gaus = GNBclf.predict_proba(x_train[feature_cols])
y_predict_test_Gaus = GNBclf.predict_proba(x_test[feature_cols])

# Keep only the second probability column, as an (n, 1) array.
y_predict_test_Gaus = y_predict_test_Gaus[:, 1:]
print(y_predict_test_Gaus)




[[0.95494471 0.04505529]
 [0.84495342 0.15504658]
 [0.82706004 0.17293996]
 ...
 [0.907349   0.092651  ]
 [0.95457925 0.04542075]
 [0.82788755 0.17211245]]


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

# 1. Setting up a confusion matrix

In [None]:
# Load the data and build a confusion matrix from two of its columns.

import csv
import pandas as pd
import numpy as np
from sklearn import metrics

# Echo the raw file to the cell output, one parsed row per line.
with open('data_test.csv', 'r') as file:
    for row in csv.reader(file):
        print(row)

# NOTE(review): this rebinds `df`; any frame previously held under that name
# is no longer reachable after this cell runs.
df = pd.read_csv('data_test.csv')

# Predicted values are read from the last column of the file ...
y_pred = df.iloc[:, -1]
# ... and actual values from the third column (position 2).
y_act = df.iloc[:, 2]

print(y_pred)
print(y_act)

# Confusion matrix: rows are actual labels, columns are predicted labels.
print(metrics.confusion_matrix(y_act, y_pred))


# 2. Finding other useful model metrics

In [None]:
# Text summary of further metrics for the same actual/predicted pair,
# as produced by sklearn's classification_report.
report = metrics.classification_report(y_act, y_pred)
print(report)


# 3. Create an ROC curve

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

# Binary ROC curve.
# `y_act` and `y_pred` are single DataFrame columns (1-D), so there is exactly
# one curve to compute: no per-class loop and no micro-average are needed.
fpr, tpr, _ = roc_curve(y_act, y_pred)
roc_auc = auc(fpr, tpr)

# Plot the curve against the chance diagonal.
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()