In [95]:
import re

import numpy as np
import pandas as pd
import datetime as dt
import gc
import random
import sklearn
from sklearn.model_selection import cross_val_score, GridSearchCV, cross_validate, train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, normalize
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer


dis_path = './data/us_disaster_declarations.csv'
temp_path = './data/GlobalLandTemperaturesByState.csv'
states_path = './data/states.csv'

# load list of states
states = {}
with open(states_path) as f:
    next(f)

    for line in f:
        l = line.split(',')
        states[l[0].strip()] = l[1].strip()

# print(states)

# filter disaster dataset
dis_data = pd.read_csv(dis_path)[['state', 'declaration_date', 'incident_type','declaration_title']].rename({'declaration_date': 'date'}, axis=1)
dis_data['date'] = dis_data['date'].astype('datetime64[ns]').dt.strftime('%m-%Y')
dis_data = dis_data.drop_duplicates(subset=['incident_type', 'declaration_title', 'date', 'state'], keep='first')
# dis_data = dis_data.groupby(['state', 'date']).count()
dis_data['disaster_type'] = dis_data['incident_type']
dis_data = dis_data.rename({'incident_type': 'disaster_occurrence'}, axis=1)
dis_data['disaster_occurrence'] = np.ones(dis_data['disaster_occurrence'].shape)
# dis_data = dis_data.reset_index()

# print(dis_data)

dis_data.to_csv('./data/test_disasters_state_month.csv', index=False)

# filter temperature dataset
temp_data = pd.read_csv(temp_path)
temp_data = temp_data[temp_data['Country'] == 'United States'].dropna()  # filter by United States, remove NaNs
temp_data['date'] = temp_data['dt'].astype('datetime64[ns]').dt.strftime('%m-%Y')  # convert string to date, then convert to year
temp_data['state'] = temp_data['State'].apply(lambda x: states[x] if x in states else None)  # preprocess state strings
temp_data = temp_data.dropna()
temp_data = temp_data.groupby(['date', 'state'])  # group by year then state
temp_data = temp_data[['AverageTemperature', 'AverageTemperatureUncertainty']].mean().reset_index()  # take average over groups

# print(temp_data)

temp_data.to_csv('./data/test_temp_state_month.csv', index=False)

# join on `Year` and `State`
df = pd.merge(temp_data, dis_data, on=['date', 'state'], how='left').set_index(['date', 'state'], drop=True)
df.rename({'AverageTemperature': 'ave_temp', 'AverageTemperatureUncertainty': 'ave_temp_uncertainty'}, axis=1, inplace=True)
df = df.fillna(0).reset_index()
df['month'] = df['date'].astype('datetime64[ns]').dt.strftime('%m')
df['year'] = df['date'].astype('datetime64[ns]').dt.strftime('%Y')



df.to_csv('./data/test_disasters_temp_state_month.csv', index=False)

df['y_data'] = df['disaster_occurrence']
df = df.drop(['disaster_occurrence'], axis=1)


df = df[df.year > "1960"]
df = df.drop(['year'], axis=1)

print(df)
df = df.sort_values(by=['date'])
x_data = df.iloc[:,0:-1]
y_data = df.iloc[:,-1]

df = df.drop(['declaration_title','disaster_type'], axis=1)

#x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, train_size=.5, shuffle=False)

train_rows = int(0.6 * x_data.shape[0])
test_rows = x_data.shape[0] - train_rows# - train_rows

x_train = df.iloc[:train_rows, 0:-1]
y_train = df.iloc[:train_rows, -1]
x_test = df.iloc[train_rows:, 0:-1]
y_test = df.iloc[train_rows:, -1]
    
print(y_train)

X = x_data.iloc[:,2:]
y = y_data

logR = LogisticRegression()
logR.fit(x_train.iloc[:,2:], y_train)
y_predict_test = logR.predict_proba(x_test.iloc[:,2:])
y_predict_train = logR.predict_proba(x_train.iloc[:,2:])

x_test['Disaster_Prob'] = y_predict_test[:,1:]


y_predict_test = y_predict_test[:,1:]
test_df = pd.DataFrame(y_predict_test, columns = ['Y_predict'])




LogisticRegression_Test_Per = x_test[['date', 'state', 'Disaster_Prob']]
#print(LogisticRegression_Test_Per.tail(50))

LogisticRegression_Test_Per.to_csv('./data/logRPredictTest.csv', index=False)    
    




           date state  ave_temp  ave_temp_uncertainty declaration_title  \
8797    01-1961    AK   -15.443                 0.434                 0   
8798    01-1961    AL     4.301                 0.207                 0   
8799    01-1961    AR     2.393                 0.281                 0   
8800    01-1961    AZ     5.629                 0.313                 0   
8801    01-1961    CA     7.005                 0.432                 0   
...         ...   ...       ...                   ...               ...   
136388  12-2012    VT    -2.191                 0.343                 0   
136389  12-2012    WA     0.290                 0.307                 0   
136390  12-2012    WI    -3.367                 0.301                 0   
136391  12-2012    WV     4.280                 0.240                 0   
136392  12-2012    WY    -5.374                 0.375                 0   

       disaster_type month  y_data  
8797               0    01     0.0  
8798               0    0

In [75]:
#import matplotlib.pyplot as plt
from sklearn import metrics

def split_data(X, y, test_size = 0.2):       
    import sklearn as sk    
    return sk.model_selection.train_test_split(X, y, test_size=test_size)

X_train, X_test, y_train, y_test = split_data(X, y, test_size = 0.2)

from sklearn.naive_bayes import GaussianNB    
GNBclf = GaussianNB()
classfier = GNBclf.fit(X_train, y_train)

def score_and_predict(classifier, X_train, y_train, X_test, y_test):    
    y_Train_pred = classifier.predict_proba(X_train)
    y_Test_pred = classifier.predict_proba(X_test)
    #testing_score = metrics.accuracy_score(y_test, y_Test_pred)
    #training_score = metrics.accuracy_score(y_train, y_Train_pred)
    return y_Train_pred

y_pred = score_and_predict(classfier, X_train, y_train, X_test, y_test)
#print('Mean accuracy on training data = {} and testing data = {}'.format(training_score.round(2), testing_score.round(2)))
print(y_pred)
def print_classification_report(y_test, y_pred):       
    from sklearn.metrics import classification_report
    return (print(classification_report(y_test,y_pred)))

#print_classification_report(y_test, y_pred)

def roc_curve(classifier, X_test, y_test):      
    from sklearn.metrics import roc_curve
    import matplotlib.pyplot as plt       
    predict_prob = classifier.predict_proba(X_test)
    fpr, tpr, _ = metrics.roc_curve(y_test,predict_prob[:, 1])
    roc_auc = metrics.auc(fpr, tpr)
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')        
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')   
    return (plt.show())

#roc_curve(classifier, X_test, y_test)
from sklearn.svm import SVC
SVMclf = SVC(probability=True) 
SVMclf.fit(X_train, y_train)





[[0.95494471 0.04505529]
 [0.84495342 0.15504658]
 [0.82706004 0.17293996]
 ...
 [0.907349   0.092651  ]
 [0.95457925 0.04542075]
 [0.82788755 0.17211245]]


SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=True, random_state=None, shrinking=True, tol=0.001,
    verbose=False)

# 1. Setting up a confusion matrix

In [None]:
#Load the data

import csv
import pandas as pd
import numpy as np
from sklearn import metrics


with open('data_test.csv', 'r') as file:
    df = csv.reader(file)
    for row in df:
        print(row)
        
df = pd.read_csv('data_test.csv')
        
#Create the confusion matrix
# Importing the dependancies

y_pred = df[df.columns[-1]]
# Actual values
y_act = df[df.columns[2]]

# Printing the confusion matrix
# The columns will show the instances predicted for each label,
# and the rows will show the actual number of instances for each label.

print(y_pred)
# Printing the precision and recall, among other metrics

print(y_act)
#confusion matrix
print(metrics.confusion_matrix(y_act, y_pred))


# 2. Finding other useful model metrics

In [None]:
#other model metrics
print(metrics.classification_report(y_act, y_pred))


# 3. Create an ROC curve

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score

#Computing the ROC curve
n_classes = y_act.shape[0]
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_act[:, i], y_pred[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area
fpr["micro"], tpr["micro"], _ = roc_curve(y_act.ravel(), y_pred.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

#Plotting the ROC curve

plt.figure()
lw = 2
plt.plot(fpr[2], tpr[2], color='darkorange',
         lw=lw, label='ROC curve (area = %0.2f)' % roc_auc[2])
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()