In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import mgrs
import sqlite3
import math
import mgrs
from collections import Counter
from itertools import product
import statistics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall
from scikitplot.metrics import plot_cumulative_gain
from scikitplot.metrics import plot_lift_curve
from numpy import argmax
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
def build_and_test(X_tr, X_te, y_tr, y_te, class_weight=None, threshold=False):
    """Fit a gradient-boosting classifier, print test metrics and plot diagnostics.

    Steps performed:
      1. Project ``X_tr`` onto two principal components and scatter-plot it,
         coloured by ``y_tr``.
      2. Fit a ``GradientBoostingClassifier`` with default settings on the
         training data.
      3. Optionally (``threshold=True``) replace the model's default decision
         rule by the ROC threshold that maximises ``TPR - FPR``.
      4. Print precision, recall, F1 and accuracy on the test data, draw the
         ROC, precision-recall, cumulative-gain and lift plots, and print a
         classification report.

    Parameters
    ----------
    X_tr, X_te : array-like
        Training and test feature matrices (must be dense; they are passed to
        ``PCA`` and to the classifier as-is).
    y_tr, y_te : array-like
        Training and test targets. The metric calls use their default
        positive label, so a binary 0/1 target is assumed -- TODO confirm
        for other label encodings.
    class_weight : optional
        Accepted for backward compatibility only; it is not passed to the
        model and currently has no effect.
    threshold : bool, default False
        When truthy, predictions are re-derived from the positive-class
        probability using the ROC threshold with the largest ``TPR - FPR``.

    Returns
    -------
    tuple
        ``(roc_auc, fpr, tpr, best_threshold)`` where ``fpr``/``tpr`` come from
        ``roc_curve`` on the test data and ``best_threshold`` is ``None``
        unless ``threshold`` was requested.
    """
    # Two-component PCA of the training features, used only for the visual check.
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_tr)

    plt.scatter(X_pca[:, 0], X_pca[:, 1], c=y_tr, cmap=plt.cm.prism, edgecolor='k', alpha=0.7)
    plt.show()

    # Build and fit the model.
    model = GradientBoostingClassifier()
    model.fit(X_tr, y_tr)

    # Default predictions plus per-class probabilities for the test data.
    y_pred = model.predict(X_te)
    y_score = model.predict_proba(X_te)

    # Column 1 of predict_proba is the probability of model.classes_[1].
    positive_score = y_score[:, 1]
    fpr0, tpr0, thresholds = roc_curve(y_te, positive_score)
    roc_auc0 = auc(fpr0, tpr0)

    # Calculate the best threshold and apply it to the predictions.
    best_threshold = None
    if threshold:
        J = tpr0 - fpr0
        ix = argmax(J)  # index of the threshold that maximises TPR - FPR
        best_threshold = thresholds[ix]
        # The thresholds were computed from the positive-class score, so the
        # comparison must use that same column. The probabilities themselves
        # are left untouched so the curve plots below stay meaningful.
        y_pred = model.classes_[(positive_score >= best_threshold).astype(int)]

    # Report metrics for the (possibly threshold-adjusted) predictions.
    print('Precision score %s' % precision_score(y_te, y_pred))
    print('Recall score %s' % recall_score(y_te, y_pred))
    print('F1-score score %s' % f1_score(y_te, y_pred))
    print('Accuracy score %s' % accuracy_score(y_te, y_pred))

    # Plot metrics from the unmodified class probabilities.
    plot_roc(y_te, y_score)
    plt.show()

    plot_precision_recall(y_te, y_score)
    plt.show()

    plot_cumulative_gain(y_te, y_score)
    plt.show()

    plot_lift_curve(y_te, y_score)
    plt.show()

    # Print a classification report.
    print(classification_report(y_te, y_pred))
    return roc_auc0, fpr0, tpr0, best_threshold

In [None]:
# Load the prepared dataset from a CSV in the working directory.
# NOTE(review): the file name suggests SoCal fire + weather records keyed by
# MGRS cell with lagged fire flags -- confirm against the data pipeline.
data_clean = pd.read_csv('socal_fires_weather_mgrs_lag.csv')
# Show column names, dtypes and non-null counts before any transformation.
data_clean.info()

In [None]:
# Print the column total of the fire flag and of each of its five lagged
# versions, one per line. The labels are padded so the totals line up.
print(*(
    lead + label + str(np.sum(data_clean[column]))
    for lead, label, column in (
        ('', 'is_fire      ', 'is_fire'),
        ('\n', 'is_fire_lag1 ', 'is_fire_lag1'),
        ('\n', 'is_fire_lag2 ', 'is_fire_lag2'),
        ('\n', 'is_fire_lag3 ', 'is_fire_lag3'),
        ('\n', 'is_fire_lag4 ', 'is_fire_lag4'),
        ('\n', 'is_fire_lag5 ', 'is_fire_lag5'),
    )
))

In [None]:
# Order the rows by grid cell and then by date, renumber them from zero, and
# drop the two ordering columns so they are not carried into later cells.
data_clean = (
    data_clean
    .sort_values(by=['mgrs_10km', 'date'])
    .reset_index(drop=True)
    .drop(columns=['mgrs_10km', 'date'])
)
data_clean.head()

In [None]:
# Display only the rows whose fire flag equals 1.
data_clean.loc[data_clean['is_fire'].eq(1)]

In [None]:
# Display the eight rows at positions 17475..17482.
# NOTE(review): hard-coded positions -- presumably a window around one fire
# row chosen by eye; confirm they still point at the intended rows.
data_clean.iloc[17475:17483]

In [None]:
# Show column names, dtypes and non-null counts of data_clean at this point in the notebook.
data_clean.info()

In [None]:
sample_size = 0.33

# Use train_test_split as a stratified sampler: the "train" part is the kept
# fraction of data_clean, stratified on is_fire, and the complementary part
# is bound to the throw-away name `_`.
stratified_sample, _ = train_test_split(
    data_clean,
    test_size=(1 - sample_size),
    stratify=data_clean[['is_fire']],
    random_state=8,
)

# Re-wrap as a DataFrame and renumber the rows from zero.
stratified_sample = pd.DataFrame(stratified_sample).reset_index(drop=True)
stratified_sample.head()

In [None]:
# Fraction of is_fire rows in the subsample (first line) and in the full
# data (second line); stratification should keep the two close.
print(
    *(
        np.sum(frame['is_fire']) / len(frame)
        for frame in (stratified_sample, data_clean)
    ),
    sep='\n',
)

In [None]:
# Work on a copy so stratified_sample itself is left untouched.
X = stratified_sample.copy()

# Pull out the current fire flag and its five lagged versions as targets.
y, yLag1, yLag2, yLag3, yLag4, yLag5 = (
    X[target]
    for target in (
        'is_fire',
        'is_fire_lag1',
        'is_fire_lag2',
        'is_fire_lag3',
        'is_fire_lag4',
        'is_fire_lag5',
    )
)

# Every column that is not a fire flag stays in the feature matrix.
X = X.drop(
    columns=[
        'is_fire',
        'is_fire_lag1',
        'is_fire_lag2',
        'is_fire_lag3',
        'is_fire_lag4',
        'is_fire_lag5',
    ]
)
print(X.shape, y.shape)

In [None]:
# Hold out 33% of the subsample for testing, with is_fire_lag1 as the target.
# The split is stratified on that target so the train and test parts keep the
# same class proportions; an unstratified split of an imbalanced target can
# leave one part with very few positive rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    yLag1,
    test_size=0.33,
    stratify=yLag1,
    random_state=8,
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
# Class counts of the target in the training and test parts.
print(
    f"Training target statistics: {Counter(y_train)}",
    f"Testing target statistics: {Counter(y_test)}",
    sep="\n",
)

In [None]:
# Baseline run on the original (un-resampled) training data. Each input is
# converted to a NumPy array first; the returned threshold is discarded.
roc_auc_imb, fpr_imb, tpr_imb, _ = build_and_test(
    *map(np.array, (X_train, X_test, y_train, y_test))
)

In [None]:
# from imblearn.over_sampling import RandomOverSampler
# over_sampler = RandomOverSampler(random_state=42)
# X_res, y_res = over_sampler.fit_resample(X_train, y_train)
# print(f"Training target statistics: {Counter(y_res)}")
# print(f"Testing target statistics: {Counter(y_test)}")

In [None]:
# roc_auc_ros,fpr_ros,tpr_ros, _ = build_and_test(X_res, X_test, y_res, y_test)

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the training data only, with a fixed seed; the test data is left
# as it is so the evaluation still reflects the original class proportions.
under_sampler = RandomUnderSampler(random_state=42)
X_res, y_res = under_sampler.fit_resample(X_train, y_train)

# Class counts after resampling (train) next to the untouched test part.
print(
    f"Training target statistics: {Counter(y_res)}",
    f"Testing target statistics: {Counter(y_test)}",
    sep="\n",
)

In [None]:
# Same experiment as the baseline, but fitted on the under-sampled training
# data and evaluated on the untouched test data; the threshold is discarded.
roc_auc_rus, fpr_rus, tpr_rus, _ = build_and_test(
    X_tr=X_res,
    X_te=X_test,
    y_tr=y_res,
    y_te=y_test,
)