In [104]:
import os 
from dotenv import load_dotenv, find_dotenv
import psycopg2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sys
%matplotlib inline

In [105]:
# Locate the 'fire_var.env' file and load its variables into the environment.
# The file holds the AWS host, username and password read later via os.environ.
load_dotenv(find_dotenv('fire_var.env'))

True

In [106]:
# connect to postgres
def pgconnect():
    """Open a psycopg2 connection configured from environment variables.

    Reads ``erdatabase``, ``eruser``, ``erpassword``, ``erhost`` and ``port``
    from ``os.environ``.

    Returns:
        The open connection, or ``None`` when psycopg2 raised an error while
        connecting (the error details are printed).
    """
    # Imported locally: the notebook's import cell does not provide it, and
    # without it the except branch below failed with NameError.
    import traceback

    try:
        conn = psycopg2.connect(
            database=os.environ.get("erdatabase"),
            user=os.environ.get("eruser"),
            password=os.environ.get("erpassword"),
            host=os.environ.get("erhost"),
            port=os.environ.get('port'),
        )
        return conn

    except psycopg2.Error as e:
        print("I am unable to connect to the database")
        print(e)
        print(e.pgcode)
        print(e.pgerror)
        print(traceback.format_exc())
        return None

In [107]:
# Function to query a table from postgres and create a DataFrame.
def pquery(QUERY):
    '''
    Run a SQL query in postgres and return the result as a DataFrame.

    Parameters
    ----------
    QUERY : str
        SQL text to execute; it is echoed to stdout before running.

    Returns
    -------
    pandas.DataFrame or None
        One column per result column, named from the cursor description.
        None when the connection could not be opened or the query failed
        (the error is printed in both cases).
    '''
    conn = pgconnect()
    if conn is None:
        # pgconnect() has already printed the connection error.
        return None

    try:
        print("SQL QUERY = "+QUERY)
        cur = conn.cursor()
        cur.execute("SET statement_timeout = 0")
        cur.execute(QUERY)
        # Column names are the first field of each cursor.description entry.
        col_names = [elt[0] for elt in cur.description]
        rows = cur.fetchall()
        return pd.DataFrame(rows, columns=col_names)

    except Exception as e:
        # Only psycopg2 errors carry .pgerror; fall back to the exception
        # itself so other failures are reported instead of masked.
        print(getattr(e, "pgerror", None) or e)
        return None

    finally:
        conn.close()

In [108]:
# Pull one row per incident with its location fields, date and type-nature
# code, plus the "situation found" class reached through
# incsitfound -> incsitfoundsub -> incsitfoundclass.
# The INNER JOIN keeps only incidents with a matching incsitfound row; the
# LEFT JOINs keep the incident even when the sub/class/type lookup is missing.
# The result columns come back in lower case (incdescription, typedescription),
# as shown in the df.head() output.
QUERY1='''SELECT incident.typenaturecode_id,
                 incident.censustract, 
                 incident.fmarespcomp, incident.fireblock,
                 incident.incdate, incsitfoundclass.incsitfoundclass_id, 
                 incsitfoundclass.description as incDescription, 
                 typenaturecode.description as typeDescription
            FROM incident
            INNER JOIN incsitfound
                ON incident.incsitfoundprm_id = incsitfound.incsitfound_id
            LEFT JOIN incsitfoundsub
                ON incsitfound.incsitfoundsub_id = incsitfoundsub.incsitfoundsub_id
            LEFT JOIN incsitfoundclass
                ON incsitfoundsub.incsitfoundclass_id = incsitfoundclass.incsitfoundclass_id
            LEFT JOIN typenaturecode
               ON incident.typenaturecode_id = typenaturecode.typenaturecode_id;
'''
# pquery echoes the SQL, runs it and returns the rows as a DataFrame.
df = pquery(QUERY1)

SQL QUERY = SELECT incident.typenaturecode_id,
                 incident.censustract, 
                 incident.fmarespcomp, incident.fireblock,
                 incident.incdate, incsitfoundclass.incsitfoundclass_id, 
                 incsitfoundclass.description as incDescription, 
                 typenaturecode.description as typeDescription
            FROM incident
            INNER JOIN incsitfound
                ON incident.incsitfoundprm_id = incsitfound.incsitfound_id
            LEFT JOIN incsitfoundsub
                ON incsitfound.incsitfoundsub_id = incsitfoundsub.incsitfoundsub_id
            LEFT JOIN incsitfoundclass
                ON incsitfoundsub.incsitfoundclass_id = incsitfoundclass.incsitfoundclass_id
            LEFT JOIN typenaturecode
               ON incident.typenaturecode_id = typenaturecode.typenaturecode_id;



In [109]:
# Preview the first rows of the query result to sanity-check the columns.
df.head(20)

Unnamed: 0,typenaturecode_id,censustract,fmarespcomp,fireblock,incdate,incsitfoundclass_id,incdescription,typedescription
0,30.0,8202.0,7.0,0727,2015-05-23,3,MEDICAL AID / RESCUE CALLS,BREATHING/1ST RESP
1,141.0,7300.0,,1404R,2015-06-06,3,MEDICAL AID / RESCUE CALLS,TRAUMA/1ST RESP
2,145.0,3601.0,14.0,1412,2015-05-01,3,MEDICAL AID / RESCUE CALLS,UNKNOWN/FIRE ONLY
3,12.0,2100.0,13.0,1387,2016-05-06,2,"OVER PRESSURE RUPTURE, EXPLOSION, OVERHEAT",AUTOMATIC FIRE ALARM-COMM
4,140.0,,12.0,1248,2016-08-26,3,MEDICAL AID / RESCUE CALLS,TRAUMA/1ST RESP
5,134.0,,,FP,2016-10-01,3,MEDICAL AID / RESCUE CALLS,TRAFFIC ACC/1ST RESP
6,134.0,5600.0,4.0,L0038,2010-06-13,3,MEDICAL AID / RESCUE CALLS,TRAFFIC ACC/1ST RESP
7,110.0,5100.0,1.0,0109,2010-06-19,3,MEDICAL AID / RESCUE CALLS,OVERDOSE/1ST RESP
8,21.0,9202.0,7.0,0754,2010-07-10,3,MEDICAL AID / RESCUE CALLS,ASSAULT/1ST RESP
9,30.0,,21.0,2151,2011-05-14,3,MEDICAL AID / RESCUE CALLS,BREATHING/1ST RESP


In [111]:
# Parse the date column into real datetimes, then derive a month column from it.
# (Only the month is derived here; no year column is created.)
# Borrowed from Ryan's code.

df['incdate'] = pd.to_datetime(df['incdate'])
incident_month = pd.DatetimeIndex(df['incdate']).month
df['month'] = incident_month
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512281 entries, 0 to 512280
Data columns (total 9 columns):
typenaturecode_id      512278 non-null float64
censustract            510353 non-null object
fmarespcomp            507765 non-null object
fireblock              512186 non-null object
incdate                512281 non-null datetime64[ns]
incsitfoundclass_id    512281 non-null int64
incdescription         512281 non-null object
typedescription        512278 non-null object
month                  512281 non-null int32
dtypes: datetime64[ns](1), float64(1), int32(1), int64(1), object(5)
memory usage: 33.2+ MB


In [115]:
# Make labels: 'false' is 1 where incsitfoundclass_id equals 7, otherwise 0.
# NOTE(review): class 7 is presumably the false-alarm class -- confirm against
# the incsitfoundclass table.
df_ = df.copy()
is_class_7 = df_['incsitfoundclass_id'] == 7
df_['false'] = is_class_7.astype('int64')

In [116]:
# Make X and y: the binary label and the five candidate predictor columns.
predictor_cols = ['typenaturecode_id', 'censustract', 'fmarespcomp', 'fireblock', 'month']
X = df_.loc[:, predictor_cols]
y = df_['false']

In [117]:
# Encode categorical variables: every predictor column is one-hot encoded,
# with an extra NaN indicator per column and the first level dropped.
categorical_cols = list(X)
X_with_dummies = pd.get_dummies(
    X,
    columns=categorical_cols,
    prefix=categorical_cols,
    prefix_sep='_',
    dummy_na=True,
    sparse=True,
    drop_first=True,
)
X_with_dummies.shape

(512281, 1518)

In [118]:
# Load classifier module

from sklearn.ensemble import RandomForestClassifier as rf
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

In [119]:
# Hold out 30% of the rows for testing; the split is seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_dummies,
    y,
    test_size=0.3,
    random_state=1,
)

In [95]:
# Give it a try: cross-validated score of a random forest on the training split.
# random_state is fixed (same seed as the other forests in this notebook) so the
# reported score is reproducible from run to run.
clf = rf(n_estimators=60, max_features=5, random_state=1)
scores = cross_val_score(clf, X_train, y_train)
scores.mean()


0.96436937647944487

In [97]:
# Fit a seeded random forest on the training split, using all available cores.
clf = rf(
    n_estimators=60,
    max_features=5,
    n_jobs=-1,
    random_state=1,
)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=-1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [136]:
# Pair each one-hot column name ('ind') with its importance in the fitted
# forest ('features') and list the 20 largest.
ind = X_train.columns.tolist()
features = clf.feature_importances_
feature_importance = pd.DataFrame({'ind': ind, 'features': features})
feature_importance = feature_importance.sort_values('features', ascending=False)
feature_importance.head(20)


Unnamed: 0,features,ind
7,0.452517,typenaturecode_id_12.0
8,0.084879,typenaturecode_id_13.0
6,0.053344,typenaturecode_id_11.0
100,0.0189,typenaturecode_id_145.0
97,0.014908,typenaturecode_id_141.0
23,0.012125,typenaturecode_id_30.0
31,0.011711,typenaturecode_id_39.0
84,0.009907,typenaturecode_id_125.0
28,0.009846,typenaturecode_id_36.0
1514,0.008217,month_10.0


## Discussion
*It looks like typenaturecode is the dominant factor in predicting a false alarm. I included it among the predictor variables on the assumption that it is assigned when the dispatch occurs; if it is actually assigned later, it is another story and it should not be used as a predictor.*

*I will go ahead and extract the 9 most important typenaturecodes. The easiest way is to re-apply the pd.get_dummies() function to that column alone and re-run the whole thing.*

In [143]:
# Work with the type-nature code column on its own.
s = X['typenaturecode_id']

In [147]:
# One-hot encode the single column (NaN gets its own indicator, first level dropped).
X_with_dummies1 = pd.get_dummies(s, dummy_na=True, drop_first=True)
X_with_dummies1.shape

(512281, 227)

In [149]:
# Same 70/30 seeded split as before, now on the typenaturecode-only features.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_dummies1,
    y,
    test_size=0.3,
    random_state=1,
)

In [150]:
# Refit the seeded forest on the typenaturecode-only training split.
clf = rf(
    n_estimators=60,
    max_features=5,
    n_jobs=-1,
    random_state=1,
)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=-1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [152]:
## Check the score on test set
# (the held-out 30% produced by the train_test_split call above)
clf.score(X_test,y_test)

0.97190356898851549

In [155]:
# Pair each typenaturecode_id level with its importance in the fitted forest
# ('features') and list the 10 largest.
ind = X_train.columns.tolist()
features = clf.feature_importances_
feature_importance = pd.DataFrame({'typenaturecode_id': ind, 'features': features})
feature_importance = feature_importance.sort_values('features', ascending=False)
feature_importance.head(10)

Unnamed: 0,features,typenaturecode_id
7,0.688002,12.0
8,0.123934,13.0
6,0.070648,11.0
100,0.015531,145.0
97,0.013129,141.0
23,0.011611,30.0
31,0.009282,39.0
28,0.008326,36.0
84,0.007676,125.0
72,0.004806,110.0


## Discussion

*OK, above are the top 10 typenaturecode_id values related to false alarms. The results seem reasonable. Let's take a look at the fireblock.*

In [156]:
# Work with the fireblock column on its own.
s = X['fireblock']

In [157]:
# One-hot encode the single column (NaN gets its own indicator, first level dropped).
X_with_dummies2 = pd.get_dummies(s, dummy_na=True, drop_first=True)
X_with_dummies2.shape

(512281, 931)

In [158]:
# Same 70/30 seeded split as before, now on the fireblock-only features.
X_train, X_test, y_train, y_test = train_test_split(
    X_with_dummies2,
    y,
    test_size=0.3,
    random_state=1,
)

In [159]:
# Refit the seeded forest on the fireblock-only training split.
clf = rf(
    n_estimators=60,
    max_features=5,
    n_jobs=-1,
    random_state=1,
)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=5, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=60, n_jobs=-1, oob_score=False, random_state=1,
            verbose=0, warm_start=False)

In [160]:
## Check the score on test set
# (the held-out 30% produced by the train_test_split call above)
clf.score(X_test,y_test)

0.92322607931808565

In [161]:
# Pair each fireblock level with its importance in the fitted forest
# ('features') and list the 10 largest.
ind = X_train.columns.tolist()
features = clf.feature_importances_
feature_importance = pd.DataFrame({'fireblock': ind, 'features': features})
feature_importance = feature_importance.sort_values('features', ascending=False)
feature_importance.head(10)

Unnamed: 0,features,fireblock
87,0.106909,0451
439,0.042055,2209
302,0.033282,1406
358,0.028563,1706
470,0.025504,2403
272,0.02357,1318
802,0.019721,FP
122,0.017969,0621
58,0.016852,0342
121,0.01562,0618


## Conclusion

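*So let's check the auto-alarms in these fireblocks. Note that a high feature importance only means the fireblock helps the model separate false alarms from other incidents; it does not by itself say whether the false-alarm rate there is high or low, so the rates in these blocks should be checked directly. Hopefully this is useful information to help Portland Fire & Rescue decrease the false-alarm rate.*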
