In [1]:
from pandas.tseries.holiday import USFederalHolidayCalendar
import pandas as pd
from sklearn import base
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import FeatureUnion
from sklearn import linear_model
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics
from tqdm import tqdm
from sklearn import model_selection
from sklearn.utils import shuffle
import random
from sklearn.externals import joblib
from datetime import datetime

# Import the fire related data

df_true = pd.read_csv('fires_final.csv', parse_dates = ['startDate'])
df_false = pd.read_csv('false_data.csv', parse_dates = ['startDate'])

df = df_false.append(df_true[df_false.columns], ignore_index=True)

holidays = USFederalHolidayCalendar().holidays(start=datetime(1969,12,31), 
                                               end=datetime(2100,12,31)
                                              )

# Clean data before building machine learning
# Asume that there is no wind if not reported

df.fillna(value = {'maxWind': 0}, inplace = True)

# Eliminate the rows with missing values

df.dropna(inplace=True)
df.describe()

# Define classes for machine learning

class ColumnSelectTransformer(base.BaseEstimator, base.TransformerMixin):
    
    def __init__(self, colnames):
        
        self.cols = colnames
        
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        return np.array(X[self.cols].values.tolist())

  
class DayProcessor(base.BaseEstimator, base.TransformerMixin):
    
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        dates = pd.DatetimeIndex([i[0] for i in X])
        
        holidays = USFederalHolidayCalendar().holidays(start=datetime(1969,12,31), 
                                                       end=datetime(2100,12,31))
        
        a = np.array([x in holidays for x in list(dates)])
        b = np.array([x.dayofweek in [5,6] for x in list(dates)])
        
        return np.stack((a,b), axis=1)
    
class MonthProcessor(base.BaseEstimator, base.TransformerMixin):
    
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        dates = pd.DatetimeIndex([i[0] for i in X])
        
        return [{x.month: 1} for x in list(dates)]
    
class CountyDicGenerator(base.BaseEstimator, base.TransformerMixin):
    
    def fit(self, X, y=None):
        
        return self
    
    def transform(self, X):
        
        return [{''.join(x): 1} for x in X.tolist()]
    
class ThresholdEstimator(base.BaseEstimator, base.RegressorMixin):
    
    def __init__(self, model, r):
        
        self.model = model
        self.r = r
    
    def fit(self, X, y):
        
        self.model.fit(X,y)
        
    def predict(self, X):
        
        return [True if k[1] > self.r else False for k in self.model.predict_proba(X)]
    
day_features = Pipeline([
                         ('date',ColumnSelectTransformer(['startDate'])),
                         ('day',DayProcessor())
                        ])

month_vectorizer = Pipeline([
                             ('data',ColumnSelectTransformer(['startDate'])),
                             ('month',MonthProcessor()),
                             ('vectorizer', DictVectorizer(sparse = False))
                            ])

    
county_vectorizer = Pipeline([
                              ('county',ColumnSelectTransformer(['county'])),
                              ('dict', CountyDicGenerator()),
                              ('vectorizer', DictVectorizer(sparse = False))
                             ])

weather_variables = ['avgHumidity', 'dewPoint', 'maxHumidity', 'maxTemp',
                     'maxWind', 'meanTemp', 'minHumidity', 'minTemp']

label = np.array(df['fire'])

features = FeatureUnion([
        ('date',day_features),
        ('month',month_vectorizer),
        ('county',county_vectorizer),
        ('weather',ColumnSelectTransformer(weather_variables))
    ])


model_final = Pipeline([
                  ('features',features),
                  ('tree',DecisionTreeClassifier(min_samples_leaf = 32))
                 ])
    
estimator = ThresholdEstimator(model_final,0.1)
estimator.fit(df,label)

results = estimator.predict(df)
metrics.confusion_matrix(label, results)

# Save model

joblib.dump(estimator, 'model.pkl', protocol = 1)

['model.pkl']

In [2]:
a = joblib.load('model.pkl')

In [3]:
a == estimator

False

In [4]:
a

ThresholdEstimator(model=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('date', Pipeline(steps=[('date', ColumnSelectTransformer(colnames=None)), ('day', DayProcessor())])), ('month', Pipeline(steps=[('data', ColumnSelectTransformer(colnames=None)), ('month', MonthProcessor()), ('vectorizer', DictVecto...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
          r=0.1)

In [5]:
estimator

ThresholdEstimator(model=Pipeline(steps=[('features', FeatureUnion(n_jobs=1,
       transformer_list=[('date', Pipeline(steps=[('date', ColumnSelectTransformer(colnames=None)), ('day', DayProcessor())])), ('month', Pipeline(steps=[('data', ColumnSelectTransformer(colnames=None)), ('month', MonthProcessor()), ('vectorizer', DictVecto...it=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best'))]),
          r=0.1)