In [None]:
#This notebook should run on Google Colab
#Run this section on Colab, and then restart runtime
!sudo apt-get install build-essential swig 
!pip install auto-sklearn==0.11.1

In [None]:
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder
import numpy as np

## Monthly wildfire frequency by state ##

In [None]:
# The input CSV lives on Google Drive, so Drive has to be mounted first.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
# Load the filtered ETL output from the mounted Google Drive.
CSV_PATH = '/content/drive/MyDrive/Colab Notebooks/733project/etl_filtered_v1.csv'
df = pd.read_csv(CSV_PATH)

# Derive the calendar month of every record from its 'datetime' column.
record_timestamps = pd.DatetimeIndex(df['datetime'])
df['month'] = record_timestamps.month
df

In [None]:
# Display the column labels of the loaded dataset (including the derived 'month').
df.columns

In [None]:
# Mean population density per state. Rows whose density is negative (or missing,
# since NaN >= 0 evaluates to False) are left out of the average.
has_valid_density = df['population_density'] >= 0
mean_pd = (
    df[has_valid_density]
    .groupby('STATE')['population_density']
    .mean()
    .reset_index()
)

# Per (month, STATE): number of records plus the mean of TMAX / TMIN / AWND.
monthly_aggregations = {'month': ['count'], 'TMAX': 'mean', 'TMIN': 'mean', 'AWND': 'mean'}
ml_df = (
    df[['month', 'STATE', 'TMAX', 'TMIN', 'AWND']]
    .groupby(['month', 'STATE'])
    .agg(monthly_aggregations)
    .reset_index()
)
# Flatten the two-level column labels into single strings such as 'month count'
# and 'TMAX mean'; strip() leaves the group keys with their plain names.
ml_df.columns = [' '.join(label_parts).strip() for label_parts in ml_df.columns.values]

# Attach the state-level density, then append one indicator column per state.
merged = ml_df.merge(mean_pd, how='inner', on='STATE')
encoding = pd.get_dummies(merged['STATE'])
merged = merged.join(encoding)
merged

In [None]:
import autosklearn.regression 
from sklearn.model_selection import train_test_split

In [None]:
import shutil

# Target: the per-(month, STATE) record count built in the aggregation cell.
y = merged['month count'].to_frame()
# Features: everything except the target and the raw STATE label (the state is
# already represented by its indicator columns).
X = merged.drop(['month count', 'STATE'], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# auto-sklearn refuses to start when its tmp_folder already exists, and /content
# survives a runtime restart, so re-running this cell would otherwise fail.
# The folder only holds auto-sklearn scratch data, so it is cleared first.
AUTOML_TMP_FOLDER = '/content/tmp'
shutil.rmtree(AUTOML_TMP_FOLDER, ignore_errors=True)

automl = autosklearn.regression.AutoSklearnRegressor(
    time_left_for_this_task=120,  # overall search budget, in seconds
    per_run_time_limit=30,        # budget for a single model fit, in seconds
    tmp_folder=AUTOML_TMP_FOLDER
)
automl.fit(X_train, y_train)

In [None]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that `sklearn.metrics` has been loaded.
import sklearn.metrics

# R2 on the data the ensemble was fitted on.
train_predictions = automl.predict(X_train)
print("Training R2 score:", sklearn.metrics.r2_score(y_train, train_predictions))

# R2 on the held-out 20% split; kept in its own variable so the training
# predictions are not overwritten.
test_predictions = automl.predict(X_test)
print("Testing R2 score:", sklearn.metrics.r2_score(y_test, test_predictions))

In [None]:
# Display the models that make up the ensemble auto-sklearn ended up with.
automl.show_models()

In [None]:
# Prediction helpers: fit a linear model that estimates the mean AWND of a
# (FIRE_YEAR, month, STATE) combination.
from sklearn.linear_model import LinearRegression

# Columns shared by the AWND / TMAX / TMIN models.
weather_columns = ['FIRE_YEAR', 'month', 'STATE', 'AWND', 'TMAX', 'TMIN']
temp = df[weather_columns]

# Mean AWND for every (year, month, state) group.
awnd_group_keys = ['FIRE_YEAR', 'month', 'STATE']
AWND_predict = temp.groupby(awnd_group_keys).agg({'AWND': 'mean'}).reset_index()

# Append one indicator column per state.
encoding = pd.get_dummies(AWND_predict['STATE'])
AWND_predict = AWND_predict.join(encoding)

# Regress AWND on year, month and the state indicators.
y = AWND_predict['AWND']
x = AWND_predict.drop(columns=['STATE', 'AWND'])
AWND_model = LinearRegression()
AWND_model.fit(x, y)

In [None]:
# Mean TMAX for every (FIRE_YEAR, month, STATE) group, keys restored as columns.
TMAX_predict = (
    temp.groupby(['FIRE_YEAR', 'month', 'STATE'])['TMAX']
    .mean()
    .reset_index()
)

# Append one indicator column per state.
encoding = pd.get_dummies(TMAX_predict['STATE'])
TMAX_predict = TMAX_predict.join(encoding)

# Regress TMAX on year, month and the state indicators.
y = TMAX_predict['TMAX']
x = TMAX_predict.drop(columns=['STATE', 'TMAX'])
TMAX_model = LinearRegression()
TMAX_model.fit(x, y)

In [None]:
# Mean TMIN for every (FIRE_YEAR, month, STATE) group, keys restored as columns.
tmin_by_group = temp.groupby(['FIRE_YEAR', 'month', 'STATE'])
TMIN_predict = tmin_by_group.agg({'TMIN': 'mean'}).reset_index()

# Append one indicator column per state.
encoding = pd.get_dummies(TMIN_predict['STATE'])
TMIN_predict = TMIN_predict.join(encoding)

# Regress TMIN on year, month and the state indicators.
y = TMIN_predict['TMIN']
x = TMIN_predict.drop(columns=['STATE', 'TMIN'])
TMIN_model = LinearRegression()
TMIN_model.fit(x, y)

In [None]:
# Build the 2022 prediction grid: one row per (year, month, state).
# dropna() keeps a missing STATE value from breaking the sort (a float NaN
# cannot be ordered against strings) and from adding a bogus state to the grid.
states = sorted(df['STATE'].dropna().unique())
year = [2022]
months = list(range(1, 13))
year_df = pd.DataFrame(year, columns=['FIRE_YEAR'])
month_df = pd.DataFrame(months, columns=['month_'])
State_df = pd.DataFrame(states, columns=['STATE'])
# Cross joins produce every year x month x state combination.
raw = year_df.merge(month_df, how='cross')
raw = raw.merge(State_df, how='cross')

encoding = pd.get_dummies(raw.STATE)
input_df = raw.join(encoding)
input_df = input_df.drop(['STATE'], axis=1)
# The weather models were fitted on a column called 'month'. `raw` keeps
# 'month_' for the later cells, but the model input must use the fitted name:
# scikit-learn >= 1.0 warns and >= 1.2 raises when feature names differ
# between fit and predict.
input_df = input_df.rename(columns={'month_': 'month'})
input_df

In [None]:
# Display the (FIRE_YEAR, month_, STATE) grid the 2022 predictions are made for.
raw

In [None]:
# Estimate the 2022 AWND / TMIN / TMAX of every grid row with the linear
# models fitted above; all three are fed the same input frame.
AWND_2022, TMIN_2022, TMAX_2022 = (
    weather_model.predict(input_df)
    for weather_model in (AWND_model, TMIN_model, TMAX_model)
)

In [None]:
# Attach the predicted 2022 weather values to the prediction grid.
raw['TMAX_mean'] = TMAX_2022
raw['TMIN_mean'] = TMIN_2022
raw['AWND_mean'] = AWND_2022

# Mean population density per state; negative or missing densities are excluded.
mean_pd = df[df['population_density']>=0].groupby('STATE')['population_density'].mean().reset_index()

# NOTE: `raw` is deliberately reassigned to the merged frame, because the export
# cell adds the predictions to `raw` and needs its rows to line up with
# `input_2022`. Re-running this cell without rebuilding `raw` first will not
# reproduce the same frame.
raw = raw.merge(mean_pd, how='inner', on='STATE')

# Indicator columns are computed after the merge so they follow its row order.
input_2022 = raw.join(pd.get_dummies(raw.STATE))
input_2022 = input_2022.drop(['STATE', 'FIRE_YEAR'], axis=1)
# Use the same feature names the regressor was trained on ('month', 'TMAX mean',
# ...); the column order already matches the training frame.
input_2022 = input_2022.rename(columns={
    'month_': 'month',
    'TMAX_mean': 'TMAX mean',
    'TMIN_mean': 'TMIN mean',
    'AWND_mean': 'AWND mean',
})
input_2022

In [None]:
def _as_fire_count(prediction):
    """Truncate a non-negative prediction to an int; anything else becomes 0."""
    return int(prediction) if prediction >= 0 else 0


# Predict the 2022 wildfire frequency for every grid row.
frequency_2022 = automl.predict(input_2022)
raw['frequency'] = frequency_2022
raw['frequency'] = raw['frequency'].map(_as_fire_count)

# Export the per-month, per-state forecast (the row index is written as well).
export_columns = ['FIRE_YEAR', 'month_', 'STATE', 'frequency']
raw[export_columns].to_csv('frequency_2022.csv')