# Multiple logistic regression

First, let's try a regression model. We will need to break the data into input/output pairs where the input is the disaster vector for a given state and month and the output is the next month's disaster vector for the same state.

In [1]:
# Standard library imports
from typing import Tuple

# PyPI imports
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression

# Input data
data_file='../data/resampled_disaster_data_1998-current.parquet'

## 1. Data loading

In [2]:
data_df=pd.read_parquet(data_file)
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16384 entries, 0 to 16383
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   state            16384 non-null  object
 1   Coastal Storm    16384 non-null  int32 
 2   Dam/Levee Break  16384 non-null  int32 
 3   Drought          16384 non-null  int32 
 4   Fire             16384 non-null  int32 
 5   Flood            16384 non-null  int32 
 6   Hurricane        16384 non-null  int32 
 7   Mud/Landslide    16384 non-null  int32 
 8   Severe Storm     16384 non-null  int32 
 9   Tornado          16384 non-null  int32 
 10  Tropical Storm   16384 non-null  int32 
 11  Tsunami          16384 non-null  int32 
 12  Typhoon          16384 non-null  int32 
 13  Winter weather   16384 non-null  int32 
 14  No incident      16384 non-null  int32 
 15  year             16384 non-null  int32 
 16  month            16384 non-null  int32 
dtypes: int32(16), object(1)
memory 

In [3]:
indexes=[
    data_df['year'].tolist(),
    data_df['state'].tolist(),
    data_df['month'].tolist()
]

index_tuples=list(zip(*indexes))
index=pd.MultiIndex.from_tuples(index_tuples, names=['Year', 'State', 'Month'])
data_df.set_index(index, inplace=True)

# Drop unnecessary columns
data_df.drop(['year', 'state', 'month'], axis=1, inplace=True)
data_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Coastal Storm,Dam/Levee Break,Drought,Fire,Flood,Hurricane,Mud/Landslide,Severe Storm,Tornado,Tropical Storm,Tsunami,Typhoon,Winter weather,No incident
Year,State,Month,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1998,AK,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1998,AK,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1998,AK,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1998,AK,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1998,AK,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 3. Data generator function

To test model performance we need a function to yield batches of data for regression modeling.

In [4]:
def generate_data(data_df: pd.DataFrame, input_window: int) -> Tuple[np.array, np.array]:
    '''Takes dataframe, input window size, parses data into feature label pairs,
    returns as tuple of numpy arrays'''

    # Get list of years
    years=data_df.index.get_level_values('Year').unique().tolist()

    features=[]
    labels=[]

    # Loop on seasons
    for year in years:

        # Extract the data for this year
        year_df=data_df.loc[(year)]
        
        # Get the list of player for this season
        states=year_df.index.get_level_values('State').unique().tolist()

        # loop on the players
        for state in states:

            # Extract the data for this player
            state_df=year_df.loc[(state)]

            # Indexing variable for batch
            input_start_index=0

            # Loop on the player data
            while input_start_index + input_window + 1 < len(state_df):

                # Extract and collect the features and labels
                feature_row=state_df.iloc[input_start_index:input_start_index + input_window]
                label_row=state_df.iloc[input_start_index + input_window]
                features.append(feature_row.values.tolist())
                labels.append(label_row.values.tolist())

                # Update the index
                input_start_index+=input_window + 1

    # Convert to numpy arrays
    features=np.array(features)
    labels=np.array(labels)

    # Squeeze out the extra dimension for window width of 1
    if input_window == 1:
        features=features.squeeze(axis=1)

    return features, labels

## 3. Logistic regression model

In [5]:
# Generate some feature, label pairs
input_window=1
features, labels=generate_data(data_df, input_window)

# Split them into training and validation
training_features, testing_features, training_labels, testing_labels=train_test_split(features, labels)

print(f'Features: {training_features.shape}')
print(f'Labels: {training_labels.shape}')

Features: (5116, 14)
Labels: (5116, 14)


In [6]:
features[1]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [None]:
model=MultiOutputClassifier(LogisticRegression()).fit(training_features, training_labels)
predictions=model.predict(testing_features)