### This notebook is the second and final step toward predicting the event type



__Main Steps__

* _Libraries_: Run requirements.txt to install the required libraries if you haven't already. It is recommended to use a venv.

* _Run first step to generate data_: Run 1_CreateInput.ipynb file to generate the input for our prediction model.

* _Vectorize the data_: The csv data is encoded into vector format.

* _LSTM Model_: The LSTM model is used to perform prediction.

#### Common Imports

In [None]:
from common import cities, city_keys, years, months
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import LSTM, Dense
import pandas as pd
import numpy as np
import os

#### Encoding data for the model and splitting it

In [2]:
def add_sincos_column(df, key, data):
    """Add sine and cosine encodings of ``data`` to ``df`` in place.

    ``data`` is multiplied by 2*pi before the sine and cosine are taken, so
    a value of 1.0 corresponds to one full cycle. The results are written to
    the columns ``{key}_sin`` and ``{key}_cos``. Nothing is returned.
    """
    angle = 2 * np.pi * data
    for suffix, trig in (('sin', np.sin), ('cos', np.cos)):
        df[f'{key}_{suffix}'] = trig(angle)

def parse_time(df: pd.DataFrame):
    """Replace the raw time columns with numeric features and sort the rows.

    The columns 'StartTime', 'EndTime', 'Sunrise' and 'Sunset' are processed
    when present; absent ones are skipped. Each present column is converted
    to datetime if it is not already, then encoded as:

    * ``{column}_seconds_sin`` / ``{column}_seconds_cos``: time of day
      (seconds since midnight divided by 86400), for every time column;
    * ``{column}_day_*`` (day / 31), ``{column}_month_*`` (month / 12) and
      ``{column}_year_norm`` (min-max scaled year), only for columns whose
      name does not contain 'Sun'.

    The sine/cosine pairs are written by ``add_sincos_column``.

    The frame is then sorted by 'LocationLat', 'LocationLng', 'StartTime'
    and 'EndTime' (these four columns must exist) and the raw time columns
    are removed.

    Args:
        df: Input frame. It is copied, so the caller's frame is not modified.

    Returns:
        A new, sorted DataFrame with the encoded features and without the
        raw time columns.
    """
    # Work on a copy: the feature columns below are assigned in place.
    df = df.copy()

    time_cols = ['StartTime', 'EndTime', 'Sunrise', 'Sunset']
    present_cols = [column for column in time_cols if column in df.columns]

    for column in present_cols:
        if not np.issubdtype(df[column].dtype, np.datetime64):
            df[column] = pd.to_datetime(df[column])
        stamps = df[column]

        if 'Sun' not in column:
            add_sincos_column(df, f'{column}_day', (stamps.dt.day / 31))
            add_sincos_column(df, f'{column}_month', (stamps.dt.month / 12))

            year = stamps.dt.year
            year_span = year.max() - year.min()
            if year_span > 0:
                df[f'{column}_year_norm'] = (year - year.min()) / year_span
            else:
                # All rows share one year (or there are no rows): 0/0 would
                # yield NaN and poison the scaler, so use a constant instead.
                df[f'{column}_year_norm'] = 0.0

        seconds = stamps.dt.hour * 3600 + stamps.dt.minute * 60 + stamps.dt.second
        add_sincos_column(df, f'{column}_seconds', (seconds / 86400))

    df = df.sort_values(by=['LocationLat', 'LocationLng', 'StartTime', 'EndTime'])
    # Drop only the columns that exist; the loop above tolerates missing
    # ones, so the cleanup must tolerate them too.
    return df.drop(columns=present_cols)

In [3]:
def split_data(df, train_fraction=0.8):
    """Split ``df`` by row position into training and validation frames.

    The first ``int(len(df) * train_fraction)`` rows form the training
    frame and the remaining rows form the validation frame. Rows are not
    shuffled, and both results get a fresh 0-based index.

    Returns:
        A ``(train_df, val_df)`` tuple.
    """
    n_train = int(train_fraction * len(df))
    leading, trailing = df.iloc[:n_train], df.iloc[n_train:]
    return leading.reset_index(drop=True), trailing.reset_index(drop=True)

In [4]:
def feature_scaling(X_train, X_val):
    """Standardise both feature sets with statistics from the training set.

    A ``StandardScaler`` is fitted on ``X_train`` only and then applied to
    ``X_train`` and ``X_val``.

    Returns:
        A ``(X_train_scaled, X_val_scaled, scaler)`` tuple, where ``scaler``
        is the fitted ``StandardScaler``.
    """
    scaler = StandardScaler().fit(X_train)
    return scaler.transform(X_train), scaler.transform(X_val), scaler

def encode_labels(y_train, y_val):
    """Encode class labels as integers and as one-hot matrices.

    A ``LabelEncoder`` is fitted on ``y_train`` and then applied to
    ``y_val``. Both integer encodings are one-hot encoded with the same
    width, namely the number of classes the encoder learned.

    NOTE(review): ``LabelEncoder.transform`` is expected to raise if
    ``y_val`` holds a label that never occurs in ``y_train`` — confirm this
    is acceptable for the data at hand.

    Returns:
        A ``(y_train_cat, y_val_cat, label_enc, y_train_enc, y_val_enc)``
        tuple: the two one-hot matrices, the fitted encoder, and the two
        integer-encoded label arrays.
    """
    label_enc = LabelEncoder()
    y_train_enc = label_enc.fit_transform(y_train)
    y_val_enc = label_enc.transform(y_val)

    # Fix the one-hot width explicitly. Without it, to_categorical sizes the
    # output from the largest label present, so a validation split lacking
    # the highest class would get fewer columns than the training split.
    num_classes = len(label_enc.classes_)
    y_train_cat = to_categorical(y_train_enc, num_classes=num_classes)
    y_val_cat = to_categorical(y_val_enc, num_classes=num_classes)
    return y_train_cat, y_val_cat, label_enc, y_train_enc, y_val_enc

In [5]:
def create_sequences(X, y, seq_length):
    """Build overlapping sliding windows over ``X`` with matching targets.

    Window ``k`` holds rows ``k`` to ``k + seq_length - 1`` of ``X``; its
    target is the ``y`` entry at the window's last row. Consecutive windows
    are shifted by one row.

    Args:
        X: Array-like of rows; the first axis is the one being windowed.
        y: Targets indexable by row position, or ``None`` to build windows
            only.
        seq_length: Number of rows per window; must be at least 1.

    Returns:
        A ``(sequences_X, sequences_y)`` tuple. ``sequences_X`` has shape
        ``(n_windows, seq_length, *X.shape[1:])``. ``sequences_y`` has one
        entry per window, or is ``None`` when ``y`` is ``None``. If ``X``
        has fewer than ``seq_length`` rows, both arrays are empty but keep
        their trailing dimensions.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be at least 1, got {seq_length}")

    X = np.asarray(X)
    n_windows = len(X) - seq_length + 1

    if n_windows <= 0:
        # Keep the trailing dimensions so callers can still read the
        # feature count from the shape of an empty result.
        sequences_X = np.empty((0, seq_length) + X.shape[1:], dtype=X.dtype)
        if y is None:
            return sequences_X, None
        y_arr = np.asarray(y)
        return sequences_X, np.empty((0,) + y_arr.shape[1:], dtype=y_arr.dtype)

    sequences_X = np.array([X[start:start + seq_length] for start in range(n_windows)])
    if y is None:
        return sequences_X, None
    sequences_y = np.array([y[last] for last in range(seq_length - 1, len(X))])
    return sequences_X, sequences_y

#### Training and evaluating the model

In [6]:
def build_model(input_shape, num_classes):
    """Create and compile the LSTM classifier.

    The network is a 64-unit LSTM layer followed by a softmax ``Dense``
    layer with ``num_classes`` outputs. It is compiled with categorical
    cross-entropy loss, the Adam optimizer and an accuracy metric.

    Args:
        input_shape: Shape passed to the LSTM layer as ``input_shape``.
        num_classes: Number of output units of the final layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    layers = [
        LSTM(64, input_shape=input_shape),
        Dense(num_classes, activation='softmax'),
    ]
    model = Sequential(layers)
    model.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

def train_model(model, X_train, y_train, X_val, y_val, epochs=50, batch_size=64):
    """Fit ``model`` on the training data while validating on ``(X_val, y_val)``.

    Delegates to ``model.fit`` with ``verbose=1`` and returns whatever that
    call returns.
    """
    fit_options = {
        'epochs': epochs,
        'batch_size': batch_size,
        'validation_data': (X_val, y_val),
        'verbose': 1,
    }
    return model.fit(X_train, y_train, **fit_options)

def save_model(model, file_path):
    """Save ``model`` by calling its ``save`` method with ``file_path``."""
    model.save(file_path)

def load_saved_model(file_path):
    """Return the model that Keras ``load_model`` reads from ``file_path``."""
    return load_model(file_path)

def evaluate_model(model, X_test, y_test_cat, y_test_enc, label_enc, file_name):
    """Write a classification report for ``model`` on ``X_test`` to a file.

    Predicted classes are the argmax of ``model.predict(X_test)`` along
    axis 1. The true classes are the trailing entries of ``y_test_enc``,
    as many as there are predictions. The report covers every class known
    to ``label_enc`` and is written to ``file_name``, overwriting it.

    ``y_test_cat`` is accepted but not used.
    """
    predicted = np.argmax(model.predict(X_test), axis=1)

    # Keep only the last len(predicted) labels so both arrays line up.
    first_kept = len(y_test_enc) - len(predicted)
    actual = y_test_enc[first_kept:]

    with open(file_name, "w") as file:
        class_ids = np.arange(len(label_enc.classes_))
        class_names = label_enc.classes_.astype(str)
        report = classification_report(
            actual,
            predicted,
            labels=class_ids,
            target_names=class_names,
        )
        file.write(report)

#### Creating default directories if they do not already exist

In [7]:
# Create the model and report directories. exist_ok=True makes this a no-op
# when they already exist and avoids the check-then-create race.
for path in ['../data/output', '../data/print']:
    os.makedirs(path, exist_ok=True)

#### Predicting each city individually

In [None]:
# Train, save and evaluate one LSTM model per city.
# The window length is the same for every city, so it is set once here.
seq_length = 10

for city in city_keys:
    # Shared stem for this city's input CSV, saved model and report file.
    file_stem = f'{city}_{years[0]}{months[0]}01_{years[-1]}{months[-1]}01'
    file_path = f'../data/input/{file_stem}.csv'
    df = pd.read_csv(file_path)

    if df.empty:
        print(f"{city} is empty...??")
        continue

    df = parse_time(df)
    train_df, val_df = split_data(df)

    # Each split needs at least one full window. Otherwise no sequences can
    # be built and the shape lookup below would fail, aborting every
    # remaining city as well.
    if min(len(train_df), len(val_df)) < seq_length:
        print(f"{city} has too few rows for sequences of length {seq_length}, skipping")
        continue

    X_train_df = train_df.drop('Type', axis=1)
    y_train_series = train_df['Type']
    X_val_df = val_df.drop('Type', axis=1)
    y_val_series = val_df['Type']
    X_train_scaled, X_val_scaled, scaler = feature_scaling(X_train_df.values, X_val_df.values)
    y_train_cat, y_val_cat, label_enc, y_train_enc, y_val_enc = encode_labels(y_train_series.values, y_val_series.values)

    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_cat, seq_length)
    X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_cat, seq_length)

    model = build_model(input_shape=(seq_length, X_train_seq.shape[2]), num_classes=y_train_cat.shape[1])
    train_model(model, X_train_seq, y_train_seq, X_val_seq, y_val_seq, epochs=50, batch_size=64)
    model_path = f"../data/output/{file_stem}.h5"
    print_path = f"../data/print/{file_stem}.txt"
    save_model(model, model_path)
    evaluate_model(model, X_val_seq, y_val_seq, y_val_enc, label_enc, print_path)

#### Treating all cities as a single entity for comparison

In [None]:
# Train a single model on the data of all cities combined, for comparison
# with the per-city models.
period_tag = f'{years[0]}{months[0]}01_{years[-1]}{months[-1]}01'

# Collect only the cities that actually contain rows.
city_frames = []
for city in city_keys:
    file_path = f'../data/input/{city}_{period_tag}.csv'
    city_df = pd.read_csv(file_path)
    if not city_df.empty:
        city_frames.append(city_df)

if not city_frames:
    # A bare `exit` does nothing and pd.concat fails on an empty list, so
    # the pipeline is skipped explicitly when there is no data.
    print("everything is empty...??")
else:
    df = pd.concat(city_frames, ignore_index=True)
    df = parse_time(df)
    train_df, val_df = split_data(df)
    X_train_df = train_df.drop('Type', axis=1)
    y_train_series = train_df['Type']
    X_val_df = val_df.drop('Type', axis=1)
    y_val_series = val_df['Type']
    X_train_scaled, X_val_scaled, scaler = feature_scaling(X_train_df.values, X_val_df.values)
    y_train_cat, y_val_cat, label_enc, y_train_enc, y_val_enc = encode_labels(y_train_series.values, y_val_series.values)

    seq_length = 10
    X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train_cat, seq_length)
    X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val_cat, seq_length)

    model = build_model(input_shape=(seq_length, X_train_seq.shape[2]), num_classes=y_train_cat.shape[1])
    train_model(model, X_train_seq, y_train_seq, X_val_seq, y_val_seq, epochs=50, batch_size=64)
    model_path = f"../data/output/{period_tag}.h5"
    print_path = f"../data/print/{period_tag}.txt"
    save_model(model, model_path)
    evaluate_model(model, X_val_seq, y_val_seq, y_val_enc, label_enc, print_path)