In [1]:
import pandas as pd

from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer

In [2]:
from tensorflow import keras
from tensorflow.keras import layers

In [3]:
# Raise pandas' display limits so wide/long frames are shown without truncation.
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [4]:
# Read the processed dataset from the zipped CSV (compression is inferred from
# the file extension by pandas), then show (rows, columns) as the cell output.
df = pd.read_csv(filepath_or_buffer='dataset_processed_v2.csv.zip')
df.shape

(1286916, 27)

# Merge TS Features

In [5]:
# Left-join two lookup tables onto df.
# validate='many_to_one' makes pandas raise MergeError when the right-hand
# table contains duplicate join keys; without it, duplicates would silently
# multiply rows of df and inflate the training set.

# Event-level table, keyed by event_date.
# NOTE(review): presumably the source of the fill_rate_lag_* columns - confirm.
event_level_fill_df = pd.read_csv('event_level_fill_rate.csv')
df = df.merge(event_level_fill_df, on='event_date', how='left', validate='many_to_one')

# Seat-level table, keyed by (event_date, SeatUniqueID).
# NOTE(review): presumably the source of the isAttended_lagg_* columns - confirm.
seat_level_df = pd.read_csv('seat_level_lagged_isattended.csv.zip')
df = df.merge(seat_level_df, on=['event_date', 'SeatUniqueID'], how='left', validate='many_to_one')

In [6]:
# Name of the label column.
target_variable = 'isAttended'

# Columns routed through the numeric preprocessing branch.
numerical_features = [
    'Price',
    'Tenure',
    'event_sale_date_diff',
    'sale_resale_date_diff',
    'ResalePrice',
    'event_resale_date_diff',
    *[f'fill_rate_lag_{lag}' for lag in range(1, 4)],
]

# Columns routed through the categorical (one-hot) preprocessing branch.
categorical_features = [
    'PC1',
    'paid',
    'status',
    'isHost',
    'TicketClass',
    'Term',
    'Season',
    'Resold',
    'isSTM',
    'acct_type_desc_processed',
    'plan_event_name_processed',
    'comp_name_processed',
    'class_name_processed',
    'TicketType_processed',
    *[f'isAttended_lagg_{lag}' for lag in range(1, 4)],
    'SeatType',
]

In [7]:
# Cast every categorical column to strings and report its cardinality.
# Missing entries are kept as NaN rather than None: SimpleImputer's default
# missing_values=np.nan does not match None inside object arrays, so None
# values would skip the constant "NA" fill applied later in the pipeline.
# The cast is vectorised (astype + where) instead of a per-element lambda.
for col in categorical_features:
    is_present = df[col].notna()
    # astype(str) would turn NaN into the text 'nan'; where() restores NaN there.
    df[col] = df[col].astype(str).where(is_present)
    print(col, df[col].nunique())

PC1 31
paid 3
status 4
isHost 2
TicketClass 5
Term 5
Season 2
Resold 1
isSTM 2
acct_type_desc_processed 8
plan_event_name_processed 3
comp_name_processed 2
class_name_processed 11
TicketType_processed 11
isAttended_lagg_1 2
isAttended_lagg_2 2
isAttended_lagg_3 2
SeatType 2


In [8]:
# Numeric branch: fill the few missing values with the column mean, then
# standardise.
transformer_num = make_pipeline(SimpleImputer(strategy="mean"), StandardScaler())

# Categorical branch: fill missing values with the constant "NA", then one-hot
# encode; categories unseen during fit are ignored at transform time.
transformer_cat = make_pipeline(
    SimpleImputer(fill_value="NA", strategy="constant"),
    OneHotEncoder(handle_unknown='ignore'),
)

# Route each feature group through its own branch.
column_branches = [
    (transformer_num, numerical_features),
    (transformer_cat, categorical_features),
]
preprocessor = make_column_transformer(*column_branches)

In [9]:
# Rows whose label is the '???' placeholder go to test_df; every other row
# goes to train_df. Copies are taken so later edits do not touch df.
is_unlabeled = df['isAttended'] == '???'
train_df = df[~is_unlabeled].copy()
test_df = df[is_unlabeled].copy()

In [10]:
# Encode the Yes/No label as integers 1/0.
# An explicit mapping plus cast replaces Series.replace, whose silent
# object -> int downcasting is deprecated in pandas 2.2; without the downcast
# the column would stay object dtype, which Keras cannot consume.
# The 1/0 keys keep the cell safe to re-run on already-encoded labels.
label_codes = {'Yes': 1, 'No': 0, 1: 1, 0: 0}
encoded_labels = train_df['isAttended'].map(label_codes)

# Anything outside the mapping (including missing labels) becomes NaN; fail
# loudly instead of training on corrupt targets.
unmapped = encoded_labels.isna()
if unmapped.any():
    unexpected_labels = train_df.loc[unmapped, 'isAttended'].unique()
    raise ValueError(f"Unexpected isAttended labels: {unexpected_labels!r}")

train_df['isAttended'] = encoded_labels.astype('int64')

In [11]:
# Feature matrix: numeric columns first, then categorical ones, matching the
# column groups handed to the preprocessor. Target: the label column.
feature_columns = [*numerical_features, *categorical_features]
X = train_df[feature_columns]
y = train_df[target_variable]

In [12]:
# stratify - make sure classes are evenlly represented across splits
X_train, X_valid, y_train, y_valid = train_test_split(X, y, stratify=y, train_size=0.75, random_state=99)

X_train = preprocessor.fit_transform(X_train)
X_valid = preprocessor.transform(X_valid)

input_shape = [X_train.shape[1]]

In [13]:
# Display the transformed training matrix (its repr reports shape, dtype and storage format).
X_train

<868671x116 sparse matrix of type '<class 'numpy.float64'>'
	with 23357629 stored elements in Compressed Sparse Row format>

In [16]:
def _as_dense_float32(matrix):
    """Return `matrix` as a dense float32 array.

    Accepts either a scipy sparse matrix or an already-dense ndarray, so the
    cell can be re-run safely (a plain ndarray has no `.toarray()`).
    The cast happens before densifying so the large dense copy is allocated
    at half the size of float64; Keras computes in float32 by default anyway.
    """
    matrix = matrix.astype('float32')
    return matrix.toarray() if hasattr(matrix, 'toarray') else matrix


# Densify the preprocessed feature matrices before passing them to Keras.
X_train = _as_dense_float32(X_train)
X_valid = _as_dense_float32(X_valid)

In [17]:
from tensorflow import keras
from tensorflow.keras import layers

# Feed-forward binary classifier: normalised inputs, two 256-unit ReLU blocks
# (each followed by batch normalisation and 30% dropout), and a single sigmoid
# output unit.
# The input shape is declared with an explicit keras.Input rather than an
# `input_shape=` argument on the first layer, which Keras 3 deprecates for
# Sequential models.
model = keras.Sequential([
    keras.Input(shape=tuple(input_shape)),
    layers.BatchNormalization(),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(256, activation='relu'),
    layers.BatchNormalization(),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])

In [18]:
# Training configuration: Adam optimizer, binary cross-entropy loss, and
# binary accuracy reported as the metric.
compile_settings = {
    'optimizer': 'adam',
    'loss': 'binary_crossentropy',
    'metrics': ['binary_accuracy'],
}
model.compile(**compile_settings)

In [None]:
# Stop training once the monitored quantity (the callback's default) has not
# improved by at least 0.001 for 5 consecutive epochs, and roll back to the
# best weights seen.
early_stopping = keras.callbacks.EarlyStopping(
    min_delta=0.001,
    patience=5,
    restore_best_weights=True,
)

# Train for up to 200 epochs in batches of 512, evaluating on the validation
# split after each epoch; early stopping normally ends the run sooner.
fit_options = dict(
    validation_data=(X_valid, y_valid),
    batch_size=512,
    epochs=200,
    callbacks=[early_stopping],
)
history = model.fit(X_train, y_train, **fit_options)

In [None]:
# Disabled training-curve plots: the lines below would load the per-epoch
# metrics recorded by model.fit into a DataFrame and plot loss and accuracy
# for the training and validation sets. Uncomment once `history` exists.
# history_df = pd.DataFrame(history.history)
# history_df.loc[:, ['loss', 'val_loss']].plot(title="Cross-entropy")
# history_df.loc[:, ['binary_accuracy', 'val_binary_accuracy']].plot(title="Accuracy")