In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np

In [2]:
dtypes = {
    'elapsed_time': np.int32,
    'event_name': 'category', 
    'name': 'category',
    'level': 'category',
    'room_coor_x': np.float32,
    'room_coor_y': np.float32,
    'screen_coor_x': np.float32,
    'screen_coor_y': np.float32,
    'hover_duration': np.float32,
    'text': 'category',
    'fqid': 'category',
    'room_fqid': 'category',
    'text_fqid': 'category',
    'fullscreen': 'category',
    'hq': 'category',
    'music': 'category',
    'level_group': 'category'
}
df = pd.read_csv('data/train.csv', dtype=dtypes)
print(df.head())
print(df.shape)

          session_id  index  elapsed_time      event_name   name level  page  \
0  20090312431273200      0             0  cutscene_click  basic     0   NaN   
1  20090312431273200      1          1323    person_click  basic     0   NaN   
2  20090312431273200      2           831    person_click  basic     0   NaN   
3  20090312431273200      3          1147    person_click  basic     0   NaN   
4  20090312431273200      4          1863    person_click  basic     0   NaN   

   room_coor_x  room_coor_y  screen_coor_x  screen_coor_y  hover_duration  \
0  -413.991394  -159.314682          380.0          494.0             NaN   
1  -413.991394  -159.314682          380.0          494.0             NaN   
2  -413.991394  -159.314682          380.0          494.0             NaN   
3  -413.991394  -159.314682          380.0          494.0             NaN   
4  -412.991394  -159.314682          381.0          494.0             NaN   

                            text    fqid                

In [3]:
df.set_index(['session_id', 'index'], inplace=True)

In [4]:
df = df[['event_name', 'name', 'level', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']]
for col in ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())
    df[col] = df[col].fillna(0)

In [5]:
import sklearn


class OneHotEncoding(sklearn.base.TransformerMixin):
    def __init__(self, dtypes=None):
        self.input_columns = None
        self.final_columns = None
        if dtypes is None:
            dtypes = [object, 'category']
        self.dtypes = dtypes

    def fit(self, X, y=None, **kwargs):
        self.input_columns = list(X.select_dtypes(self.dtypes).columns)
        X = pd.get_dummies(X, columns=self.input_columns)
        self.final_columns = X.columns
        return self
        
    def transform(self, X, y=None, **kwargs):
        X = pd.get_dummies(X, columns=self.input_columns)
        X_columns = X.columns
        # if columns in X had values not in the data set used during
        # fit add them and set to 0
        missing = set(self.final_columns) - set(X_columns)
        for c in missing:
            X[c] = 0
        # remove any new columns that may have resulted from values in
        # X that were not in the data set when fit
        return X[self.final_columns]
    
    def get_feature_names(self):
        return tuple(self.final_columns)

In [6]:
oneHotEncoding = OneHotEncoding()
df = oneHotEncoding.fit_transform(df)
df.shape

(13174211, 45)

In [7]:
grouped_data = df.groupby('session_id').apply(lambda x: np.array(x))
grouped_data

session_id
20090312431273200    [[0.4850341, 0.519125, 0.19832985, 0.34329396,...
20090312433251036    [[0.49087286, 0.6860453, 0.20824635, 0.1737317...
20090314121766812    [[0.482883, 0.6737315, 0.19467641, 0.18624045,...
20090314363702160    [[0.589824, 0.64226294, 0.3763048, 0.21820709,...
20090314441803444    [[0.49609697, 0.5519618, 0.217119, 0.30993745,...
                                           ...                        
22100215342220508    [[0.5649326, 0.4650811, 0.33402923, 0.39819318...
22100215460321130    [[0.48160797, 0.7014448, 0.20563674, 0.1688672...
22100217104993650    [[0.4819611, 0.65799725, 0.19311064, 0.2022237...
22100219442786200    [[0.48564872, 0.50339067, 0.19937369, 0.359277...
22100221145014656    [[0.42572483, 0.6573131, 0.09759916, 0.2029187...
Length: 11779, dtype: object

In [8]:
from torch.utils.data import Dataset, DataLoader

class MyDataset(Dataset):
    def __init__(self, data):
        self.data = data
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        # Get the numpy array at the given index
        return torch.from_numpy(self.data[idx]).float()

In [9]:
def collate_fn_padd(batch):
    """
    Padds batch of variable length

    Note: it converts things ToTensor manually here since the ToTensor transform
    assume it takes in images rather than arbitrary tensors.
    """
    ## Get sequence lengths
    lengths = [t.shape[0] for t in batch]
    try:
        n_features = batch[0].shape[1]
    except:
        n_features = 1
    max_length = max(lengths)
    if max_length == 0:
        max_length += 1
    batch_size = len(lengths)

    padded_tensor = torch.zeros(batch_size, max_length, n_features, dtype=torch.float32)
    for i, val in enumerate(batch):
        l = lengths[i]
        if n_features == 1:
            padded_tensor[i, :l] = val.reshape(-1, 1)
        else:
            padded_tensor[i, :l] = val
    
    return padded_tensor

In [10]:
dataset = MyDataset(grouped_data.values)

# Create a PyTorch DataLoader
dataloader = DataLoader(dataset, batch_size=32, shuffle=True, collate_fn=collate_fn_padd)

In [11]:
label_df = pd.read_csv('data/train_labels.csv')
label_df['session'] = label_df.session_id.apply(lambda x: int(x.split('_')[0]) )
label_df['question_idx'] = label_df.session_id.apply(lambda x: int(x.split('_')[-1][1:]) )
label_df.drop("session_id", axis=1, inplace=True)
pivoted_questions = label_df.pivot(columns='question_idx', values='correct', index='session')
pivoted_questions['total_score'] = pivoted_questions.iloc[:, 0:18].sum(axis=1)
pivoted_questions.columns = [f'q_{i}' for i in range(1, 19)] + ['total_score']
pivoted_questions

Unnamed: 0_level_0,q_1,q_2,q_3,q_4,q_5,q_6,q_7,q_8,q_9,q_10,q_11,q_12,q_13,q_14,q_15,q_16,q_17,q_18,total_score
session,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
20090312431273200,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,0,1,1,16
20090312433251036,0,1,1,1,0,1,1,0,1,0,0,1,0,1,0,1,0,1,10
20090314121766812,1,1,1,0,0,1,1,0,0,1,1,1,0,1,1,1,0,1,12
20090314363702160,1,1,1,1,1,1,1,1,1,1,1,1,0,1,0,0,1,1,15
20090314441803444,1,1,1,1,1,1,0,1,0,1,0,1,0,1,1,1,1,1,14
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22100215342220508,1,1,1,1,1,1,1,0,1,1,1,1,0,1,1,1,1,1,16
22100215460321130,0,1,1,1,0,1,1,0,1,0,1,1,0,1,0,1,1,1,12
22100217104993650,1,1,1,1,1,1,1,1,1,0,1,1,1,1,0,0,1,1,15
22100219442786200,0,1,1,1,1,1,1,0,1,0,1,1,0,1,0,1,1,1,13


In [12]:
# Define the LSTM model
class StackedLSTM(nn.Module):
    def __init__(self, n_layers, n_hidden, n_features, n_embeddings):
        super(StackedLSTM, self).__init__()
        self.embedding = nn.Linear(n_features, n_embeddings)
        self.lstm = nn.LSTM(n_embeddings, n_hidden, n_layers, batch_first=True)
        self.linear = nn.Linear(n_hidden, 18)
        
    def forward(self, x):
        # Pass the input through the Embedding layer
        embed_out = self.embedding(x)

        # Pass the input through the LSTM layers
        lstm_out, _ = self.lstm(embed_out)

        # Get only the last output of the LSTM layer
        out = lstm_out[:, -1, :]
        
        # Flatten the LSTM output and pass it through the linear layer
        out = self.linear(out)
        
        # Apply sigmoid activation function to the output
        out = torch.sigmoid(out)
        
        return out

# Create an instance of the model
n_layers = 5  # Number of LSTM layers
n_hidden = 32  # Number of LSTM units
n_embeddings = 16 # Number of dimension in embedding layer
n_features = 45  # Number of features in each sequence

device = 'mps' if torch.backends.mps.is_available() else 'cpu'
model = StackedLSTM(n_layers, n_hidden, n_features, n_embeddings).to(device)

In [13]:
from tqdm import tqdm

# Define number of output labels (number of questions)
n_out = 18

# Define the batch size
batch_size = 32

# Define the number of epochs
n_epochs = 50

# Data size
n_samples = len(grouped_data)

# Define the loss function and optimizer
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Train the model
model.train()
for epoch in range(n_epochs):
    for i, sample in tqdm(enumerate(dataloader)):
        model.zero_grad()
        
        # Get label
        labels = torch.from_numpy(pivoted_questions.iloc[i*batch_size:(i+1)*batch_size, :18].values).float()
        
        sample = sample.to(device)
        labels = labels.to(device)
        
        # Forward pass
        outputs = model(sample)

        # Compute the loss
        loss = criterion(outputs, labels)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        sample = sample.to('cpu')
        labels = labels.to('cpu')
        
    # Print the loss after every epoch
    print(f'Epoch {epoch+1}/{n_epochs}, Loss: {loss.item():.4f}')

369it [02:36,  2.35it/s]


Epoch 1/50, Loss: 0.5515


369it [02:34,  2.38it/s]


Epoch 2/50, Loss: 0.5523


369it [02:35,  2.38it/s]


Epoch 3/50, Loss: 0.5525


369it [02:34,  2.39it/s]


Epoch 4/50, Loss: 0.5525


369it [02:34,  2.38it/s]


Epoch 5/50, Loss: 0.5525


369it [02:33,  2.40it/s]


Epoch 6/50, Loss: 0.5529


369it [02:30,  2.45it/s]


Epoch 7/50, Loss: 0.5522


369it [02:31,  2.44it/s]


Epoch 8/50, Loss: 0.5523


369it [02:31,  2.44it/s]


Epoch 9/50, Loss: 0.5523


369it [02:30,  2.46it/s]


Epoch 10/50, Loss: 0.5523


369it [02:30,  2.46it/s]


Epoch 11/50, Loss: 0.5522


369it [02:30,  2.46it/s]


Epoch 12/50, Loss: 0.5523


369it [02:31,  2.44it/s]


Epoch 13/50, Loss: 0.5523


369it [02:30,  2.45it/s]


Epoch 14/50, Loss: 0.5523


369it [02:31,  2.43it/s]


Epoch 15/50, Loss: 0.5523


369it [02:33,  2.40it/s]


Epoch 16/50, Loss: 0.5523


369it [02:32,  2.42it/s]


Epoch 17/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 18/50, Loss: 0.5523


369it [02:34,  2.39it/s]


Epoch 19/50, Loss: 0.5522


369it [02:33,  2.40it/s]


Epoch 20/50, Loss: 0.5522


369it [02:32,  2.42it/s]


Epoch 21/50, Loss: 0.5522


369it [02:31,  2.43it/s]


Epoch 22/50, Loss: 0.5522


369it [02:35,  2.37it/s]


Epoch 23/50, Loss: 0.5522


369it [02:32,  2.42it/s]


Epoch 24/50, Loss: 0.5522


369it [02:33,  2.40it/s]


Epoch 25/50, Loss: 0.5522


369it [02:32,  2.42it/s]


Epoch 26/50, Loss: 0.5523


369it [02:32,  2.41it/s]


Epoch 27/50, Loss: 0.5523


369it [02:32,  2.42it/s]


Epoch 28/50, Loss: 0.5523


369it [02:34,  2.38it/s]


Epoch 29/50, Loss: 0.5523


369it [02:31,  2.44it/s]


Epoch 30/50, Loss: 0.5523


369it [02:31,  2.44it/s]


Epoch 31/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 32/50, Loss: 0.5523


369it [02:33,  2.40it/s]


Epoch 33/50, Loss: 0.5523


369it [02:35,  2.38it/s]


Epoch 34/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 35/50, Loss: 0.5523


369it [02:33,  2.40it/s]


Epoch 36/50, Loss: 0.5523


369it [02:31,  2.43it/s]


Epoch 37/50, Loss: 0.5523


369it [02:32,  2.42it/s]


Epoch 38/50, Loss: 0.5523


369it [02:33,  2.40it/s]


Epoch 39/50, Loss: 0.5523


369it [02:33,  2.40it/s]


Epoch 40/50, Loss: 0.5523


369it [02:32,  2.42it/s]


Epoch 41/50, Loss: 0.5523


369it [02:32,  2.42it/s]


Epoch 42/50, Loss: 0.5523


369it [02:34,  2.39it/s]


Epoch 43/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 44/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 45/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 46/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 47/50, Loss: 0.5523


369it [02:33,  2.40it/s]


Epoch 48/50, Loss: 0.5523


369it [02:33,  2.41it/s]


Epoch 49/50, Loss: 0.5523


369it [02:30,  2.45it/s]

Epoch 50/50, Loss: 0.5523





In [14]:
# Evaluate the model
pred_list = []
true_list = []

model.eval()
for i, sample in tqdm(enumerate(dataloader)):
    model.zero_grad()
        
    # Get label
    labels = torch.from_numpy(pivoted_questions.iloc[i*batch_size:(i+1)*batch_size, :18].values).float()
    
    sample = sample.to(device)
    labels = labels.to(device)

    # Forward pass
    outputs = model(sample)

    sample = sample.to('cpu')
    labels = labels.to('cpu')

    pred_list.append(outputs.data.cpu().numpy())
    true_list.append(labels.data.cpu().numpy())

369it [00:53,  6.85it/s]


In [15]:
test_pred_flattened = np.concatenate(pred_list).ravel()
test_true_flattened = np.concatenate(true_list).ravel()

In [16]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print(accuracy_score(test_true_flattened, np.round(test_pred_flattened)))
print(precision_score(test_true_flattened, np.round(test_pred_flattened)))
print(recall_score(test_true_flattened, np.round(test_pred_flattened)))

0.7313392006489893
0.7448478223957891
0.9405368102269986


In [17]:
# For test set

# Remove the training set to save RAM
del(df)
del(grouped_data)
del(dataloader)
del(dataset)

In [None]:
test_df = pd.read_csv('data/test.csv', dtype=dtypes)
test_df.set_index(['session_id', 'index'], inplace=True)

test_df = test_df[['event_name', 'name', 'level', 'room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']]
for col in ['room_coor_x', 'room_coor_y', 'screen_coor_x', 'screen_coor_y', 'hover_duration']:
    # Scaling the coordinates and durations
    test_df[col] = (test_df[col] - test_df[col].min()) / (test_df[col].max() - test_df[col].min())
    test_df[col] = test_df[col].fillna(0)
    
test_df = oneHotEncoding.transform(test_df)
grouped_data = test_df.groupby('session_id').apply(lambda x: np.array(x))

dataset = MyDataset(grouped_data.values)
dataloader = DataLoader(dataset, batch_size=3, shuffle=True, collate_fn=collate_fn_padd)

# Make predictions
pred_list = []

model.eval()
for i, sample in tqdm(enumerate(dataloader)):
    model.zero_grad()
    sample = sample.to(device)
    # Forward pass
    outputs = model(sample)
    sample = sample.to('cpu')
    pred_list.append(outputs.data.cpu().numpy())
    
pred_flattened = np.concatenate(pred_list).ravel()
session_ids = test_df.index.get_level_values('session_id').unique().tolist()

from functools import reduce
session_ids = reduce(lambda x, y: x + [f'{y}_q{i}' for i in range(1, 19)], session_ids, [])

test_result = pd.DataFrame({
    'session_id': session_ids,
    'correct': (pred_flattened > 0.6).astype('int')
})
test_result.head()

1it [00:00,  8.57it/s]