In [None]:
# Notebook config.
# Load IPython's autoreload extension and switch it to mode 2, so edits to
# imported project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Figure size tuple; presumably (width, height) for matplotlib figures.
# NOTE(review): none of the plotting cells visible in this notebook read it — confirm it is still needed.
figsize=(14, 4)

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU, Dense, Dropout, Embedding, Masking
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Locations of the input CSV files, relative to the current working directory.
data_folder = os.path.join('../../..', 'data')
file_name, label_name = "Sample_0.01_amex_train_data.csv", "amex_train_labels.csv"
file_path, label_path = (
    os.path.join(data_folder, csv_name) for csv_name in (file_name, label_name)
)

In [None]:
# Read the feature table and the label table, then attach each customer's
# target to its rows. The left join keeps feature rows that have no label.
data, labels = pd.read_csv(file_path), pd.read_csv(label_path)
df = pd.merge(data, labels, how="left", on="customer_ID")
df.describe()

In [None]:
# Order the rows per customer by the S_2 column (presumably the statement date — confirm).
df_sorted = df.sort_values(by=["customer_ID", "S_2"])

# Keep only the rows that belong to the first `n` customers for a closer look.
n = 1
first_n_ids = df_sorted["customer_ID"].unique()[:n]
df_first_n = df_sorted[df_sorted["customer_ID"].isin(first_n_ids)]

# Bare last expression instead of print(): the notebook renders the frame as a
# scrollable table rather than a wrapped plain-text dump of every column.
df_first_n

In [None]:
# Numeric columns of the selected customer(s); reused by the line plots below.
numeric_df = df_first_n.select_dtypes(include="number")

# One heatmap row per column, one heatmap column per data row.
heatmap_ax = sns.heatmap(numeric_df.T, cmap="viridis", cbar=True)
heatmap_ax.set_title("Progression of columns")
heatmap_ax.set_xlabel("Row index")
heatmap_ax.set_ylabel("Columns")
plt.show()

In [None]:
# Plot the numeric columns as line charts, `chunk_size` columns per figure.
cols = numeric_df.columns.tolist()
chunk_size = 10
for i in range(0, len(cols), chunk_size):
    chunk = cols[i:i+chunk_size]
    fig, ax = plt.subplots()
    for col in chunk:
        ax.plot(numeric_df.index, numeric_df[col], label=col, alpha=0.7)
    ax.set_xlabel("Row index")
    ax.set_ylabel("Value")
    ax.set_title(f"Columns {i+1} to {i+len(chunk)}")
    # The lines are labelled above; without a legend those labels are never shown.
    ax.legend(loc="best", fontsize="small")
    fig.tight_layout()
    plt.show()
    # Release the figure so a long loop does not accumulate open figures.
    plt.close(fig)

### Vectorizing the data

In [None]:
# Order the raw feature rows per customer by the S_2 column.
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for every
# later cell; it is kept here only because the following cells read this variable.
sorted = data.sort_values(by=["customer_ID", "S_2"])

# Bare last expression instead of print(): rendered as a table by the notebook.
sorted.head(n=3)

In [None]:
# Columns that identify/order a row versus columns that are model features.
sequence_cols = ["customer_ID", "S_2"]
categorical_cols = ["D_63", "D_64"]
feature_cols = [col for col in sorted.columns if col not in sequence_cols]
numerical_cols = [col for col in feature_cols if col not in categorical_cols]

# Fill gaps with the customer's previous value; anything still missing
# (no earlier value for that customer) becomes 0.
sorted[numerical_cols] = sorted.groupby("customer_ID")[numerical_cols].ffill().fillna(0)

# Integer-encode the categorical columns. Casting to str first turns missing
# values into the explicit category "nan", so the encoder never has to compare
# strings with float NaN (which raises on some scikit-learn versions).
for col in categorical_cols:
    le = LabelEncoder()
    sorted[col] = le.fit_transform(sorted[col].astype(str))

# One 2-D array (rows x features) per customer, in the row order of `sorted`.
customer_sequence = {}
for customer_id, group in sorted.groupby("customer_ID"):
    customer_vector_sequence = group[feature_cols].to_numpy()
    customer_sequence[customer_id] = customer_vector_sequence

# Show the number of customers and a few array shapes instead of dumping every array.
len(customer_sequence), {cid: seq.shape for cid, seq in list(customer_sequence.items())[:5]}

In [None]:
# Lookup table customer_ID -> target built from the label file.
target_dict = dict(zip(labels["customer_ID"], labels["target"]))

# Show the size and a few entries instead of dumping the whole mapping into the output.
len(target_dict), dict(list(target_dict.items())[:5])

In [None]:
class SequentialTransactionClassifier:
    """Binary classifier over per-customer sequences of transaction vectors.

    Sequences are left-padded with zeros (and truncated to their most recent
    steps) to a common length, then fed to a recurrent Keras model whose
    first layer masks the all-zero padding steps.

    Attributes:
        max_sequence_length: upper bound on the number of timesteps kept per customer.
        model_type: one of 'lstm', 'gru' or 'bidirectional'.
        model: the fitted Keras model, or None before `train` has been called.
        scaler: a StandardScaler instance (see note in `__init__`).
        sequence_length_: number of timesteps the fitted model expects; set by `train`.
    """

    def __init__(self, max_sequence_length=100, model_type='lstm'):
        self.max_sequence_length = max_sequence_length
        self.model_type = model_type
        self.model = None
        # NOTE(review): the scaler is created but never fitted or applied by this class.
        self.scaler = StandardScaler()
        # Filled in by train() so predict() can pad to the same length.
        self.sequence_length_ = None

    @staticmethod
    def _is_vector_sequence(sequences):
        """Return True when each timestep of the first sequence is itself a vector."""
        first = sequences[0]
        return len(first) > 0 and isinstance(first[0], (list, tuple, np.ndarray))

    @staticmethod
    def _pad_vector_sequences(sequences, target_len):
        """Left-pad / left-truncate sequences of vectors to `target_len` timesteps.

        Works for both nested lists and NumPy arrays. The result is written
        into a preallocated zero array, so no list/array `+` is involved
        (list + ndarray would be an element-wise addition, not a concatenation).

        Returns:
            float32 array of shape (n_sequences, target_len, n_features).
        """
        n_features = len(sequences[0][0])
        padded = np.zeros((len(sequences), target_len, n_features), dtype='float32')
        for row, seq in enumerate(sequences):
            steps = np.asarray(seq, dtype='float32')
            if len(steps) > target_len:
                # Keep the most recent `target_len` timesteps.
                steps = steps[len(steps) - target_len:]
            if len(steps) > 0:
                padded[row, target_len - len(steps):] = steps
        return padded

    @staticmethod
    def _pad_scalar_sequences(sequences, target_len):
        """Left-pad / left-truncate sequences of scalars and add a feature axis.

        Returns:
            float32 array of shape (n_sequences, target_len, 1).
        """
        padded = pad_sequences(sequences, maxlen=target_len,
                               dtype='float32', padding='pre', truncating='pre')
        return padded.reshape(padded.shape[0], padded.shape[1], 1)

    def prepare_data(self, transaction_data, target_data):
        """
        Prepare sequential data for training

        Customers are taken in the iteration order of `transaction_data`
        (not from a set), so the row order of the result is reproducible.

        Args:
            transaction_data: dict {customer_id: sequence of transaction vectors
                (nested list or 2-D array) or sequence of scalar values}
            target_data: dict {customer_id: 0 or 1}

        Returns:
            (sequences, targets): a 3-D array (customers, timesteps, features)
            and a 1-D array of the matching targets.

        Raises:
            ValueError: if no customer has both a non-empty sequence and a target.
        """
        sequences = []
        targets = []
        for cid, sequence in transaction_data.items():
            # Only customers with a label and at least one transaction.
            if cid in target_data and len(sequence) > 0:
                sequences.append(sequence)
                targets.append(target_data[cid])

        if not sequences:
            raise ValueError(
                "No customer has both a non-empty transaction sequence and a target."
            )

        targets = np.array(targets)

        if self._is_vector_sequence(sequences):
            # Multi-dimensional transaction vectors: pad to the longest
            # sequence, capped at max_sequence_length.
            max_len = min(max(len(seq) for seq in sequences), self.max_sequence_length)
            sequences = self._pad_vector_sequences(sequences, max_len)
        else:
            # 1D transaction values
            sequences = self._pad_scalar_sequences(sequences, self.max_sequence_length)

        return sequences, targets

    def build_model(self, input_shape):
        """Build and compile the sequential model.

        Args:
            input_shape: (timesteps, features) of one padded sequence.

        Returns:
            A compiled Keras model with a single sigmoid output.

        Raises:
            ValueError: if `self.model_type` is not a supported architecture.
        """
        model = Sequential()
        model.add(tf.keras.Input(shape=input_shape))

        # Masking layer to handle padded zeros (timesteps whose features are all 0).
        model.add(Masking(mask_value=0.0))

        if self.model_type == 'lstm':
            # LSTM layers
            model.add(LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
            model.add(LSTM(64, dropout=0.2, recurrent_dropout=0.2))
        elif self.model_type == 'gru':
            # GRU layers
            model.add(GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2))
            model.add(GRU(64, dropout=0.2, recurrent_dropout=0.2))
        elif self.model_type == 'bidirectional':
            # Bidirectional LSTM
            from tensorflow.keras.layers import Bidirectional
            model.add(Bidirectional(LSTM(64, return_sequences=True, dropout=0.2)))
            model.add(Bidirectional(LSTM(32, dropout=0.2)))
        else:
            # Without a recurrent layer the Dense head would run per timestep
            # and the output would no longer match the targets.
            raise ValueError(
                f"Unknown model_type {self.model_type!r}; "
                "expected 'lstm', 'gru' or 'bidirectional'."
            )

        # Dense layers
        model.add(Dense(32, activation='relu'))
        model.add(Dropout(0.3))
        model.add(Dense(16, activation='relu'))
        model.add(Dropout(0.2))

        # Output layer
        model.add(Dense(1, activation='sigmoid'))

        # Metric objects instead of the strings 'precision'/'recall', which
        # are not resolvable by name on every Keras version.
        model.compile(
            optimizer='adam',
            loss='binary_crossentropy',
            metrics=[
                'accuracy',
                tf.keras.metrics.Precision(name='precision'),
                tf.keras.metrics.Recall(name='recall'),
            ]
        )

        return model

    def train(self, transaction_data, target_data, validation_split=0.2, epochs=50,
              batch_size=32, test_size=0.2, random_state=42):
        """Train the model and report metrics on a held-out test split.

        Args:
            transaction_data: dict {customer_id: sequence}, see `prepare_data`.
            target_data: dict {customer_id: 0 or 1}.
            validation_split: fraction of the training split used for validation during fit.
            epochs: maximum number of training epochs (early stopping may end sooner).
            batch_size: mini-batch size.
            test_size: fraction of all customers held out for the final evaluation.
            random_state: seed for the stratified train/test split.

        Returns:
            (history, X_test, y_test, y_pred): the Keras History object, the
            held-out inputs and targets, and the 0/1 predictions for them.
        """
        # Prepare data
        X, y = self.prepare_data(transaction_data, target_data)

        # Remember the padded length so predict() can reproduce it.
        self.sequence_length_ = X.shape[1]

        # Split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=random_state, stratify=y
        )

        # Build model
        input_shape = (X.shape[1], X.shape[2]) if len(X.shape) == 3 else (X.shape[1], 1)
        self.model = self.build_model(input_shape)

        print("Model Architecture:")
        self.model.summary()

        # Train model
        history = self.model.fit(
            X_train, y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            verbose=1,
            callbacks=[
                tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True),
                tf.keras.callbacks.ReduceLROnPlateau(patience=5, factor=0.5)
            ]
        )

        # Evaluate on test set (order matches the metrics passed to compile()).
        test_loss, test_acc, test_prec, test_rec = self.model.evaluate(X_test, y_test, verbose=0)
        print(f"\nTest Accuracy: {test_acc:.4f}")
        print(f"Test Precision: {test_prec:.4f}")
        print(f"Test Recall: {test_rec:.4f}")

        # Predictions and classification report
        y_pred_proba = self.model.predict(X_test)
        y_pred = (y_pred_proba > 0.5).astype(int).reshape(-1)

        print("\nClassification Report:")
        print(classification_report(y_test, y_pred))

        return history, X_test, y_test, y_pred

    def predict(self, transaction_sequences):
        """Make predictions on new transaction sequences.

        Sequences are padded to the length used during training, so the
        input shape always matches the fitted model.

        Args:
            transaction_sequences: dict {customer_id: sequence} or a list/array
                of sequences in the same format as used for training.

        Returns:
            The model's predicted probabilities, one row per input sequence.

        Raises:
            ValueError: if the model has not been trained, or no sequence is given.
        """
        if self.model is None:
            raise ValueError("Model hasn't been trained yet!")

        if isinstance(transaction_sequences, dict):
            sequences = list(transaction_sequences.values())
        else:
            sequences = transaction_sequences

        if len(sequences) == 0:
            raise ValueError("No transaction sequences were provided.")

        # Fall back to max_sequence_length if the trained length is unknown
        # (e.g. the model was assigned manually instead of via train()).
        target_len = getattr(self, 'sequence_length_', None)
        if target_len is None:
            target_len = self.max_sequence_length

        if self._is_vector_sequence(sequences):
            # Multi-dimensional
            sequences = self._pad_vector_sequences(sequences, target_len)
        else:
            # 1D
            sequences = self._pad_scalar_sequences(sequences, target_len)

        predictions = self.model.predict(sequences)
        return predictions

In [None]:
# Seed NumPy and TensorFlow so weight initialisation and dropout are repeatable
# between runs (exact reproducibility can still depend on the hardware/backend).
np.random.seed(42)
tf.random.set_seed(42)

# Train an LSTM classifier on the per-customer sequences and keep the held-out results.
classifier = SequentialTransactionClassifier(model_type="lstm")
history, X_test, y_test, y_pred = classifier.train(customer_sequence, target_dict)