In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [None]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import LabelEncoder

# Load the raw transaction table
df = pd.read_csv('data_for_LSTM.csv')

# Parse 'islem_tarihi' so the sort below is chronological rather than lexicographic
df['islem_tarihi'] = pd.to_datetime(df['islem_tarihi'])

# Order every customer's transactions by ascending transaction date
df_sorted = df.sort_values(by=['musteri_id_mask', 'islem_tarihi'])

# Keep only customers with at least 10 transactions.
# .copy() makes df_filtered an independent frame, so the column added below is not
# written into a slice of df_sorted (avoids pandas' SettingWithCopyWarning).
counts = df_sorted['musteri_id_mask'].value_counts()
df_filtered = df_sorted[df_sorted['musteri_id_mask'].isin(counts[counts >= 10].index)].copy()

# Encode category names as integer class ids
le = LabelEncoder()
df_filtered['new_category_name_eng_encoded'] = le.fit_transform(df_filtered['new_category_name_eng'])

def create_sequences_with_ids(df, sequence_length=9):
    """Build sliding windows of encoded categories and their next-step labels per customer.

    Args:
        df: DataFrame with 'musteri_id_mask' and 'new_category_name_eng_encoded'
            columns. Rows are used in the order given, so the caller is expected
            to have sorted each customer's transactions already.
        sequence_length: Number of consecutive transactions in each input window.

    Returns:
        Tuple ``(sequences, labels, customer_ids)`` of NumPy arrays with one entry
        per window: the encoded categories in the window, the encoded category of
        the transaction right after it, and the customer the window belongs to.
        Customers with ``sequence_length`` or fewer rows contribute nothing.
    """
    sequences = []
    labels = []
    customer_ids = []  # Customer that owns each window (used for group-wise splitting)

    # One pass over the frame; sort=False keeps customers in order of first appearance,
    # which is the order the previous per-customer boolean filtering produced.
    for customer, customer_df in df.groupby('musteri_id_mask', sort=False):
        # Read the column once as an array. Row-wise `.iloc[i][col]` builds a mixed-dtype
        # row Series and can upcast integer labels to float.
        encoded = customer_df['new_category_name_eng_encoded'].to_numpy()
        for start in range(len(encoded) - sequence_length):
            stop = start + sequence_length
            sequences.append(encoded[start:stop])
            labels.append(encoded[stop])
            customer_ids.append(customer)

    return np.array(sequences), np.array(labels), np.array(customer_ids)

# Build the fixed-length windows, their next-category targets and the owning customer ids
sequences, labels, customer_ids = create_sequences_with_ids(df_filtered)

# Persist the encoder's class list so later cells can rebuild the same label mapping
np.save('label_classes.npy', le.classes_)

# Hold out 20% of the customers: GroupShuffleSplit keeps every window of a customer on
# the same side of the split, so no customer appears in both train and test
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(gss.split(sequences, labels, groups=customer_ids))

X_train, y_train = sequences[train_idx], labels[train_idx]
X_test, y_test = sequences[test_idx], labels[test_idx]

# Inputs become float tensors with a trailing feature axis of size 1; targets are class ids
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).unsqueeze(-1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).unsqueeze(-1)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Wrap the tensors in datasets and batch them; only the training batches are shuffled
batch_size = 64
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=batch_size)

class LSTMModel(nn.Module):
    """LSTM classifier: an LSTM over the input sequence, then a linear layer on its last step.

    Args:
        input_size: Number of features per time step.
        hidden_layer_size: Size of the LSTM hidden state.
        output_size: Number of output scores produced per sequence.
    """

    def __init__(self, input_size, hidden_layer_size, output_size):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        # batch_first=True: the LSTM takes batch-first input
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_layer_size,
            batch_first=True,
        )
        self.linear = nn.Linear(in_features=hidden_layer_size, out_features=output_size)

    def forward(self, input_seq):
        """Return the linear layer's scores for the last position of the LSTM output."""
        # The LSTM also returns its final (h, c) state; only the per-step outputs are used
        hidden_per_step = self.lstm(input_seq)[0]
        final_step = hidden_per_step[:, -1]
        return self.linear(final_step)

# Seed torch's global RNG so that weight initialisation and the shuffled batch order
# of train_loader are the same on every run (results were not reproducible before)
torch.manual_seed(42)

# Instantiate the model: one scalar feature per step, one output score per encoded class
model = LSTMModel(input_size=1, hidden_layer_size=128, output_size=len(le.classes_))
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training loop
epochs = 15
for epoch in range(epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = loss_function(y_pred, y_batch)
        loss.backward()
        optimizer.step()

    # Validation step on the held-out loader (no gradients needed).
    # The reported value is the mean of the per-batch mean losses.
    model.eval()
    with torch.no_grad():
        total_loss = 0
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            loss = loss_function(y_pred, y_batch)
            total_loss += loss.item()
        print(f'Epoch {epoch+1}, Validation loss: {total_loss / len(test_loader)}')

print("Training completed")

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Ensure the model is in evaluation mode
model.eval()

# Collect all predictions and true labels here
all_predictions = []
all_true_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        # Predicted class = index of the highest score in each row
        y_pred = model(X_batch)
        _, predicted_labels = torch.max(y_pred, 1)

        all_predictions.extend(predicted_labels.cpu().numpy())
        all_true_labels.extend(y_batch.cpu().numpy())

# Convert lists to NumPy arrays for further analysis
all_predictions = np.array(all_predictions)
all_true_labels = np.array(all_true_labels)

# Overall metrics (weighted by class support).
# zero_division=0 keeps the default value for undefined scores but without the warning.
accuracy = accuracy_score(all_true_labels, all_predictions)
precision, recall, f1, _ = precision_recall_fscore_support(
    all_true_labels, all_predictions, average="weighted", zero_division=0
)

# Print the overall metrics first, before the class-wise breakdown
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Per-class metrics. Passing `labels` explicitly makes the result arrays contain one entry
# per encoder class, in encoder order. Without it, scikit-learn only reports the labels that
# occur in the data, so position i would not necessarily be class i and the printed class
# names could be wrong.
class_ids = np.arange(len(le.classes_))
precision, recall, f1, _ = precision_recall_fscore_support(
    all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
)
for i, prec, rec, f in zip(class_ids, precision, recall, f1):
    print(f"Class {i} - {le.classes_[i]}")
    print(f"Precision: {prec}")
    print(f"Recall: {rec}")
    print(f"F1 Score: {f}")

# Garanti subset

In [None]:
# Load the Garanti subset sample; later cells encode it and score the trained model on it.
# NOTE(review): absolute, machine-specific path — this cell will fail on any other machine.
# Consider a configurable data directory.
data = pd.read_csv('/Users/halilergul/Desktop/thesis_researcg/garantidata/subset_sample.csv')

In [None]:
# Show the column names of the loaded sample
data.columns

In [None]:
# Number of transactions per MCC description in the sample
data['MCC_Description'].value_counts()

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
# Sort the new data by customer number and transaction date.
# NOTE(review): TRN_DATE is sorted as loaded; if it is a string in a non-ISO format the
# order is lexicographic, not chronological — confirm, or parse it with pd.to_datetime.
new_data_sorted = data.sort_values(by=['CUSTOMER_NUM', 'TRN_DATE'])

# Rebuild the encoder from the class list saved at training time.
# NOTE: allow_pickle=True unpickles the file on load, which can execute arbitrary code;
# only load a 'label_classes.npy' that this notebook produced itself.
le = LabelEncoder()
le.classes_ = np.load('label_classes.npy', allow_pickle=True)

# LabelEncoder.transform raises ValueError on labels it has not seen, so drop transactions
# whose category is unknown to the encoder before encoding. This is a no-op when every
# category is known. .copy() gives an independent frame for the new column.
known_category = new_data_sorted['MCC_Description'].isin(le.classes_)
n_unknown = int((~known_category).sum())
if n_unknown:
    print(f"Dropping {n_unknown} transactions whose MCC_Description is not in the saved label classes")
new_data_sorted = new_data_sorted[known_category].copy()

# Encode 'MCC_Description' using the loaded encoder
new_data_sorted['MCC_Description_encoded'] = le.transform(new_data_sorted['MCC_Description'])

# Customers that remain after the category filter
filtered_customers = new_data_sorted['CUSTOMER_NUM'].unique()
sequences, labels, customer_ids = [], [], []

# Generate sequences as done previously, but with the new column names.
# groupby visits each customer once (sort=False keeps first-appearance order) instead of
# re-filtering the whole frame per customer.
sequence_length = 9
for customer, customer_df in new_data_sorted.groupby('CUSTOMER_NUM', sort=False):
    encoded = customer_df['MCC_Description_encoded'].to_numpy()
    for start in range(len(encoded) - sequence_length):
        stop = start + sequence_length
        sequences.append(encoded[start:stop])  # input window
        labels.append(encoded[stop])           # category of the next transaction
        customer_ids.append(customer)

In [None]:
# Convert sequences to a PyTorch tensor with a trailing feature axis.
# Stack into one NumPy array first: building a tensor straight from a Python list of
# ndarrays is very slow and makes PyTorch emit a UserWarning.
X_sample_tensor = torch.tensor(np.array(sequences), dtype=torch.float32).unsqueeze(-1)

# Create dataloader for the sample
sample_dataset = TensorDataset(X_sample_tensor)
sample_loader = DataLoader(sample_dataset, batch_size=64)

# True labels for the selected sample
y_true = np.array(labels)

# Use the trained model to predict
model.eval()
predictions = []
with torch.no_grad():
    for X_batch in sample_loader:
        # TensorDataset with a single tensor yields 1-element batches
        y_pred = model(X_batch[0])
        predicted_labels = torch.argmax(y_pred, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

# Calculate metrics.
# `labels` pins the report to every encoder class in encoder order. Without it,
# classification_report raises when the number of classes present in the data differs
# from len(target_names).
class_ids = np.arange(len(le.classes_))
report = classification_report(
    y_true, predictions, labels=class_ids, target_names=le.classes_,
    zero_division=0, output_dict=False,
)
print("Classification Report:\n", report)

# Same report as a dict, to handle the metrics programmatically
metrics_dict = classification_report(
    y_true, predictions, labels=class_ids, target_names=le.classes_,
    zero_division=0, output_dict=True,
)

# Extracting specific metrics
weighted_f1 = metrics_dict['weighted avg']['f1-score']
overall_precision = metrics_dict['weighted avg']['precision']
overall_recall = metrics_dict['weighted avg']['recall']
# Accuracy is computed directly: depending on the scikit-learn version, the report may
# expose 'micro avg' instead of an 'accuracy' entry when `labels` is passed.
overall_accuracy = float(np.mean(y_true == np.asarray(predictions)))

print(f"Weighted F1 Score: {weighted_f1}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall Accuracy: {overall_accuracy}")

# Detailed metrics for each class (skip the aggregate rows)
for class_name, class_metrics in metrics_dict.items():
    if class_name not in ['accuracy', 'micro avg', 'macro avg', 'weighted avg']:
        print(f"Metrics for {class_name}:")
        print(f"  Precision: {class_metrics['precision']}")
        print(f"  Recall: {class_metrics['recall']}")
        print(f"  F1 Score: {class_metrics['f1-score']}")

# Different Sequence Lengths

In [None]:
# Step 2: Generate Sequences of Different Lengths
# Helper function to generate sequences
def create_test_sequences(df, sequence_length):
    """Build sliding windows of encoded MCC descriptions and their next-step labels.

    Args:
        df: DataFrame with 'CUSTOMER_NUM' and 'MCC_Description_encoded' columns.
            Rows are used in the order given, so each customer's transactions
            should already be sorted.
        sequence_length: Number of consecutive transactions in each input window.

    Returns:
        Tuple ``(sequences, labels)`` of NumPy arrays with one entry per window:
        the encoded values in the window and the encoded value of the transaction
        right after it. Customers with ``sequence_length`` or fewer rows
        contribute nothing.
    """
    sequences = []
    labels = []

    # Visit each customer once; sort=False keeps customers in order of first appearance,
    # matching the order the previous per-customer boolean filtering produced.
    for _, customer_df in df.groupby('CUSTOMER_NUM', sort=False):
        # Read the column once as an array. Row-wise `.iloc[i][col]` builds a mixed-dtype
        # row Series and can upcast integer labels to float.
        encoded = customer_df['MCC_Description_encoded'].to_numpy()
        for start in range(len(encoded) - sequence_length):
            stop = start + sequence_length
            sequences.append(encoded[start:stop])
            labels.append(encoded[stop])

    return np.array(sequences), np.array(labels)

# Display names for the class ids, used when printing per-class metrics.
# NOTE(review): hard-coded; presumably mirrors the order of le.classes_ — confirm that the
# encoder has exactly these four classes in this order.
class_names = {0: 'Clothing', 1: 'Gas stations', 2: 'Grocery', 3: 'Other'}

# Window lengths to evaluate; each window is followed by one target transaction
sequence_lengths = [4, 7, 14]  # The evaluation loop reports these as sequence_length + 1, i.e. 5, 8 and 15


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
# Class ids in the order their display names are printed
class_ids = sorted(class_names)

for sequence_length in sequence_lengths:
    # Loop-local names: the notebook-level `sequences`, `labels` and `test_loader` from
    # earlier cells are deliberately not overwritten, so those cells can be re-run safely.
    eval_sequences, eval_labels = create_test_sequences(new_data_sorted, sequence_length)

    # No customer has more than `sequence_length` transactions: nothing to score
    if len(eval_sequences) == 0:
        print(f"Sequence Length: {sequence_length + 1}")
        print("No sequences of this length could be generated; skipping.\n")
        continue

    # Step 3: Convert to Tensors (inputs get a trailing feature axis)
    X_tensor = torch.tensor(eval_sequences, dtype=torch.float32).unsqueeze(-1)
    y_tensor = torch.tensor(eval_labels, dtype=torch.long)

    # Create DataLoader
    length_loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=64, shuffle=False)

    # Evaluation
    model.eval()
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for X_batch, y_batch in length_loader:
            y_pred = model(X_batch)
            _, predicted_labels = torch.max(y_pred, 1)

            all_predictions.extend(predicted_labels.cpu().numpy())
            all_true_labels.extend(y_batch.cpu().numpy())

    # Convert lists to NumPy arrays for analysis
    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)

    # Overall metrics (weighted by class support)
    accuracy = accuracy_score(all_true_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, average="weighted", zero_division=0
    )

    print(f"Sequence Length: {sequence_length + 1}")
    # Print the overall metrics first, before the class-wise breakdown
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Per-class metrics. `labels=class_ids` returns exactly one entry per key of
    # class_names, in that order. Without it, only labels present in the data are reported,
    # so a positional index could pick the wrong name or a key missing from class_names.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )

    # Print precision, recall, and F1 score for each class, including class names
    for class_id, prec, rec, f in zip(class_ids, precision, recall, f1):
        print(f"{class_names[class_id]} - Precision: {prec}, Recall: {rec}, F1 Score: {f}")
    print("\n")