In [None]:
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd

In [None]:
# Load the transaction data from a CSV file located in the working directory.
# NOTE(review): later cells read the columns 'islem_tarihi', 'musteri_id_mask'
# and 'new_category_name_eng' from this frame — confirm the file provides them.
df = pd.read_csv('data_for_LSTM.csv')

In [None]:
# Preview the first rows of the loaded frame as a quick sanity check.
df.head()

# Baseline (averaging)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import LabelEncoder

# Baseline: predict each customer's 14th purchase category as the category the
# customer bought in the largest share of months during the first 13 purchases.
# Expects `df` (loaded in an earlier cell) with the columns 'islem_tarihi',
# 'musteri_id_mask' and 'new_category_name_eng'.

# Ensure 'islem_tarihi' is a datetime column
df['islem_tarihi'] = pd.to_datetime(df['islem_tarihi'])

# Sort transactions by customer ID and transaction date
df_sorted = df.sort_values(by=['musteri_id_mask', 'islem_tarihi'])

# Keep customers with at least 14 transactions. The explicit .copy() makes
# df_filtered an independent frame, so the column assignments below neither
# raise SettingWithCopyWarning nor depend on pandas' view/copy heuristics.
counts = df_sorted['musteri_id_mask'].value_counts()
eligible_customers = counts[counts >= 14].index
df_filtered = df_sorted[df_sorted['musteri_id_mask'].isin(eligible_customers)].copy()

# Encode categories as integer ids for easier processing
encoder = LabelEncoder()
df_filtered['category_encoded'] = encoder.fit_transform(df_filtered['new_category_name_eng'])

# Aggregation period: calendar month of the transaction
df_filtered['period'] = df_filtered['islem_tarihi'].dt.to_period("M")

# 1-based position of every transaction within its customer's history;
# the 14th transaction is the prediction target.
df_filtered['transaction_order'] = df_filtered.groupby('musteri_id_mask').cumcount() + 1
actual_categories = df_filtered[df_filtered['transaction_order'] == 14]

# Purchase counts per (customer, month) row and category column, built only
# from the transactions that precede the target.
aggregated_purchases = (
    df_filtered[df_filtered['transaction_order'] < 14]
    .groupby(['musteri_id_mask', 'period', 'category_encoded'])
    .size()
    .unstack(fill_value=0)
)

# Determine if a purchase was made in each category (>0)
purchases_indicator = aggregated_purchases > 0

# Share of a customer's active months in which each category was bought
probability_of_purchase = purchases_indicator.groupby('musteri_id_mask').mean()

# Predict the category with the highest share for each customer
predicted_categories = probability_of_purchase.idxmax(axis=1)

# Align predictions with the ground truth by customer id (label-based lookup)
actual_encoded = actual_categories.set_index('musteri_id_mask')['category_encoded']
predicted_categories_filtered = predicted_categories.loc[actual_encoded.index]

# Evaluate predictions
accuracy = accuracy_score(actual_encoded, predicted_categories_filtered)
precision, recall, f1, _ = precision_recall_fscore_support(
    actual_encoded, predicted_categories_filtered, average='weighted', zero_division=0
)

print("Baseline Method Evaluation")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# CNN

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import numpy as np
import torch.nn.functional as F  #

In [None]:
import pandas as pd
# Read the CSV data
df = pd.read_csv('data_for_LSTM.csv')
# Convert 'islem_tarihi' to datetime format if it's not already
df['islem_tarihi'] = pd.to_datetime(df['islem_tarihi'])

# Sort transactions by customer and transaction date
df_sorted = df.sort_values(by=['musteri_id_mask', 'islem_tarihi'])

# Keep customers with at least 10 transactions (9 inputs + 1 label is the
# smallest history that yields a training window). The explicit .copy() makes
# df_filtered an independent frame so the column assignment below does not
# raise SettingWithCopyWarning.
counts = df_sorted['musteri_id_mask'].value_counts()
eligible_customers = counts[counts >= 10].index
df_filtered = df_sorted[df_sorted['musteri_id_mask'].isin(eligible_customers)].copy()

# Encode categories as integer ids
le = LabelEncoder()
df_filtered['new_category_name_eng_encoded'] = le.fit_transform(df_filtered['new_category_name_eng'])

def create_sequences_with_ids(df, sequence_length=9,
                              customer_col='musteri_id_mask',
                              category_col='new_category_name_eng_encoded'):
    """Build sliding-window samples of encoded categories for every customer.

    For each customer, every run of `sequence_length` consecutive rows (in the
    row order of `df`) becomes one input sequence and the row that follows it
    becomes the label. Customers with `sequence_length` rows or fewer
    contribute nothing.

    Args:
        df: Frame holding at least `customer_col` and `category_col`; rows are
            expected to be ordered chronologically within each customer.
        sequence_length: Number of consecutive transactions per input window.
        customer_col: Name of the customer identifier column.
        category_col: Name of the integer-encoded category column.

    Returns:
        Tuple `(sequences, labels, customer_ids)` of NumPy arrays.
        `sequences` has shape (num_samples, sequence_length), `labels` and
        `customer_ids` have shape (num_samples,), and `customer_ids[i]` is the
        owner of `sequences[i]`.
    """
    sequences = []
    labels = []
    customer_ids = []  # To track the customer ID for each sequence
    # One groupby pass replaces re-filtering the whole frame once per customer.
    # sort=False visits customers in order of first appearance, the same order
    # that .unique() would give.
    for customer, customer_df in df.groupby(customer_col, sort=False):
        categories = customer_df[category_col].to_numpy()
        for start in range(len(categories) - sequence_length):
            end = start + sequence_length
            sequences.append(categories[start:end])
            labels.append(categories[end])
            customer_ids.append(customer)
    if not sequences:
        # Keep the documented 2-D shape even when no window could be built, so
        # callers that read sequences.shape[1] keep working.
        empty_windows = np.empty((0, max(sequence_length, 0)), dtype=np.int64)
        return empty_windows, np.empty(0, dtype=np.int64), np.array(customer_ids)
    return np.array(sequences), np.array(labels), np.array(customer_ids)

# Build the sliding-window dataset: 9 consecutive encoded categories per sample,
# the following category as label, plus the owning customer of every sample.
sequences, labels, customer_ids = create_sequences_with_ids(df_filtered, sequence_length=9)

In [None]:
# Conv1d consumes (num_samples, channels, sequence_length), so insert a
# singleton channel axis between the sample axis and the sequence axis of the
# (num_samples, sequence_length) array.
X_cnn = sequences[:, np.newaxis, :]

In [None]:
# Split by customer instead of by individual window. Neighbouring windows of
# one customer overlap in all but one position (and the label of one window is
# an input of the next), so a purely random split puts near-duplicates of the
# training samples into the test set and inflates the test metrics.
from sklearn.model_selection import GroupShuffleSplit

group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(group_splitter.split(X_cnn, labels, groups=customer_ids))
X_train, X_test = X_cnn[train_idx], X_cnn[test_idx]
y_train, y_test = labels[train_idx], labels[test_idx]

# Convert to PyTorch tensors: float inputs for Conv1d, long targets for
# CrossEntropyLoss.
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create dataloaders
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

batch_size = 64
# Only the training data is shuffled; the test order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# 1-D CNN classifier with global average pooling
class CNNWithGlobalPooling(nn.Module):
    """Two Conv1d layers, global average pooling and a linear classifier.

    Both convolutions use kernel_size=3 with padding=1, so they leave the
    sequence length unchanged. AdaptiveAvgPool1d(1) then averages over the
    length axis, which is why the linear layer always receives 64 features
    regardless of the input length.

    Args:
        num_classes: Number of logits produced by the final linear layer.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.global_pool = nn.AdaptiveAvgPool1d(1)  # one averaged value per channel
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        """Map a (batch, 1, length) input to (batch, num_classes) logits."""
        hidden = F.relu(self.conv1(x))
        hidden = F.relu(self.conv2(hidden))
        pooled = self.global_pool(hidden)
        # Drop the pooled length axis: (batch, 64, 1) -> (batch, 64)
        flat = pooled.view(pooled.size(0), -1)
        return self.fc(flat)

# Size the output layer from the largest label id rather than from the number
# of distinct labels: if any intermediate class id never occurs as a label,
# the distinct count is smaller than max id + 1 and CrossEntropyLoss fails
# with an out-of-bounds target. Both agree when every class id is present.
num_classes = int(np.max(labels)) + 1

# Seed PyTorch before the model is built so that weight initialisation and the
# shuffling done by the training DataLoader are reproducible across runs.
torch.manual_seed(42)

# Instantiate the model
model = CNNWithGlobalPooling(num_classes=num_classes)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model and report the mean training loss of every epoch. The mean is
# weighted by batch size so a smaller final batch does not skew it; reporting
# only the last mini-batch's loss would be a noisy estimate of the epoch.
epochs = 5
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    samples_seen = 0
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        loss.backward()
        optimizer.step()

        batch_samples = y_batch.size(0)
        running_loss += loss.item() * batch_samples
        samples_seen += batch_samples

    # An empty loader yields NaN instead of raising on an undefined loss.
    epoch_loss = running_loss / samples_seen if samples_seen else float('nan')
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {epoch_loss:.4f}')

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import numpy as np
import torch
from torch.utils.data import DataLoader

# Evaluate the trained `model` on `test_loader` (both created in earlier cells).
# NOTE(review): the id -> name mapping is hard-coded; it is assumed to match
# the integer ids produced by the label encoder — confirm.
class_names = {0: 'Clothing', 1: 'Gas stations', 2: 'Grocery', 3: "Other"}
class_ids = sorted(class_names)
model.eval()

all_predictions = []
all_true_labels = []

with torch.no_grad():
    for X_batch, y_batch in test_loader:
        y_pred = model(X_batch)
        _, predicted_labels = torch.max(y_pred, 1)

        all_predictions.extend(predicted_labels.cpu().numpy())
        all_true_labels.extend(y_batch.cpu().numpy())

all_predictions = np.array(all_predictions)
all_true_labels = np.array(all_true_labels)

# Calculate overall metrics
accuracy = accuracy_score(all_true_labels, all_predictions)
precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
    all_true_labels, all_predictions, average='weighted', zero_division=0
)

print("Overall Metrics (Weighted Average):")
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision_weighted}")
print(f"Recall: {recall_weighted}")
print(f"F1 Score: {f1_weighted}")

# Calculate class-wise metrics. Passing labels= pins the order of the returned
# arrays to class_ids; without it the arrays only cover the class ids that
# actually occur, so position i would not necessarily describe class i.
precision, recall, f1, _ = precision_recall_fscore_support(
    all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
)

# Print class-wise metrics
for class_id, prec, rec, f in zip(class_ids, precision, recall, f1):
    print(f"Class {class_id} - {class_names[class_id]}: Precision: {prec}, Recall: {rec}, F1 Score: {f}")


# CNN Garanti

In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

# Load the new data
# NOTE(review): absolute, machine-specific path — the notebook only runs on a
# machine that has this exact file location.
new_data = pd.read_csv('/Users/halilergul/Desktop/thesis_researcg/garantidata/subset_sample.csv')

# Convert 'TRN_DATE' to datetime format
new_data['TRN_DATE'] = pd.to_datetime(new_data['TRN_DATE'])

# Sort the new data by customer number and transaction date
new_data_sorted = new_data.sort_values(by=['CUSTOMER_NUM', 'TRN_DATE'])

# Load the previously saved encoder classes.
# NOTE(review): allow_pickle=True unpickles the file and can execute arbitrary
# code, so only load a label_classes.npy that you produced yourself. The file
# is not written anywhere in the visible cells — confirm where it comes from.
le = LabelEncoder()
le.classes_ = np.load('label_classes.npy', allow_pickle=True)

# Encode 'MCC_Description' using the loaded encoder (raises ValueError for a
# description that is not among the loaded classes)
new_data_sorted['MCC_Description_encoded'] = le.transform(new_data_sorted['MCC_Description'])

# All customers present in the new data (no filtering is applied here)
filtered_customers = new_data_sorted['CUSTOMER_NUM'].unique()
sequences, labels, customer_ids = [], [], []

# Window length the CNN was trained with; it does not change per customer, so
# it is set once before the loop.
sequence_length = 9

# Generate sliding windows per customer. A single groupby pass replaces
# re-filtering the whole frame once per customer; sort=False keeps customers
# in order of first appearance.
for customer, customer_df in new_data_sorted.groupby('CUSTOMER_NUM', sort=False):
    encoded = customer_df['MCC_Description_encoded'].to_numpy()
    for i in range(len(encoded) - sequence_length):
        sequences.append(encoded[i:i + sequence_length])
        labels.append(encoded[i + sequence_length])
        customer_ids.append(customer)

In [None]:
# Convert sequences to a PyTorch tensor. Stacking the per-window arrays into
# one ndarray first avoids the slow list-of-ndarrays conversion path.
X_sample_tensor = torch.tensor(np.array(sequences), dtype=torch.float32).unsqueeze(1)  # Add channel dimension for Conv1d

# Create dataloader for the sample
sample_dataset = TensorDataset(X_sample_tensor)
sample_loader = DataLoader(sample_dataset, batch_size=64)

# True labels for the selected sample
y_true = np.array(labels)

# Use the trained model to predict
model.eval()
predictions = []
with torch.no_grad():
    # A single-tensor TensorDataset yields one-element batches; unpack them.
    for (X_batch,) in sample_loader:
        y_pred = model(X_batch)
        predicted_labels = torch.argmax(y_pred, dim=1)
        predictions.extend(predicted_labels.cpu().numpy())

# Calculate metrics. labels= pins the report rows to every encoder class;
# without it, classification_report raises when the classes that occur in the
# data do not match target_names in number.
class_ids = np.arange(len(le.classes_))
report = classification_report(
    y_true, predictions, labels=class_ids, target_names=le.classes_,
    output_dict=False, zero_division=0
)
print("Classification Report:\n", report)

# Same report as a dict, to handle the metrics programmatically
metrics_dict = classification_report(
    y_true, predictions, labels=class_ids, target_names=le.classes_,
    output_dict=True, zero_division=0
)

# Extracting specific metrics
weighted_f1 = metrics_dict['weighted avg']['f1-score']
overall_precision = metrics_dict['weighted avg']['precision']
overall_recall = metrics_dict['weighted avg']['recall']
# Depending on the scikit-learn version, the report may contain 'micro avg'
# instead of 'accuracy' when labels= is given, so fall back to computing it.
if 'accuracy' in metrics_dict:
    overall_accuracy = metrics_dict['accuracy']
else:
    overall_accuracy = float((np.asarray(predictions) == y_true).mean())

print(f"Weighted F1 Score: {weighted_f1}")
print(f"Overall Precision: {overall_precision}")
print(f"Overall Recall: {overall_recall}")
print(f"Overall Accuracy: {overall_accuracy}")

# Detailed metrics for each class (summary rows are skipped)
for class_name, class_metrics in metrics_dict.items():
    if class_name not in ['accuracy', 'micro avg', 'macro avg', 'weighted avg']:
        print(f"Metrics for {class_name}:")
        print(f"  Precision: {class_metrics['precision']}")
        print(f"  Recall: {class_metrics['recall']}")
        print(f"  F1 Score: {class_metrics['f1-score']}")


# Different Lengths

In [None]:
# Step 1: Helper that turns the transaction frame into sliding-window samples
def create_test_sequences(df, sequence_length,
                          customer_col='CUSTOMER_NUM',
                          category_col='MCC_Description_encoded'):
    """Build sliding-window evaluation samples for every customer.

    For each customer, every run of `sequence_length` consecutive rows (in the
    row order of `df`) becomes one input sequence and the row that follows it
    becomes the label. Customers with `sequence_length` rows or fewer
    contribute nothing.

    Args:
        df: Frame holding at least `customer_col` and `category_col`; rows are
            expected to be ordered chronologically within each customer.
        sequence_length: Number of consecutive transactions per input window.
        customer_col: Name of the customer identifier column.
        category_col: Name of the integer-encoded category column.

    Returns:
        Tuple `(sequences, labels)` of NumPy arrays with shapes
        (num_samples, sequence_length) and (num_samples,).
    """
    sequences = []
    labels = []
    # One groupby pass replaces re-filtering the whole frame once per customer.
    # sort=False visits customers in order of first appearance, the same order
    # that .unique() would give.
    for _, customer_df in df.groupby(customer_col, sort=False):
        categories = customer_df[category_col].to_numpy()
        for start in range(len(categories) - sequence_length):
            end = start + sequence_length
            sequences.append(categories[start:end])
            labels.append(categories[end])
    if not sequences:
        # Keep the documented 2-D shape even when no window could be built.
        empty_windows = np.empty((0, max(sequence_length, 0)), dtype=np.int64)
        return empty_windows, np.empty(0, dtype=np.int64)
    return np.array(sequences), np.array(labels)

# Human-readable name for every encoded class id (0-3).
# NOTE(review): assumed to match the ids produced by the label encoder — confirm.
class_names = dict(enumerate(('Clothing', 'Gas stations', 'Grocery', 'Other')))

# Step 2: Window sizes (number of input transactions) to evaluate. The
# evaluation cell reports each one as sequence_length + 1, i.e. 5, 8 and 15.
# NOTE(review): if reported lengths of 5, 7 and 15 are intended, the middle
# entry would have to be 6 — confirm.
sequence_lengths = [4, 7, 14]


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Evaluate the trained model on windows of several lengths. The global average
# pooling in the model is what allows inputs of a length other than the one
# used for training.
class_ids = sorted(class_names)
model.eval()

for sequence_length in sequence_lengths:
    sequences, labels = create_test_sequences(new_data_sorted, sequence_length)

    if len(sequences) == 0:
        # No customer has more than `sequence_length` transactions, so there
        # is nothing to score for this length.
        print(f"Sequence Length: {sequence_length + 1}")
        print("No sequences available for this length; skipping.\n")
        continue

    # Step 3: Convert to Tensors
    X_tensor = torch.tensor(sequences, dtype=torch.float32).unsqueeze(1)  # Add channel dimension for Conv1d
    y_tensor = torch.tensor(labels, dtype=torch.long)

    # Create DataLoader
    test_loader = DataLoader(TensorDataset(X_tensor, y_tensor), batch_size=64, shuffle=False)

    # Evaluation
    all_predictions = []
    all_true_labels = []

    with torch.no_grad():
        for X_batch, y_batch in test_loader:
            y_pred = model(X_batch)
            _, predicted_labels = torch.max(y_pred, 1)

            all_predictions.extend(predicted_labels.cpu().numpy())
            all_true_labels.extend(y_batch.cpu().numpy())

    # Convert lists to NumPy arrays for analysis
    all_predictions = np.array(all_predictions)
    all_true_labels = np.array(all_true_labels)

    # Overall (weighted-average) metrics
    accuracy = accuracy_score(all_true_labels, all_predictions)
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, average="weighted", zero_division=0
    )

    # Reported length counts the predicted transaction as well (window + 1)
    print(f"Sequence Length: {sequence_length + 1}")
    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1 Score: {f1}")

    # Class-wise metrics. labels= pins the order of the returned arrays to
    # class_ids; without it the arrays only cover the class ids that actually
    # occur, so position i would not necessarily describe class i.
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_true_labels, all_predictions, labels=class_ids, average=None, zero_division=0
    )

    for class_id, prec, rec, f in zip(class_ids, precision, recall, f1):
        print(f"{class_names[class_id]} - Precision: {prec}, Recall: {rec}, F1 Score: {f}")
    print("\n")
