<a href="https://colab.research.google.com/github/kbghub56/grocery_store_credit_analysis/blob/main/LSTM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.metrics import AUC
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing.sequence import pad_sequences
from imblearn.over_sampling import SMOTE
from google.colab import drive
from sklearn.impute import SimpleImputer
# Mount Google Drive at /content/drive; the CSV inputs read later in this
# notebook live under that mount point (Colab-only step).
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Load the two input tables from the mounted Drive folder.
# x_data: feature rows (later grouped by person_id and ordered by transaction_date).
x_data = pd.read_csv('/content/drive/My Drive/LSTM_analysis/231109_data_cohort3_inclnorcc_x.csv')
# y_data: target table, later indexed by person_id to line up with x_data.
y_data = pd.read_csv('/content/drive/My Drive/LSTM_analysis/231109_data_cohort3_inclnorcc_y.csv')

In [None]:
# Convert transaction_date to datetime so the sort below is chronological
# rather than lexicographic on the raw strings.
x_data['transaction_date'] = pd.to_datetime(x_data['transaction_date'])

# Sort by person_id and transaction_date so each person's rows are contiguous
# and in time order. Reassigning (instead of inplace=True) keeps the cell
# safe to re-run and avoids mutating a frame an earlier cell may have shown.
x_data = x_data.sort_values(by=['person_id', 'transaction_date'])

# Preview the first rows. `head` must be *called*; printing the bare
# attribute only shows the bound-method repr, not the data.
print(x_data.head())

# Normalize selected numeric features to the [0, 1] range.
# NOTE(review): the scaler is fitted on every row before the cross-validation
# split further down, so held-out folds influence the min/max used for
# scaling. Consider fitting it inside each fold if that leakage matters.
scaler = MinMaxScaler()
numeric_columns = ['product_item_price_amount', 'product_item_gross_amount']  # Only these two features
x_data[numeric_columns] = scaler.fit_transform(x_data[numeric_columns])  # Scales features based on min/max



In [None]:
# Build one fixed-length sequence per person.
time_steps = 10  # Modify as needed

# Group the DataFrame by 'person_id' (groupby yields groups in sorted key order).
grouped_data = x_data.groupby('person_id')

# For every person, keep only the scaled numeric feature columns and turn the
# rows into a 2-D NumPy array of shape (n_rows_for_person, n_features).
sequences = [
    person_rows[numeric_columns].to_numpy()
    for _, person_rows in grouped_data
]

# Pad/truncate so every sequence has exactly `time_steps` rows.
# padding='post' appends zero rows to short sequences; truncation is left at
# the Keras default ('pre'), so longer sequences keep their last `time_steps` rows.
padded_sequences = pad_sequences(
    sequences,
    maxlen=time_steps,
    padding='post',
    dtype='float32',
)

In [None]:
# Materialise the padded sequences as the 3-D LSTM input tensor:
# (number of sequences, time steps, features per step).
lstm_input = np.array(padded_sequences)
# The -1 lets NumPy infer the feature dimension from the remaining size.
lstm_input = lstm_input.reshape((lstm_input.shape[0], time_steps, -1))
# Show the resulting shape (the original cell had a bare `print`, which
# evaluates the function object without printing anything useful).
print(lstm_input.shape)

In [None]:
# Align y_data with X data: pick the target rows in the same person order as
# the sequences. `unique()` returns ids in order of first appearance in x_data.
# NOTE(review): this matches the groupby-built sequences only while x_data is
# sorted by person_id, as the preprocessing cell does — confirm if that changes.
person_ids = x_data['person_id'].unique()
targets_by_person = y_data.set_index('person_id')
y_aligned = targets_by_person.loc[person_ids].values


In [None]:
# Define the LSTM model
def create_model(input_shape):
    """Build and compile a stacked two-layer LSTM binary classifier.

    Args:
        input_shape: Shape of a single input sample, forwarded to the first
            LSTM layer. Callers in this notebook pass
            ``(time_steps, n_features)``.

    Returns:
        A compiled ``Sequential`` model consisting of an LSTM(50) that emits
        the full sequence, a second LSTM(50) that emits only its final state,
        and a single sigmoid output unit. It is compiled with the Adam
        optimizer, binary cross-entropy loss and an AUC metric.
    """
    stacked_layers = [
        # First recurrent layer returns every time step so the next LSTM
        # receives a sequence rather than a single vector.
        LSTM(50, return_sequences=True, input_shape=input_shape),
        LSTM(50),
        # Output layer. Original author's note: modify this to return score.
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(stacked_layers)
    network.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=[AUC()],
    )
    return network

In [None]:
# 10-Fold Cross-Validation
# Stratified splits keep the class ratio similar in every fold; a fresh model
# is trained per fold and its held-out AUC is collected in `auc_scores`.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
auc_scores = []

for train_index, test_index in kf.split(lstm_input, y_aligned):
    X_train, X_test = lstm_input[train_index], lstm_input[test_index]
    y_train, y_test = y_aligned[train_index], y_aligned[test_index]

    # Flatten to 2-D (samples, time_steps * features): SimpleImputer and SMOTE
    # both operate on tabular input.
    X_train_flat = X_train.reshape((X_train.shape[0], -1))
    imputer = SimpleImputer(strategy='median')
    X_train_flat = imputer.fit_transform(X_train_flat)

    # Apply the *training-fold* medians to the held-out fold as well.
    # Previously X_test was evaluated with its missing values untouched, so the
    # model saw NaNs at test time that it never saw during training.
    X_test_flat = imputer.transform(X_test.reshape((X_test.shape[0], -1)))
    X_test = X_test_flat.reshape(X_test.shape)

    # Oversample the minority class on the training fold only, then restore
    # the 3-D (samples, time_steps, features) layout the LSTM expects.
    smote = SMOTE(random_state=42)
    X_train_smote, y_train_smote = smote.fit_resample(X_train_flat, y_train.ravel())
    X_train_smote = X_train_smote.reshape((-1, time_steps, X_train.shape[2]))

    # Create and train the model, training a new model for each fold to evaluate performance
    model = create_model((time_steps, X_train.shape[2]))
    model.fit(X_train_smote, y_train_smote, epochs=10, batch_size=64)

    # Evaluate the model on the untouched (non-oversampled) held-out fold.
    # evaluate() returns [loss, *metrics]; AUC is the only compiled metric,
    # so it sits at index 1.
    scores = model.evaluate(X_test, y_test, verbose=0)
    auc_scores.append(scores[1])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [None]:
# Single-fold smoke test: run the same pipeline as the cross-validation loop
# on the first stratified split only.
#
# The original version of this cell could not execute: its body was indented
# without an enclosing block, and it indexed a single sample (lstm_input[1]),
# which has no third dimension and cannot be resampled by SMOTE. It also reset
# `auc_scores`, which would have discarded the 10-fold results before they are
# averaged; results here are therefore kept in their own variable.
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
train_index, test_index = next(kf.split(lstm_input, y_aligned))

X_train, X_test = lstm_input[train_index], lstm_input[test_index]
y_train, y_test = y_aligned[train_index], y_aligned[test_index]

# Flatten and impute using medians learned from the training split, and apply
# the same medians to the held-out split.
X_train_flat = X_train.reshape((X_train.shape[0], -1))
imputer = SimpleImputer(strategy='median')
X_train_flat = imputer.fit_transform(X_train_flat)
X_test_flat = imputer.transform(X_test.reshape((X_test.shape[0], -1)))
X_test = X_test_flat.reshape(X_test.shape)

# Apply SMOTE to the training split only, then restore the 3-D layout.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_flat, y_train.ravel())
X_train_smote = X_train_smote.reshape((-1, time_steps, X_train.shape[2]))

# Create and train a fresh model for this split
model = create_model((time_steps, X_train.shape[2]))
model.fit(X_train_smote, y_train_smote, epochs=10, batch_size=64)

# Evaluate the model; evaluate() returns [loss, AUC] for this compiled model.
scores = model.evaluate(X_test, y_test, verbose=0)
single_fold_auc = scores[1]
print(f"Single-fold AUC: {single_fold_auc}")

In [None]:
# Summarise cross-validation: arithmetic mean of the per-fold AUC values
# collected in `auc_scores`.
average_auc = np.asarray(auc_scores).mean()
print(f"Average AUC Score: {average_auc}")

Average AUC Score: 0.5527050971984864
