In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import io

# ---------------------------------------------------------
# 1. DATA LOADING AND CLEANING
# ---------------------------------------------------------

In [2]:
def load_and_clean_data(csv_path):
    """Load the order-line CSV and return it ordered for sequence building.

    Parameters
    ----------
    csv_path : str, path-like or file-like
        Anything ``pandas.read_csv`` accepts. The data must contain the
        columns ``CUST_ORDER_ID``, ``PART_ID`` and ``LOCATION_RANK``.

    Returns
    -------
    pandas.DataFrame
        Only the three columns above, sorted by ``CUST_ORDER_ID`` and then
        ``LOCATION_RANK``, with ``PART_ID`` as strings. Rows whose
        ``PART_ID`` is missing are dropped.

    Raises
    ------
    FileNotFoundError
        If ``csv_path`` names a file that does not exist.
    KeyError
        If one of the required columns is absent.
    """
    print(">>> Loading Data...")

    # Read PART_ID as text from the start. Letting pandas infer the dtype turns
    # an all-numeric column with a single blank cell into float64, so a later
    # str() conversion would produce '1030076001.0' / 'nan' and would also
    # strip leading zeros from IDs such as '0042'.
    df = pd.read_csv(csv_path, dtype={'PART_ID': str})

    # Selecting only necessary columns
    df = df[['CUST_ORDER_ID', 'PART_ID', 'LOCATION_RANK']]

    # A row without a part cannot be a step in a sequence; keeping it would
    # introduce a bogus 'nan' part into the vocabulary.
    df = df.dropna(subset=['PART_ID'])

    # Sort by Order ID and then by Rank to ensure the sequence is correct
    df = df.sort_values(by=['CUST_ORDER_ID', 'LOCATION_RANK'])

    # Guarantee plain strings regardless of how the reader represented them
    df = df.assign(PART_ID=df['PART_ID'].astype(str))

    print(f"Data loaded: {len(df)} rows.")
    return df

# ---------------------------------------------------------
# 2. DATA PREPROCESSING (Tokenization & Sequence Creation)
# ---------------------------------------------------------

In [3]:
class SequencePreprocessor:
    """Encode part IDs and build padded next-part training samples.

    After ``fit_transform``:
      * ``le``               fitted LabelEncoder; index 0 is ``PAD_TOKEN``,
                             real parts occupy indices 1..N
      * ``vocab_size``       number of encoder classes (padding included)
      * ``max_sequence_len`` length of the longest n-gram (inputs + target)
    """

    # The empty string sorts before every other string, so fitting it together
    # with the real part IDs makes the encoder hand out index 0 to it. That
    # keeps 0 free for padding: no real part shares the padding value, and
    # anything that encodes/decodes through ``self.le`` stays consistent.
    PAD_TOKEN = ''

    def __init__(self):
        self.le = LabelEncoder()
        self.vocab_size = 0
        self.max_sequence_len = 0

    def fit_transform(self, df):
        """Fit the encoder on ``df`` and return ``(X, y)`` for training.

        Parameters
        ----------
        df : pandas.DataFrame
            Needs ``CUST_ORDER_ID`` and ``PART_ID`` columns. Rows are used in
            the order given within each order, so ``df`` should already be
            sorted into pick order. A ``PART_ID_ENCODED`` column is written
            onto ``df``.

        Returns
        -------
        X : numpy.ndarray
            Left-padded prefixes, one row per sample, width
            ``max_sequence_len - 1``.
        y : numpy.ndarray
            Integer label of the part that follows each prefix (kept as
            integers for sparse categorical cross-entropy).

        Raises
        ------
        ValueError
            If no order contains at least two parts, so there is nothing to
            predict.
        """
        print(">>> Preprocessing sequences...")

        # Encode Part IDs to Integers (1..N; 0 is reserved for padding)
        part_ids = df['PART_ID'].astype(str)
        vocabulary = np.array([self.PAD_TOKEN, *part_ids.unique()], dtype=object)
        self.le.fit(vocabulary)
        df['PART_ID_ENCODED'] = self.le.transform(part_ids)
        self.vocab_size = len(self.le.classes_)

        # Group by Order ID to get lists of parts
        order_groups = df.groupby('CUST_ORDER_ID')['PART_ID_ENCODED'].apply(list)

        # Generate N-gram sequences
        # If an order is [A, B, C], we create [A, B] (A -> B) and [A, B, C] (A, B -> C)
        input_sequences = [
            order[:end]
            for order in order_groups
            for end in range(2, len(order) + 1)
        ]
        if not input_sequences:
            raise ValueError(
                "No training sequences could be built: every order needs at least two parts."
            )

        # Pad sequences to ensure uniform length
        self.max_sequence_len = max(len(seq) for seq in input_sequences)
        input_sequences = pad_sequences(
            input_sequences, maxlen=self.max_sequence_len, padding='pre'
        )

        # Split into X (Features) and y (Target): last column is the part to predict
        X, y = input_sequences[:, :-1], input_sequences[:, -1]

        print(f"Vocabulary Size: {self.vocab_size}")
        print(f"Max Sequence Length: {self.max_sequence_len}")
        print(f"Training Samples: {len(X)}")

        return X, y


# ---------------------------------------------------------
# 3. MODEL CREATION (LSTM)
# ---------------------------------------------------------

In [4]:
def create_lstm_model(vocab_size, max_seq_len, embedding_dim=64, lstm_units=100, dropout_rate=0.2):
    """Build and compile the next-part LSTM classifier.

    Parameters
    ----------
    vocab_size : int
        Number of distinct integer tokens; used both as the embedding input
        size and as the width of the softmax output.
    max_seq_len : int
        Length of the longest full sequence (inputs plus target). The model
        consumes ``max_seq_len - 1`` tokens.
    embedding_dim : int, default 64
        Size of each token embedding.
    lstm_units : int, default 100
        Number of units in the LSTM layer.
    dropout_rate : float, default 0.2
        Dropout applied after the LSTM.

    Returns
    -------
    A compiled Keras ``Sequential`` model (Adam, sparse categorical
    cross-entropy, accuracy metric).

    Raises
    ------
    ValueError
        If ``max_seq_len`` is below 2, which leaves no input tokens.
    """
    print(">>> Building Model...")
    if max_seq_len < 2:
        raise ValueError("max_seq_len must be at least 2 (one input token plus the target).")

    model = Sequential()

    # Declare the input shape explicitly. Embedding's `input_length` argument is
    # deprecated in Keras 3, and without a declared shape the model stays
    # unbuilt until the first batch, so summary() cannot report shapes.
    model.add(tf.keras.Input(shape=(max_seq_len - 1,), dtype="int32"))

    # Embedding Layer: Turns integer Part IDs into dense vectors
    model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

    # LSTM Layer: Captures the sequence order dependencies
    model.add(LSTM(lstm_units, return_sequences=False))
    model.add(Dropout(dropout_rate))  # Prevent overfitting

    # Output Layer: Probability distribution over all possible parts
    model.add(Dense(vocab_size, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# ---------------------------------------------------------
# 4. IMPLEMENTATION & PREDICTION
# ---------------------------------------------------------

In [5]:
def predict_next_part(model, preprocessor, current_sequence_of_parts):
    """Predict the part most likely to follow ``current_sequence_of_parts``.

    Parameters
    ----------
    model
        Object with a Keras-style ``predict(x, verbose=0)`` returning one row
        of class scores per input row.
    preprocessor
        Fitted object exposing ``le.classes_`` (label for each integer index)
        and ``max_sequence_len``.
    current_sequence_of_parts : iterable
        Part IDs in pick order; each is compared as ``str(part)``. Parts that
        were not seen when the encoder was fitted are skipped.

    Returns
    -------
    The predicted part label, or the string ``"Unknown Sequence"`` when none
    of the given parts is known to the encoder.
    """
    # 1. Encode the input list of parts
    # One lookup table replaces a LabelEncoder.transform call (and a try/except)
    # per part; the index of a label in classes_ is exactly what transform returns.
    known_parts = preprocessor.le.classes_
    index_of = {label: idx for idx, label in enumerate(known_parts)}
    encoded_seq = [
        index_of[str(part)]
        for part in current_sequence_of_parts
        if str(part) in index_of  # unknown parts are ignored on purpose
    ]

    if not encoded_seq:
        return "Unknown Sequence"

    # 2. Pad the sequence (longer inputs keep only their most recent parts)
    padded_seq = pad_sequences([encoded_seq], maxlen=preprocessor.max_sequence_len-1, padding='pre')

    # 3. Predict
    predicted_probs = model.predict(padded_seq, verbose=0)
    # The output layer may be wider than the list of labels; an argmax over the
    # full row could then pick an index that has no label and cannot be decoded.
    # Restrict the choice to indices the encoder can actually map back.
    decodable_scores = np.asarray(predicted_probs)[0, :len(known_parts)]
    predicted_class = int(np.argmax(decodable_scores))

    # 4. Decode back to Part ID
    predicted_part = known_parts[predicted_class]

    return predicted_part

# ---------------------------------------------------------
# MAIN EXECUTION BLOCK
# ---------------------------------------------------------

In [6]:
if __name__ == "__main__":
    # A. Configuration
    FILE_NAME = 'newTestingData.csv'
    EPOCHS = 10  # Increase this for better accuracy
    BATCH_SIZE = 32
    SEED = 42  # Fixed seed so weight init, shuffling and dropout repeat across runs

    # Seeds Python's `random`, NumPy and the Keras backend in one call
    tf.keras.utils.set_random_seed(SEED)

    # B. Run Pipeline
    # Only loading can raise FileNotFoundError, so only loading is guarded;
    # errors from preprocessing or training surface unchanged.
    try:
        # Load
        df = load_and_clean_data(FILE_NAME)
    except FileNotFoundError:
        print(f"Error: Make sure '{FILE_NAME}' is in the same directory.")
    else:
        # Preprocess
        processor = SequencePreprocessor()
        X, y = processor.fit_transform(df)

        # Model
        model = create_lstm_model(processor.vocab_size, processor.max_sequence_len)
        # summary() prints the table itself and returns None; wrapping it in
        # print() only adds a stray "None" line to the output.
        model.summary()

        # Train
        print(">>> Training Model...")
        model.fit(X, y, epochs=EPOCHS, batch_size=BATCH_SIZE, verbose=1)
        print(">>> Training Complete.")

        # C. Demonstration / Testing
        print("\n>>> Testing Prediction:")

        # First two part IDs of a real order from the data (noted by the
        # author as CUST_ORDER_ID '033180').
        test_sequence = ['1030076001', '1050065051']
        print(f"Input Sequence: {test_sequence}")

        prediction = predict_next_part(model, processor, test_sequence)
        print(f"Predicted Next Part: {prediction}")

        # Per the author's note, the part that follows these two in the file
        # is '1083312001' -- compare it with the prediction printed above.

>>> Loading Data...
Data loaded: 4205 rows.
>>> Preprocessing sequences...
Vocabulary Size: 337
Max Sequence Length: 32
Training Samples: 3428
>>> Building Model...




None
>>> Training Model...
Epoch 1/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 10ms/step - accuracy: 0.0333 - loss: 5.2059
Epoch 2/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0370 - loss: 4.9553
Epoch 3/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.0385 - loss: 4.8792
Epoch 4/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.0508 - loss: 4.7865
Epoch 5/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - accuracy: 0.0668 - loss: 4.6800
Epoch 6/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.0785 - loss: 4.5919
Epoch 7/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0849 - loss: 4.5092
Epoch 8/10
[1m108/108[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0913 - loss: 4.4312
Epoch