In [None]:
# Install TensorFlow
!pip install tensorflow

import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Load Dataset from CSV File
def load_data_from_csv(csv_file_path):
    """Read a CSV file and return its 'IDENTITY' column as a list of strings.

    The column names of the loaded dataset are printed as a debugging aid.

    Args:
        csv_file_path: Path of the CSV file, passed straight to ``pd.read_csv``.

    Returns:
        A list with one ``str`` per row; missing values become ``''``.

    Raises:
        KeyError: If the dataset has no column named 'IDENTITY'.
    """
    frame = pd.read_csv(csv_file_path)
    print("Columns in dataset:", frame.columns)  # Debugging aid

    # Fail early with an explicit message when the expected column is absent.
    if 'IDENTITY' not in frame.columns:
        raise KeyError("Column 'IDENTITY' not found in dataset.")

    # Replace missing entries with empty strings, then force every value to str.
    identity_column = frame['IDENTITY']
    without_gaps = identity_column.fillna('')
    return without_gaps.astype(str).tolist()

# Preprocess Data
def preprocess_data(names):
    """Encode names as fixed-width, zero-padded sequences of character ids.

    Args:
        names: Non-empty list of strings to encode.

    Returns:
        A 3-tuple ``(padded_sequences, total_chars, tokenizer)``:
        the padded id matrix (one row per name, padded at the end up to the
        length of the longest name), the number of distinct character ids
        plus one, and the fitted ``Tokenizer``.

    Raises:
        ValueError: If ``names`` is empty (raised by ``max``).
    """
    # Character-level tokenizer: every character gets its own integer id.
    char_tokenizer = Tokenizer(char_level=True)
    char_tokenizer.fit_on_texts(names)
    encoded = char_tokenizer.texts_to_sequences(names)

    # All rows are padded at the end ('post') to the longest name's length.
    longest = max(map(len, names))
    padded = pad_sequences(encoded, maxlen=longest, padding='post')

    # One more than the number of known characters; presumably the extra slot
    # accounts for id 0, which pad_sequences uses as filler by default.
    vocab_size = len(char_tokenizer.word_index) + 1
    return padded, vocab_size, char_tokenizer

# Build LSTM Model
def build_model(input_length, total_chars):
    """Build an uncompiled Embedding -> LSTM -> softmax classifier.

    The input shape is declared with an explicit ``Input`` layer rather than
    the ``input_length`` argument of ``Embedding``, which is deprecated in
    Keras 3 and only triggers a warning there.

    Args:
        input_length: Number of time steps (character ids) per input row.
        total_chars: Size of the id space; used both as the embedding's
            ``input_dim`` and as the number of softmax output classes.

    Returns:
        A ``Sequential`` model mapping ``(batch, input_length)`` integer ids
        to ``(batch, total_chars)`` class probabilities. The caller is
        responsible for compiling it.
    """
    # Local import keeps this block self-contained; the file's top-level
    # import line only brings in Dense, LSTM and Embedding.
    from tensorflow.keras.layers import Input

    model = Sequential()
    model.add(Input(shape=(input_length,)))
    model.add(Embedding(input_dim=total_chars, output_dim=50))
    # LSTM with default settings emits only its final output, so the model
    # produces one prediction per input row (many-to-one).
    model.add(LSTM(128))
    model.add(Dense(total_chars, activation='softmax'))
    return model

# Main Program
def make_next_char_dataset(padded_sequences):
    """Turn zero-padded id sequences into (prefix, next-character) pairs.

    For every row that holds at least two non-zero ids, the last non-zero id
    becomes the integer class label and everything before it becomes the
    input. Rows with fewer than two ids are dropped because they have no
    prefix to predict from.

    Assumes rows are padded at the end with 0 and that 0 never appears as a
    real id (true for the output of ``preprocess_data`` in this file).

    Args:
        padded_sequences: 2-D integer array of shape ``(n_rows, width)``.

    Returns:
        ``(X, y)`` where ``X`` has shape ``(n_kept, width - 1)`` and ``y`` has
        shape ``(n_kept,)`` and holds plain integer class labels suitable for
        ``sparse_categorical_crossentropy``.
    """
    padded_sequences = np.asarray(padded_sequences)
    lengths = np.count_nonzero(padded_sequences, axis=1)
    usable = lengths >= 2

    # Boolean indexing returns a copy, so the caller's array is not modified.
    prefixes = padded_sequences[usable]
    last_pos = lengths[usable] - 1
    rows = np.arange(len(prefixes))

    targets = prefixes[rows, last_pos]
    # Blank out the target so it cannot leak into the input; afterwards the
    # final column is always padding and can be dropped.
    prefixes[rows, last_pos] = 0
    return prefixes[:, :-1], targets


if __name__ == "__main__":
    # Load data from CSV file (update path accordingly)
    csv_file_path = '/written_name_test_v2.csv'  # Replace with your CSV file path

    # Only the load step is guarded: a KeyError raised later in the pipeline
    # would otherwise be misreported as a missing 'IDENTITY' column.
    try:
        names = load_data_from_csv(csv_file_path)
    except KeyError as e:
        print(f"Error: {e}. Please check your dataset and ensure it contains a column named 'IDENTITY'.")
    else:
        # Preprocess data
        padded_sequences, total_chars, tokenizer = preprocess_data(names)

        # Build (prefix -> next character) training pairs. The previous code
        # applied np.argmax to integer token ids as if they were one-hot
        # vectors, which produced the *position* of the largest id instead of
        # a character label.
        X, y = make_next_char_dataset(padded_sequences)

        # Split into training and validation sets
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

        # Build and compile the model
        model = build_model(X_train.shape[1], total_chars)
        model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        # Train with integer labels (no one-hot encoding needed for the sparse loss)
        model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=10)

        # Evaluate the model on validation set
        loss, accuracy = model.evaluate(X_val, y_val)
        print(f'Validation Accuracy: {accuracy * 100:.2f}%')

Columns in dataset: Index(['FILENAME', 'IDENTITY'], dtype='object')
Epoch 1/10




[1m1035/1035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 38ms/step - accuracy: 0.3334 - loss: 1.7592 - val_accuracy: 0.4884 - val_loss: 1.2463
Epoch 2/10
[1m1035/1035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 39ms/step - accuracy: 0.6439 - loss: 0.9467 - val_accuracy: 0.8714 - val_loss: 0.4099
Epoch 3/10
[1m1035/1035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.8786 - loss: 0.3660 - val_accuracy: 0.9361 - val_loss: 0.1908
Epoch 4/10
[1m1035/1035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 38ms/step - accuracy: 0.9371 - loss: 0.2008 - val_accuracy: 0.9645 - val_loss: 0.1238
Epoch 5/10
[1m1035/1035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 40ms/step - accuracy: 0.9616 - loss: 0.1275 - val_accuracy: 0.9653 - val_loss: 0.1168
Epoch 6/10
[1m1035/1035[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 39ms/step - accuracy: 0.9656 - loss: 0.1143 - val_accuracy: 0.9688 - val_loss: 0.1029
Epoch 7/10
[1m