In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import re
import matplotlib.pyplot as plt

# Paths to the training and validation CSV files
train_data_path = '/content/cleaned_pre_train.csv'
val_data_path = '/content/cleaned_pre_val.csv'

# Read each split into its own DataFrame
train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in (train_data_path, val_data_path)
)

# Basic cleaning function
def clean_text(text):
    """Normalise a phrase to lowercase ASCII letters separated by single spaces.

    Args:
        text: The raw phrase. Missing values (``None`` or a float NaN, which
            is what pandas yields for an empty CSV cell) are treated as an
            empty phrase; any other non-string value is converted with
            ``str()`` before cleaning.

    Returns:
        The cleaned string: lowercased, with every character other than
        ``a``-``z`` and whitespace removed, whitespace runs collapsed to one
        space, and leading/trailing whitespace stripped.
    """
    # NaN is the only float that is not equal to itself; checking it this way
    # avoids calling .lower() on a float and crashing the whole .apply().
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove non-alphabetic characters
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces
    return text

# Run clean_text over the 'phrases' column of both splits
train_df['phrases'], val_df['phrases'] = (
    split['phrases'].apply(clean_text) for split in (train_df, val_df)
)

# Simple character-level tokenizer
def tokenize(text, vocab):
    """Map each character of *text* to its ID in *vocab*.

    Characters that are not keys of *vocab* are skipped, so the result may be
    shorter than *text*.
    """
    token_ids = []
    for symbol in text:
        if symbol in vocab:
            token_ids.append(vocab[symbol])
    return token_ids

# Collect the text of both splits; phrases within a split are space-joined
all_text = ''.join([' '.join(train_df['phrases']), ' '.join(val_df['phrases'])])
# Character IDs start at 1 so that 0 stays free for padding
vocab = {symbol: position + 1 for position, symbol in enumerate(sorted(set(all_text)))}
vocab_size = 1 + len(vocab)  # One extra slot for the padding ID

# Convert every phrase into its list of character IDs
train_df['encoded'] = train_df['phrases'].apply(lambda phrase: tokenize(phrase, vocab))
val_df['encoded'] = val_df['phrases'].apply(lambda phrase: tokenize(phrase, vocab))

# Padding function
def pad_sequences(sequences, max_len):
    """Return a list with every sequence brought to exactly *max_len* items.

    Shorter sequences are right-padded with 0; sequences of length *max_len*
    or more are cut down to their first *max_len* items.
    """
    fixed_length = []
    for seq in sequences:
        shortfall = max_len - len(seq)
        if shortfall > 0:
            fixed_length.append(seq + [0] * shortfall)
        else:
            fixed_length.append(seq[:max_len])
    return fixed_length

# Fixed length every encoded sequence is padded or truncated to
max_len = 128

# Bring both encoded splits to that fixed length
train_encoded, val_encoded = (
    pad_sequences(split['encoded'], max_len) for split in (train_df, val_df)
)

# Custom Dataset class for LSTM
class HinglishDataset(Dataset):
    """Dataset of encoded sequences served as (input, target) pairs.

    The target is the input shifted one position to the left, so each input
    position is paired with the ID that follows it.
    """

    def __init__(self, encoded_data):
        # Stored as one long-integer tensor built from the encoded sequences
        self.data = torch.tensor(encoded_data, dtype=torch.long)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Input drops the last ID; target drops the first (next-char prediction)
        inputs = self.data[idx, :-1]
        targets = self.data[idx, 1:]
        return inputs, targets

# Wrap the padded sequences in HinglishDataset instances
train_dataset, val_dataset = HinglishDataset(train_encoded), HinglishDataset(val_encoded)

# Batches of 8; only the training split is shuffled
train_loader = DataLoader(dataset=train_dataset, shuffle=True, batch_size=8)
val_loader = DataLoader(dataset=val_dataset, shuffle=False, batch_size=8)

# Define the LSTM model
class LSTMModel(nn.Module):
    """Embedding -> stacked LSTM -> linear layer producing per-position logits."""

    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        # batch_first=True: the LSTM takes and returns batch-major tensors
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        """Return vocabulary logits for every position of the ID tensor *x*."""
        embedded = self.embedding(x)
        hidden_states, _ = self.lstm(embedded)  # final (h, c) state is not used
        return self.fc(hidden_states)

# Hyperparameters of the network
embed_size, hidden_size, num_layers = 128, 256, 2
model = LSTMModel(
    vocab_size=vocab_size,
    embed_size=embed_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
)

# Cross-entropy that skips targets equal to the padding ID 0, optimised with Adam
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)

# Training loop
def _run_epoch(model, loader, loss_fn, opt=None):
    """Run one pass over *loader* and return the mean per-batch loss.

    When *opt* is given the model is put in training mode and its weights are
    updated after every batch; otherwise the model is put in eval mode and
    gradients are disabled. Returns NaN if the loader yields no batches.
    """
    training = opt is not None
    model.train(training)
    running_loss = 0.0
    batch_count = 0
    with torch.set_grad_enabled(training):
        for inputs, targets in loader:
            if training:
                opt.zero_grad()
            outputs = model(inputs)
            # Flatten all positions into one axis; the class dimension is read
            # from the logits rather than from a module-level vocab_size.
            loss = loss_fn(outputs.reshape(-1, outputs.size(-1)), targets.reshape(-1))
            if training:
                loss.backward()
                opt.step()
            running_loss += loss.item()
            batch_count += 1
    # Batches are counted while iterating so loaders without __len__ work and
    # an empty loader cannot cause a division by zero.
    return running_loss / batch_count if batch_count else float('nan')


def train_model(model, train_loader, val_loader, epochs=3, *, loss_fn=None, opt=None):
    """Train *model* for *epochs* epochs, validating after each one.

    Args:
        model: Module mapping a batch of inputs to logits whose last
            dimension is the number of classes.
        train_loader: Iterable of ``(inputs, targets)`` training batches.
        val_loader: Iterable of ``(inputs, targets)`` validation batches.
        epochs: Number of passes over ``train_loader``.
        loss_fn: Loss callable taking ``(logits, targets)``. Defaults to the
            module-level ``criterion``.
        opt: Optimizer stepping the model's parameters. Defaults to the
            module-level ``optimizer``.

    Returns:
        ``(train_losses, val_losses)``: two lists with one mean per-batch
        loss per epoch. An entry is NaN when its loader yielded no batches.
    """
    # Fall back to the notebook's globals so existing calls keep working.
    loss_fn = criterion if loss_fn is None else loss_fn
    opt = optimizer if opt is None else opt

    train_losses = []
    val_losses = []

    for epoch in range(epochs):
        print(f"Starting Epoch {epoch+1}/{epochs}")

        avg_train_loss = _run_epoch(model, train_loader, loss_fn, opt)
        train_losses.append(avg_train_loss)

        # Validation phase: no optimizer, so no weight updates or gradients
        avg_val_loss = _run_epoch(model, val_loader, loss_fn)
        val_losses.append(avg_val_loss)

        print(f"Epoch {epoch+1}/{epochs}, Training Loss: {avg_train_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")

    return train_losses, val_losses

# Train the model
train_losses, val_losses = train_model(model, train_loader, val_loader, epochs=3)

# Plotting training and validation loss.
# The x-axis is derived from the recorded history instead of a hard-coded
# range, so changing `epochs` above cannot cause an x/y length mismatch.
epoch_axis = range(1, len(train_losses) + 1)
plt.plot(epoch_axis, train_losses, label='Training Loss')
plt.plot(epoch_axis, val_losses, label='Validation Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

# Save the model weights (state_dict only)
torch.save(model.state_dict(), 'hinglish_lstm_model.pth')
print("Model saved successfully!")

# Save the train and validation DataFrames, including their 'encoded' column
train_df.to_csv('/content/train_encoded.csv', index=False)
val_df.to_csv('/content/val_encoded.csv', index=False)
print("Encoded datasets saved successfully!")


Starting Epoch 1/3
Epoch 1/3, Training Loss: 0.9292, Validation Loss: 1.8527
Starting Epoch 2/3
Epoch 2/3, Training Loss: 0.7951, Validation Loss: 1.8021
Starting Epoch 3/3
