In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [None]:
# Read the pre-cleaned training and validation phrase tables from disk
train_data = pd.read_csv('cleaned_pre_train.csv')
val_data = pd.read_csv('cleaned_pre_val.csv')

# Preview the top rows of each table; sep="\n" puts the heading and the
# preview on separate lines, exactly as two consecutive print calls would
print("Train Data:", train_data.head(), sep="\n")
print("\nValidation Data:", val_data.head(), sep="\n")


Train Data:
                                             phrases
0                               film ka kya naam hai
1  namaste sada hua tomatoes score mahaan hai lek...
2  kya aapako lagata hai ki aapako film pasand aa...
3                        yah kis tarah kee philm hai
4                                film kab banee thee

Validation Data:
                                             phrases
0  snow may fall for parts of cny the next few da...
1  kaale megha kaale megha paani toh barsao megha...
2                            sallu bhai kaha ho thum
3     guessthesong aao kho jaaye sitaaron mein kahin
4  chidambaram ka kuch nahi ukhaad paayenge becua...


In [None]:
import re

# Basic cleaning function
def clean_text(text):
    """Normalize a phrase to lowercase alphabetic words separated by single spaces.

    Steps, in order:
      1. Lowercase the text.
      2. Drop every word that contains a digit (e.g. ``abc123``, ``2024``).
      3. Delete every remaining character that is not an ASCII letter or
         whitespace.
      4. Collapse whitespace runs to one space and trim both ends.

    Args:
        text: The raw phrase. Non-string values (``None``, the float ``NaN``
            pandas uses for a missing cell, ...) are treated as empty.

    Returns:
        The cleaned phrase, or ``''`` when nothing survives cleaning.
    """
    # A missing cell is not a str and has no .lower(); treat it as empty
    # instead of raising AttributeError in the middle of an .apply().
    if not isinstance(text, str):
        return ''

    # Convert to lowercase (if using uncased model)
    text = text.lower()

    # Remove words that contain digits. This must run BEFORE the letters-only
    # filter below: once digits are stripped, "abc123" has already become
    # "abc" and can no longer be recognised as a digit-bearing word.
    text = re.sub(r'\b\w*\d\w*\b', '', text)

    # Keep only alphabetic characters and whitespace
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()

    return text

# Run clean_text over the "phrases" column of both frames, overwriting the
# column in place with the cleaned text
train_data['phrases'] = train_data['phrases'].map(clean_text)
val_data['phrases'] = val_data['phrases'].map(clean_text)

# Spot-check the result: first rows of each cleaned column, train then validation
print(train_data['phrases'].head(), val_data['phrases'].head(), sep="\n")

0                                 film ka kya naam hai
1    namaste sada hua tomatoes score mahaan hai lek...
2    kya aapako lagata hai ki aapako film pasand aa...
3                          yah kis tarah kee philm hai
4                                  film kab banee thee
Name: phrases, dtype: object
0    snow may fall for parts of cny the next few da...
1    kaale megha kaale megha paani toh barsao megha...
2                              sallu bhai kaha ho thum
3       guessthesong aao kho jaaye sitaaron mein kahin
4    chidambaram ka kuch nahi ukhaad paayenge becua...
Name: phrases, dtype: object


In [None]:
# Materialise each "phrases" column as a plain Python list of strings
train_phrases = list(train_data['phrases'].astype(str))
val_phrases = list(val_data['phrases'].astype(str))

# Build the word index from the training phrases only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_phrases)

# Vocabulary size: number of indexed words plus one extra slot
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 38201


In [None]:
# Encode every phrase as a list of integer word indices with the fitted tokenizer
train_sequences, val_sequences = (
    tokenizer.texts_to_sequences(phrases)
    for phrases in (train_phrases, val_phrases)
)

# Show the first five encoded training phrases
print("Example of tokenized sequences:", train_sequences[:5], sep="\n")


Example of tokenized sequences:
[[625, 14, 5, 279, 3], [3102, 6124, 306, 1351, 1340, 3800, 3, 441, 13203, 4604, 1340, 18, 18576, 1079, 1393, 13, 287, 319, 1051, 3], [5, 2341, 1051, 3, 12, 2341, 625, 360, 18577], [462, 290, 437, 1079, 1393, 3], [625, 64, 13204, 2076]]


In [None]:
# Function to create input-output pairs for next-word prediction
def create_sequences(data, seq_length=10):
    """Build (context, next-token) pairs for next-word prediction.

    For every position ``i >= 1`` in each token sequence, the tokens before
    position ``i`` form the input context and the token at ``i`` is the label.
    Each context keeps at most the ``seq_length`` most recent tokens.

    Args:
        data: Iterable of token-id sequences (lists), one per phrase.
        seq_length: Maximum number of context tokens kept per pair. Pass
            ``None`` to keep the whole prefix. Must be >= 1 otherwise.

    Returns:
        ``(input_sequences, labels)``: a list of context lists and a parallel
        list holding the token that follows each context. Sequences with
        fewer than two tokens contribute no pairs.

    Raises:
        ValueError: If ``seq_length`` is given and is smaller than 1.
    """
    if seq_length is not None and seq_length < 1:
        raise ValueError(f"seq_length must be >= 1 or None, got {seq_length}")

    input_sequences = []
    labels = []
    for line in data:
        for i in range(1, len(line)):
            # Keep only the trailing window of the prefix. Storing full
            # prefixes of long phrases wastes memory, and a later
            # pad_sequences(maxlen=seq_length) with its default
            # truncating='pre' would keep exactly these trailing tokens anyway.
            start = 0 if seq_length is None else max(0, i - seq_length)
            input_sequences.append(line[start:i])
            labels.append(line[i])
    return input_sequences, labels

# Turn the tokenized phrases of each split into (context, next-word) pairs
train_input, train_labels = create_sequences(data=train_sequences)
val_input, val_labels = create_sequences(data=val_sequences)


In [None]:
# Fixed context width shared by the padded inputs
max_sequence_length = 10

# Left-pad (padding='pre') every context up to that width, for both splits
train_input, val_input = (
    pad_sequences(contexts, maxlen=max_sequence_length, padding='pre')
    for contexts in (train_input, val_input)
)

# Show the first five padded training contexts
print("Padded Input Example:", train_input[:5], sep="\n")


Padded Input Example:
[[   0    0    0    0    0    0    0    0    0  625]
 [   0    0    0    0    0    0    0    0  625   14]
 [   0    0    0    0    0    0    0  625   14    5]
 [   0    0    0    0    0    0  625   14    5  279]
 [   0    0    0    0    0    0    0    0    0 3102]]


In [None]:
# Expand each integer label into a one-hot row with vocab_size columns
# NOTE(review): this is a dense (num_samples, vocab_size) array; with a large
# vocabulary it may not fit in memory — confirm before running on the full data.
train_labels, val_labels = (
    tf.keras.utils.to_categorical(next_words, num_classes=vocab_size)
    for next_words in (train_labels, val_labels)
)

# Show the first five encoded training labels
print("One-hot encoded label example:", train_labels[:5], sep="\n")


One-hot encoded label example:
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Next-word model: word embeddings -> two stacked LSTMs -> softmax over the vocabulary
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))  # full sequence out, consumed by the next LSTM
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

# Categorical cross-entropy with the Adam optimizer; accuracy is reported as a metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the layer-by-layer overview
model.summary()




In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout


In [None]:
# Fit a fresh tokenizer on the training sentences
# (train_phrases must already exist as the list of training sentences)
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_phrases)

# Number of indexed words, plus one slot for padding
vocab_size = 1 + len(tokenizer.word_index)
print(f"Vocabulary Size: {vocab_size}")


Vocabulary Size: 38201


In [None]:
# Assemble the LSTM next-word model one layer at a time
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, input_length=max_sequence_length))
model.add(LSTM(128, return_sequences=True))  # sequence output feeds the second LSTM
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))  # one probability per vocabulary index

# Adam optimizer, categorical cross-entropy loss, accuracy metric
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print the architecture overview
model.summary()


In [None]:
# Lighter variant of the LSTM next-word model: smaller embedding, LSTM and
# Dense widths than the 100/128/128 configuration defined earlier.
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=50, input_length=max_sequence_length),  # Reduce embedding size
    LSTM(64, return_sequences=True),  # Reduce LSTM units
    Dropout(0.2),
    LSTM(64),  # Reduce LSTM units
    Dropout(0.2),
    Dense(64, activation='relu'),  # Reduce Dense layer units
    Dense(vocab_size, activation='softmax')
])

# Rebinding `model` replaces the previously compiled model with a brand-new,
# uncompiled one, so it has to be compiled again (same settings as the larger
# model) before it can be trained or evaluated.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])


In [None]:
# Write the model to a .h5 file
# NOTE(review): no model.fit(...) call is visible in this notebook before this
# cell — confirm the model has actually been trained before relying on this file.
model.save('hinglish_auto_suggestion_model.h5')

# Write the tokenizer's JSON serialisation alongside it
with open('tokenizer.json', 'w') as tokenizer_file:
    tokenizer_file.write(tokenizer.to_json())

print("Model and tokenizer saved successfully!")




Model and tokenizer saved successfully!
