<a href="https://colab.research.google.com/github/himanshudas13/AutoComplete/blob/main/SmartCompose.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
projjal1_human_conversation_training_data_path = kagglehub.dataset_download('projjal1/human-conversation-training-data')

print('Data source import complete.')


In [None]:
import re

In [None]:
with open('/kaggle/input/human-conversation-training-data/human_chat.txt', 'r') as file:
    data = file.readlines()

cleaned_lines = [re.sub(r'^Human [12]:\s*', '', line).strip() for line in data if line.strip()]
with open('text_data.txt', 'w') as file:
    file.write('\n'.join(cleaned_lines))

print("Sample cleaned data:")
print(cleaned_lines[:5])


Sample cleaned data:
['Hi!', 'What is your favorite holiday?', 'one where I get to meet lots of different people.', 'What was the most number of people you have ever met during a holiday?', 'Hard to keep a count. Maybe 25.']


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the cleaned dataset
with open('/kaggle/working/text_data.txt', 'r') as file:
    cleaned_lines = file.readlines()

tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned_lines)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(cleaned_lines)

print(sequences)



[[25], [13, 9, 26, 119, 309], [72, 87, 1, 71, 3, 224, 277, 7, 208, 96], [13, 32, 4, 143, 654, 7, 96, 2, 15, 132, 398, 259, 5, 309], [186, 3, 399, 5, 655, 172, 1328], [107, 309, 32, 11], [1, 34, 6, 32, 400], [10, 2, 209, 278, 3, 4, 96, 2, 398], [30, 53, 4, 1329, 17, 334, 878, 656, 19, 24, 879, 3, 368, 87, 96, 17, 260, 80, 8, 13, 880, 3, 88], [158, 44, 46, 1, 144, 16, 657, 240, 1330, 881, 12, 1331, 7, 2, 8, 882, 2, 86, 1332, 3, 883, 29, 88, 12, 11, 529, 12, 884, 658, 659, 10, 2, 132, 144, 16, 2, 73, 139, 22, 881, 1333, 88, 1334, 2], [13, 10, 2, 310], [1, 34, 24, 16, 5, 1335, 401, 240, 145, 54, 187, 1336, 3, 96, 19, 51, 28, 1337, 885, 1, 210, 1338, 161, 3, 5, 402, 12, 5, 453, 8, 1, 1339, 5, 886, 12, 23, 188, 1, 1340, 4, 660, 69, 403, 92, 5, 886, 6, 530, 70, 11, 403, 61, 12, 4, 1341, 661, 8, 531, 403, 887, 5, 888, 454, 1342, 42, 5, 532, 1, 92, 108, 398, 4, 402, 114, 8, 108, 279, 311, 404], [97, 60, 146, 1343, 1344], [280, 40, 261, 211, 662, 663, 60, 127, 7, 5, 62, 162, 2, 163, 12, 5, 1345,

In [None]:
# Vocabulary size
vocab_size = len(tokenizer.word_index) + 1
print(f"Vocabulary size: {vocab_size}")




Vocabulary size: 2809


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Prepare input-output pairs for training
input_sequences = []
for seq in sequences:
    for i in range(1, len(seq)):
        input_sequences.append(seq[:i+1])

max_length = max([len(seq) for seq in input_sequences])
input_sequences = pad_sequences(input_sequences, maxlen=max_length, padding='pre')




In [None]:
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [None]:
# One-hot encode the labels
from tensorflow.keras.utils import to_categorical
y = to_categorical(y, num_classes=vocab_size)

print(f"Shape of X: {X.shape}, Shape of y: {y.shape}")

Shape of X: (18070, 150), Shape of y: (18070, 2809)


In [None]:
# Idea behind architecture:
# Embedding->LSTM->Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define model architecture
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=128, input_length=max_length - 1),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(512, return_sequences=True),
    Dropout(0.3),
    LSTM(256),
    Dropout(0.2),
    Dense(128, activation='relu'),
    Dense(vocab_size, activation='softmax')
])

# Compile the model
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy']
)

# Model summary
model.summary()


In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau

# Define callbacks
callbacks = [
    EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True, verbose=1),
    ModelCheckpoint(filepath='best_model.keras', monitor='val_loss', save_best_only=True, verbose=1),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, verbose=1, min_lr=1e-6)
]

# Train the model with 100 epochs
history = model.fit(
    X, y,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=callbacks,
    verbose=1
)


Epoch 1/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0278 - loss: 6.7848
Epoch 1: val_loss improved from inf to 6.51761, saving model to best_model.keras
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 74ms/step - accuracy: 0.0278 - loss: 6.7838 - val_accuracy: 0.0277 - val_loss: 6.5176 - learning_rate: 0.0010
Epoch 2/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step - accuracy: 0.0319 - loss: 6.2796
Epoch 2: val_loss did not improve from 6.51761
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 71ms/step - accuracy: 0.0319 - loss: 6.2796 - val_accuracy: 0.0365 - val_loss: 6.6380 - learning_rate: 0.0010
Epoch 3/100
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 66ms/step - accuracy: 0.0274 - loss: 6.2401
Epoch 3: val_loss did not improve from 6.51761
[1m226/226[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 72ms/step - accuracy: 0.0274 - loss: 6

In [None]:
model.save('autocomplete_lstm_model.h5')


In [None]:
def predict_next_word(model, tokenizer, text, max_length):
    sequence = tokenizer.texts_to_sequences([text])[0]
    sequence = pad_sequences([sequence], maxlen=max_length - 1, padding='pre')
    predicted_index = model.predict(sequence).argmax(axis=1)[0]
    return tokenizer.index_word[predicted_index]

test_input = "Hi! What is your"
for i in range (10):
    pred=predict_next_word(model, tokenizer, test_input, max_length)

    test_input+=pred
    print("Prediction:",test_input)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
Prediction: Hi! What is yourto
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtoto
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
Prediction: Hi! What is yourtototo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtotototo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtototototo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtotototototo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtototototototo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtotototototototo
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Prediction: Hi! What is yourtototototototototo


In [None]:
import keras_tuner as kt
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Define the model for hyperparameter tuning
def build_model(hp):
    model = Sequential([
        Embedding(
            input_dim=vocab_size,
            output_dim=hp.Choice('embedding_dim', [500,1000,2000,4000]),
            input_length=max_length - 1
        ),
        LSTM(
            units=hp.Int('lstm_units1', min_value=128, max_value=512, step=64),
            return_sequences=True
        ),
        Dropout(hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)),
        LSTM(
            units=hp.Int('lstm_units2', min_value=256, max_value=512, step=64),
            return_sequences=True
        ),
        Dropout(hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)),
        LSTM(
            units=hp.Int('lstm_units3', min_value=128, max_value=512, step=64)
        ),
        Dropout(hp.Float('dropout3', min_value=0.1, max_value=0.5, step=0.1)),
        Dense(
            units=hp.Int('dense_units', min_value=64, max_value=256, step=64),
            activation='relu'
        ),
        Dense(vocab_size, activation='softmax')
    ])

    model.compile(
        loss='categorical_crossentropy',
        optimizer=Adam(learning_rate=hp.Choice('learning_rate', [1e-3, 5e-4, 1e-4])),
        metrics=['accuracy']
    )
    return model

# Initialize the tuner
tuner = kt.Hyperband(
    build_model,
    objective='val_loss',
    max_epochs=20,
    factor=3,
    directory='hyperparameter_tuning',
    project_name='autocomplete_lstm'
)

# Callback to stop early if no improvement
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

# Search for the best hyperparameters
tuner.search(X, y, epochs=5, validation_split=0.2, callbacks=[stop_early])

# Get the best hyperparameters
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

# Build the best model
best_model = tuner.hypermodel.build(best_hps)

# Train the best model
history = best_model.fit(
    X, y,
    epochs=100,
    batch_size=64,
    validation_split=0.2,
    callbacks=[stop_early]
)

# Save the best model
best_model.save('best_autocomplete_model.h5')


Trial 28 Complete [00h 01m 54s]
val_loss: 6.516904830932617

Best val_loss So Far: 6.367918014526367
Total elapsed time: 01h 02m 01s

Search: Running Trial #29

Value             |Best Value So Far |Hyperparameter
500               |1000              |embedding_dim
256               |128               |lstm_units1
0.3               |0.3               |dropout1
448               |256               |lstm_units2
0.4               |0.1               |dropout2
256               |384               |lstm_units3
0.1               |0.5               |dropout3
192               |256               |dense_units
0.001             |0.001             |learning_rate
20                |7                 |tuner/epochs
0                 |0                 |tuner/initial_epoch
0                 |1                 |tuner/bracket
0                 |0                 |tuner/round

Epoch 1/20
[1m452/452[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 50ms/step - accuracy: 0.0277 - loss: 6.7189 - val_a