<a href="https://colab.research.google.com/github/iavinashpathak/GhostWriter/blob/main/GhostWriter.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
# Toy training corpus: four short technology-themed sentences.
# Every sentence is later tokenized and expanded into next-word training pairs.
tech_data_raw = [
    "The future of artificial intelligence is machine learning.",
    "Neural networks are the architecture of deep learning.",
    "Python remains the top language for data science.",
    "Big data and cloud computing are highly scalable.",
]

In [3]:
# Quick sanity check of the corpus: its size and the first sentence.
print(
    f"Total sentences: {len(tech_data_raw)}",
    f"First sentence: {tech_data_raw[0]}",
    sep="\n",
)

Total sentences: 4
First sentence: The future of artificial intelligence is machine learning.


In [4]:
# Word-level tokenizer; words not seen during fitting map to the "<unk>" token.
tokenizer = Tokenizer(
    oov_token="<unk>",
)

In [5]:
# Build the word -> integer index vocabulary from the raw sentences.
tokenizer.fit_on_texts(texts=tech_data_raw)

In [6]:
# Convert each sentence into its list of integer word ids.
input_sequences = tokenizer.texts_to_sequences(texts=tech_data_raw)

In [7]:
# One extra slot on top of the fitted vocabulary: `vocab_size` is used below as
# both the embedding input size and the number of output classes, and the
# sequences are later padded with id 0.
vocab_size = 1 + len(tokenizer.word_index)

# Show the first sentence next to its integer encoding.
print(f"Original Text: {tech_data_raw[0]}")
print(f"Tokenized Sequence: {input_sequences[0]}")
print(f"Vocabulary Size: {vocab_size}")

Original Text: The future of artificial intelligence is machine learning.
Tokenized Sequence: [2, 7, 3, 8, 9, 10, 11, 4]
Vocabulary Size: 28


In [8]:
# Flat container for every n-gram prefix generated from the tokenized sentences.
input_sequences_flat = list()

In [9]:
# Expand every tokenized sentence into its growing prefixes (n-grams):
# a sentence of n tokens yields the prefixes of length 2, 3, ..., n.
# The list is rebuilt from scratch here (instead of appending to a list created
# in another cell) so that re-running this cell never duplicates samples.
input_sequences_flat = [
    token_ids[:end]
    for token_ids in input_sequences
    for end in range(2, len(token_ids) + 1)
]

In [10]:
# Report how many n-gram prefixes were generated and inspect one of them.
# The label states the real index and token count of the printed example
# rather than claiming a fixed four-word n-gram.
example_index = 5
example_ngram = input_sequences_flat[example_index]
print(f"Total N-gram sequences created: {len(input_sequences_flat)}")
print(f"Example N-gram (index {example_index}, {len(example_ngram)} tokens): {example_ngram}")

Total N-gram sequences created: 28
Example N-gram (w1, w2, w3, w4): [2, 7, 3, 8, 9, 10, 11]


In [11]:
# Longest n-gram prefix; every sequence is padded to this length below.
max_sequence_len = max(map(len, input_sequences_flat))
print(f"Max Sequence Length: {max_sequence_len}")

Max Sequence Length: 8


In [12]:
# Re-displays the padding length computed in the previous cell.
print(f"Max Sequence Length: {max_sequence_len}")

Max Sequence Length: 8


In [13]:
# Left-pad every n-gram so all rows share the same length; the final token of
# each row therefore always sits in the last column.
padded_sequences = pad_sequences(
    sequences=input_sequences_flat,
    maxlen=max_sequence_len,
    padding='pre',
)

In [14]:
# Split each padded row: everything except the last column is the input context,
# the last column is the next word to predict.
X, Y_raw = padded_sequences[:, :-1], padded_sequences[:, -1]

In [15]:
# One-hot encode the target word ids over the whole vocabulary.
Y = to_categorical(
    Y_raw,
    num_classes=vocab_size,
)

In [16]:
# Summarize the prepared tensors: their shapes plus one example input/target pair.
summary_lines = [
    "-" * 30,
    "Data Preparation Summary:",
    f"Padded Input Shape (X): {X.shape}",
    f"Target Output Shape (Y): {Y.shape}",
    f"Example X (Input sequence): {X[5]}",
    f"Example Y (Target one-hot vector): {Y[5]}",
]
print("\n".join(summary_lines))

------------------------------
Data Preparation Summary:
Padded Input Shape (X): (28, 7)
Target Output Shape (Y): (28, 28)
Example X (Input sequence): [ 0  2  7  3  8  9 10]
Example Y (Target one-hot vector): [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0.]


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

In [18]:
# Model hyperparameters.
EMBEDDING_DIM = 100        # size of each learned word vector
LSTM_UNITS = 128           # hidden units in each LSTM layer
INPUT_LENGTH = X.shape[1]  # number of context tokens fed to the model per sample

# Seed Python, NumPy and TensorFlow in one call so that weight initialization,
# dropout and batch shuffling (and therefore the reported accuracy) are
# repeatable across "Restart & Run All".
# NOTE(review): some GPU kernels can still be non-deterministic; confirm whether
# bit-exact runs are required.
SEED = 42
tf.keras.utils.set_random_seed(SEED)

In [19]:
# Next-word prediction network: embedding -> two stacked LSTMs -> softmax over
# the vocabulary.
# The input shape is declared with an explicit Input instead of the deprecated
# `input_length` argument of Embedding; this also builds the model immediately,
# so `model.summary()` can report output shapes and parameter counts before
# training.
model = Sequential([
    tf.keras.Input(shape=(INPUT_LENGTH,)),

    # Embedding Layer: maps each word id to a dense EMBEDDING_DIM vector.
    Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM),

    # Stacked LSTM Layers: the first returns the full sequence so the second
    # LSTM receives one vector per time step.
    LSTM(LSTM_UNITS, return_sequences=True),
    Dropout(0.2),
    LSTM(LSTM_UNITS),
    Dropout(0.2),

    # Output Layer: probability distribution over every word id.
    Dense(vocab_size, activation='softmax'),
])



In [20]:
# Targets are one-hot vectors, hence categorical cross-entropy; accuracy is
# tracked so it can be read back from the training history.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [22]:
# Train on the full n-gram set; per-epoch metrics are kept in `history`.
history = model.fit(
    x=X,
    y=Y,
    batch_size=32,
    epochs=50,
    verbose=1,
)

Epoch 1/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.0000e+00 - loss: 3.3327
Epoch 2/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1429 - loss: 3.3275
Epoch 3/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.2143 - loss: 3.3218
Epoch 4/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.2143 - loss: 3.3160
Epoch 5/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.1786 - loss: 3.3074
Epoch 6/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.2143 - loss: 3.3035
Epoch 7/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1429 - loss: 3.2897
Epoch 8/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.2143 - loss: 3.2765
Epoch 9/50
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m 

In [23]:
# Training accuracy recorded for the last epoch of the run above.
accuracy_per_epoch = history.history['accuracy']
final_accuracy = accuracy_per_epoch[-1]
print(f"\nFinal Training Accuracy: {final_accuracy:.4f}")


Final Training Accuracy: 0.5357


In [24]:
import pickle

In [25]:
# Persist the trained network to disk so it can be reloaded without retraining.
model_file_path = 'model.h5'
model.save(filepath=model_file_path)
print(f"Model successfully saved to {model_file_path}")



Model successfully saved to model.h5


In [26]:
# Persist the fitted tokenizer next to the model: the same word -> id mapping is
# needed to encode text for the saved network.
tokenizer_file_path = 'tokenizer.pickle'
with open(tokenizer_file_path, 'wb') as tokenizer_file:
    pickle.dump(
        tokenizer,
        tokenizer_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
print(f"Tokenizer successfully saved to {tokenizer_file_path}")

Tokenizer successfully saved to tokenizer.pickle
