In [1]:
import os
import random
from tqdm import tqdm

def get_random_text(text, chunk_size=200):
    start_index = random.randint(0, len(text) - chunk_size)
    return text[start_index:start_index + chunk_size]

def sample_scripts(input_folder, output_file):
    with open(output_file, 'w', encoding='utf-8') as output:
        for filename in tqdm(os.listdir(input_folder)):
            if filename.endswith(".txt"):
                file_path = os.path.join(input_folder, filename)
                
                with open(file_path, 'r', encoding='utf-8') as file:
                    file_content = file.read()
                    #random_text = get_random_text(file_content)
                    #output.write(random_text + '\n')
                    output.write(file_content + '\n')

input_folder = "movie_scripts"
output_file = "samplerv2.txt" 

sample_scripts(input_folder, output_file)

100%|██████████| 1092/1092 [00:57<00:00, 19.09it/s]


In [13]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

def load_doc(filename):
	file = open(filename, 'r')
	text = file.read()
	file.close()
	return text

doc = load_doc('bladeTrilogy.txt')
lines = doc.split('\n')

tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)
sequences = pad_sequences(sequences)
unique_words_count = len(tokenizer.word_index) + 1

sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=unique_words_count)
seq_length = X.shape[1]

model = Sequential()
model.add(Embedding(len(tokenizer.word_index) + 1, 50, input_length=seq_length))
model.add(LSTM(100, return_sequences=True))
model.add(LSTM(100))
model.add(Dense(100, activation='relu'))
model.add(Dense(unique_words_count, activation='softmax'))
print(model.summary())

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.fit(X, y, batch_size=128, epochs=250)

model.save('model2.h5')
dump(tokenizer, open('tokenizer2.pkl', 'wb'))

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 44, 50)            347350    
                                                                 
 lstm_4 (LSTM)               (None, 44, 100)           60400     
                                                                 
 lstm_5 (LSTM)               (None, 100)               80400     
                                                                 
 dense_4 (Dense)             (None, 100)               10100     
                                                                 
 dense_5 (Dense)             (None, 6947)              701647    
                                                                 
Total params: 1199897 (4.58 MB)
Trainable params: 1199897 (4.58 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/250

In [14]:
import numpy as np
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
	result = list()
	in_text = seed_text
	for _ in range(n_words):
		encoded = tokenizer.texts_to_sequences([in_text])[0]
		encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre', padding='post')
		y_pred = model.predict(encoded, verbose=0)
		predicted_index = np.argmax(y_pred)
		out_word = ''
		for word, index in tokenizer.word_index.items():
			if index == predicted_index:
				out_word = word
				break
		in_text += ' ' + out_word
		result.append(out_word)
	return ' '.join(result)

in_filename = 'samplerv1.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1

model = load_model('model.h5')

tokenizer = load(open('tokenizer.pkl', 'rb'))

seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

generated = generate_seq(model, tokenizer, 44, seed_text, 1000)
print(generated)

 Kenny glances to the door. He considers for a long moment. KENNY The friend. Kenny breaks the gaze. He begins to whistle again. The CAMERA drifts away, finding the far DOOR to the inner office, Kenny

fatma                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                         