In [121]:
import random
import numpy as np
import os
from keras.layers import Dense, Embedding, Flatten, Dropout
from keras.models import Sequential
from keras.activations import sigmoid
from keras.metrics import Accuracy
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from gtts import gTTS
from time import sleep
import pickle

In [None]:
#Returns the lengths of each file for the name generator
# Builds `length_dict`, mapping every file name found in the "curate"
# directory to the number of lines that file contains.
length_dict = {}
for file in os.listdir("curate"):
    path = os.path.join("curate", file)
    # Open each file in a `with` block so every handle is closed; a single
    # close() after the loop would only release the last file opened.
    with open(path) as f:
        length_dict[file] = len(f.readlines())

In [None]:
#Name Generator
# Draws `num_names` distinct random line numbers across all files counted in
# `length_dict` and stores the first two comma-separated fields of each
# chosen line in `train_names`.
train_names = []
num_names = 100000
file_names = list(length_dict.keys())
file_lengths = list(length_dict.values())
# Derive the sampling range from the measured line counts instead of a
# hard-coded total. Summed with an explicit loop rather than the built-in
# `sum`, which notebook cells may shadow with a counter variable.
total_lines = 0
for file_length in file_lengths:
    total_lines += file_length
name_indices = random.sample(range(total_lines), num_names)
name_indices.sort()
file_pos = -1   # position (in length_dict order) of the file held in `current`
lines_end = 0   # global line number one past the last line of that file
current = []
for index in name_indices:
    # Indices are sorted, so the files are walked forward exactly once.
    # Comparing with `>=` also loads the first file when index 0 is drawn.
    while index >= lines_end:
        file_pos += 1
        with open(os.path.join("curate", file_names[file_pos])) as f:
            current = f.readlines()
        lines_end += file_lengths[file_pos]
    # Translate the global line number into an offset inside the current file.
    line_in_file = index - (lines_end - file_lengths[file_pos])
    train_names.append(current[line_in_file].split(",")[:2])

In [None]:
#Word Generator
# Picks `num_words` distinct random lines from english.txt as `train_words`.
train_words = []
num_words = 200000
with open("english.txt") as f:
    lines = f.readlines()
# Sample over the lines actually read instead of a hard-coded line count.
word_indices = random.sample(range(len(lines)), num_words)
for index in word_indices:
    # Strip only the newline; slicing off the last character would truncate
    # a final line that has no trailing newline.
    train_words.append(lines[index].rstrip("\n"))

In [None]:
#Mixing Datasets
# Label 1 marks a name entry (two entries per sampled name), label 0 a word.
train_labels = np.concatenate(
    (np.ones(2*num_names), np.zeros(num_words))
)
# Flatten the two fields of every sampled name, then append the plain words.
train_inputs = [part for item in train_names for part in (item[0], item[1])]
train_inputs += train_words
# Pair each label with its text row-wise so a single shuffle keeps them aligned.
train_df = np.stack((train_labels, train_inputs), axis=-1)
np.random.shuffle(train_df)
# Split the shuffled pairs back into a float label vector and a text array.
train_labels = train_df[:, 0].astype(float)
train_inputs = np.array(train_df[:, 1].tolist())

In [None]:
#Tokenization
maxlen = 4
# Cap the vocabulary so texts_to_sequences only emits token ids below 10000,
# matching the Embedding(10000, ...) layer built in the Model cell; an
# unbounded vocabulary produces ids outside the embedding table.
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_inputs)
# pad_sequences already returns a NumPy array, so no extra np.array() wrap.
train_inputs = pad_sequences(tokenizer.texts_to_sequences(train_inputs), maxlen=maxlen)

In [None]:
#Model
# Embedding -> Flatten -> Dense(64) -> Dropout(0.2) -> one sigmoid output unit.
model = Sequential([
    Embedding(10000, 16, input_length=maxlen),
    Flatten(),
    Dense(64),
    Dropout(0.2),
    Dense(1, activation=sigmoid),
])

model.compile(loss="binary_crossentropy", optimizer="adam")

# Train on the tokenized inputs and their 0/1 labels for five epochs.
history = model.fit(train_inputs, train_labels, epochs=5)

In [None]:
#Validation on Words
# Prints how many lines of validation.txt the model classifies as a name.
with open("validation.txt") as f:
    # Strip only the newline so a last line without one is not truncated.
    samples = [line.rstrip("\n") for line in f]
# Named `val_inputs` rather than `input` to keep the built-in usable.
val_inputs = pad_sequences(tokenizer.texts_to_sequences(samples), maxlen=maxlen)
# Round each predicted probability to a 0/1 decision and take the single
# output column of the model.
out = np.round(model.predict(val_inputs))[:, 0]
# Count over every sample that was read instead of a fixed 1000, which
# raised IndexError on shorter files and ignored extra lines on longer ones.
num_predicted_names = int(np.count_nonzero(out == 1))
print(num_predicted_names)

In [None]:
#AI TTS
# Speaks the model's confidence that `string` is a name.
string = ""
encoded = pad_sequences(tokenizer.texts_to_sequences([string]), maxlen=maxlen)
# Index down to the scalar before float(): converting a one-element array
# with float() is deprecated in NumPy >= 1.25.
number = float(model.predict(encoded)[0][0])
text = f'I am {round(number*100)} percent sure that {string} is a name.'

tts = gTTS(text=text, lang="en")

tts.save("predict.mp3")

# NOTE(review): presumably relies on the macOS `open` command to start
# playback; confirm before running on another platform.
os.system("open predict.mp3")

# Wait before deleting the file so playback has time to start.
sleep(4)

# Remove the temporary audio file without shelling out to `rm`.
os.remove("predict.mp3")

In [None]:
#Save Weights
# Persist the weights of the current `model` under the name 'weights2'.
model.save_weights(filepath='weights2')

In [None]:
#Load Weights
# Restore previously saved weights into the current `model`.
# NOTE(review): 'weights1' presumably belongs to model code 1 in the notes
# below (Embedding 32 / Dense 128), while the Model cell builds variant 2 -
# confirm the architecture matches before loading.
model.load_weights(filepath='weights1')

'''
Model Code:
1 - Embedding Layer Approach:
model = Sequential()
model.add(Embedding(10000, 32, input_length=maxlen))
model.add(Flatten())
model.add(Dense(128))
model.add(Dense(1, activation=sigmoid))
2 - Same as above, but counters overfitting:
model = Sequential()
model.add(Embedding(10000, 16, input_length=maxlen))
model.add(Flatten())
model.add(Dense(64))
model.add(Dropout(0.2))
model.add(Dense(1, activation=sigmoid))
'''

In [122]:
#Save Tokenizer
# Serialize the fitted tokenizer with the highest pickle protocol and write
# the bytes to tokenizer.pickle so it can be reused with the saved weights.
with open('tokenizer.pickle', 'wb') as f:
    f.write(pickle.dumps(tokenizer, protocol=pickle.HIGHEST_PROTOCOL))