# You can visit the deployed model here:
## https://beyond-far-away.herokuapp.com

### It takes a few seconds to load >.<

In [1]:
import numpy as np
import pandas as pd
import re
import pickle
import nltk
import string

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, Embedding, LSTM, Bidirectional,Flatten,Dropout
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical


## Define cleaning functions

In [2]:
def remove_punctuation(text):
    """Return ``text`` with every ``string.punctuation`` character removed.

    The remaining characters keep their original order; characters that are
    not listed in ``string.punctuation`` are passed through unchanged.
    """
    kept = []
    for ch in text:
        if ch in string.punctuation:
            continue
        kept.append(ch)
    return "".join(kept)

## Load Data

In [3]:
# Dataset with lines from Star Wars and Star Trek movies.
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv('wars_trek_random.csv', encoding='ISO-8859-1')

## Cast the `Input` column to strings to avoid type conflicts

In [4]:
# Force every value in 'Input' to be a str so the string-based cleaning and
# tokenizing steps that follow can treat each row the same way.
df['Input'] = df['Input'].astype(str)

## Remove punctuation

In [5]:
# Build the 'clean' column by stripping punctuation from every 'Input' line,
# then display the resulting column.
df['clean'] = df['Input'].apply(remove_punctuation)
df['clean']

0        turmoil has engulfed the galactic republic the...
1                     outlaying star systems is in dispute
2        hoping to resolve the matter with a blockade o...
3        greedy trade federation has stopped all shippi...
4                                                    naboo
                               ...                        
18068                       new civilizations to boldly go
18069                         where no man has gone before
18070     she is moving out now passing camera and heading
18071    toward the distant stars she is beautiful and ...
18072    are beautiful and as she slowly disappears fro...
Name: clean, Length: 18073, dtype: object

## Tokenize each row into a list of words

In [6]:
# NLTK regex tokenizer: each run of word characters (\w+) becomes one token.
tokenizer1 = RegexpTokenizer(r'\w+')

In [7]:
# Lower-case each cleaned line and split it into a list of word tokens,
# overwriting 'clean' in place; the bare expression displays the result.
df['clean'] = df['clean'].apply(lambda x: tokenizer1.tokenize(x.lower()))
df['clean']

0        [turmoil, has, engulfed, the, galactic, republ...
1              [outlaying, star, systems, is, in, dispute]
2        [hoping, to, resolve, the, matter, with, a, bl...
3        [greedy, trade, federation, has, stopped, all,...
4                                                  [naboo]
                               ...                        
18068                 [new, civilizations, to, boldly, go]
18069                  [where, no, man, has, gone, before]
18070    [she, is, moving, out, now, passing, camera, a...
18071    [toward, the, distant, stars, she, is, beautif...
18072    [are, beautiful, and, as, she, slowly, disappe...
Name: clean, Length: 18073, dtype: object

## Map words to integer ids with the Keras tokenizer

In [8]:
# Keras tokenizer that turns text into integer word ids: it lower-cases the
# text, splits on spaces, and restricts encoding to the most frequent words
# (num_words=5000).
tokenizer = Tokenizer(num_words=5000,lower=True,split=' ')

In [9]:
# Learn the word -> index vocabulary from the raw 'Input' text.
# NOTE(review): this fits on 'Input', not on the 'clean' column prepared
# earlier in the notebook -- confirm that is intended.
tokenizer.fit_on_texts(df['Input'])

## Split the data

In [10]:
# Shuffle all rows (frac=1 samples the whole frame); the fixed random_state
# makes the shuffle reproducible.
df = df.sample(frac = 1, random_state=13)

In [11]:
# Labels come straight from the 'Value' column.
Y = df['Value']

# Encode every line as a sequence of integer word ids, padded/truncated to 500.
X = pad_sequences(tokenizer.texts_to_sequences(df['Input']), maxlen=500)

# One more than the number of known words: index 0 is never assigned to a word.
vocab_size = len(tokenizer.word_index) + 1

In [12]:
# Hold out 20% of the samples as (X_test, Y_test); the remaining 80% stays in
# x / y for the next split. random_state fixes the partition.
x, X_test, y, Y_test = train_test_split(X, Y, test_size=0.2, random_state = 24)

In [13]:
# Split the remaining pool again: 90% for training, 10% as (X_eval, Y_eval).
X_train, X_eval, Y_train, Y_eval = train_test_split(x, y, test_size=0.1, random_state = 24)

In [14]:
# Persist the fitted tokenizer so the same word -> id mapping can be reloaded
# later without refitting.
with open('tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [15]:
# Binary classifier: embedding -> bidirectional LSTM -> dropout -> sigmoid unit.
model = Sequential()
# Map each word id to a 50-dimensional vector; inputs are sequences of length 500.
model.add(Embedding(vocab_size, 50, input_length=500)) 
model.add(Bidirectional(LSTM(128))) # 128 LSTM units per direction; both directions together give 256 outputs
model.add(Dropout(0.5)) # randomly drops half of the activations during training to reduce overfitting
model.add(Dense(1,activation='sigmoid')) # single sigmoid unit: squashes the output into the range (0, 1)
model.compile(optimizer='adam',loss='binary_crossentropy', metrics=['accuracy']) # Adam optimizer, binary cross-entropy loss, accuracy reported as a metric
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 500, 50)           82900     
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               183296    
_________________________________________________________________
dropout (Dropout)            (None, 256)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 257       
Total params: 266,453
Trainable params: 266,453
Non-trainable params: 0
_________________________________________________________________


## Train the model

In [17]:
# Train for 4 epochs in mini-batches of 128 samples.
# validation_data is passed as an (x, y) tuple, the form Keras documents;
# a list is not reliably unpacked into inputs and targets across versions.
# NOTE(review): the 20% X_test/Y_test split is what gets monitored per epoch
# here -- confirm that is the intended validation set.
history=model.fit(X_train, Y_train, batch_size=128, epochs=4, validation_data=(X_test, Y_test))

## Evaluate the model

In [None]:
# Score the trained model on the X_eval / Y_eval split. Given how the model
# was compiled, the result holds the loss followed by the accuracy.
results = model.evaluate(X_eval, Y_eval, verbose = 1)

In [None]:
# Write the trained model to 'stars15.h5' (HDF5 format, per the .h5 extension).
model.save('stars15.h5')

In [18]:
from keras.models import load_model

In [19]:
# Load the model back from 'stars15.h5'; from here on `model` refers to the
# reloaded copy rather than the in-memory one that was trained.
model = load_model('stars15.h5')

In [20]:
# Sample quotes used to spot-check the classifier.
lst = [
    'may the force be with you',
    'beam me up scotty',
    'do or do not there is no try',
    'logic is the beginning of wisdom not the end',
    'never tell me the odds'
]
for quote in lst:
    # Encode the quote with the fitted tokenizer and pad it to the model's input length.
    encoded = pad_sequences(tokenizer.texts_to_sequences([quote]), maxlen=500)
    prediction = model.predict(encoded)[0][0]

    # Scores above 0.6 are reported as Star Wars, below 0.4 as Star Trek;
    # anything in between is treated as undecided.
    if prediction > 0.6:
        verdict = 'Star Wars with {:.2f}% confidence'.format(prediction*100)
    elif prediction < 0.4:
        verdict = 'Star Trek with {:.2f}% confidence'.format((1-prediction)*100)
    else:
        verdict = 'Not sure!'
    print(verdict)


Star Wars with 99.99% confidence
Star Trek with 99.90% confidence
Star Wars with 99.60% confidence
Star Trek with 99.75% confidence
Star Wars with 99.95% confidence
