In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

import argparse

import numpy as np
import pandas as pd

import re

import pickle as pkl

import sys

import tensorflow as tf
from tensorflow import keras

import autokeras as ak # !pip install autokeras

In [2]:
def is_interactive():
    """Tell whether the code is running without a backing script file.

    Returns:
        bool: True when the ``__main__`` module has no ``__file__``
        attribute, False otherwise.
    """
    import __main__
    has_script_file = hasattr(__main__, '__file__')
    return not has_script_file

# print('interactive? ', is_interactive())

In [3]:
out_dir = 'out' # folder containing the model and other files needed for classification

# Load the (classes, aux_data) pair stored next to the model.
# The context manager closes the file handle as soon as unpickling is done.
# NOTE: unpickling can execute arbitrary code -- only load aux_data.pkl from a trusted source.
with open(f'{out_dir}/aux_data.pkl', 'rb') as aux_file:
    classes, aux_data = pkl.load(aux_file)

In [4]:
# Command-line interface: a single positional argument, `filename`.
# The epilog shown in the help text lists the genres held in `classes`.
parser = argparse.ArgumentParser(
    description='Movie Classifier',
    epilog=f"Classifies movies into {len(classes)} genres: {', '.join(classes)}",
    add_help=True,
)

parser.add_argument('filename')

_StoreAction(option_strings=[], dest='filename', nargs=None, const=None, default=None, type=None, choices=None, required=True, help=None, metavar=None)

In [5]:
# Without a script file there is no command line to read, so fall back to a
# fixed sample file; otherwise use the arguments after the program name.
args = ['test.txt'] if is_interactive() else sys.argv[1:]

prs = parser.parse_args(args)

# Read the whole file named on the command line as text.
with open(prs.filename, 'rt') as f:
    input_text = f.read()

In [6]:
def _make_cleaner(pattern, replacement):
    """Build a one-argument function that applies ``re.sub(pattern, replacement, text)``."""
    def _apply(text):
        return re.sub(pattern, replacement, text)
    return _apply


# Named text-cleaning steps; every value takes a string and returns a string.
cleaners = {
    # remove whole tokens containing a character other than letters, digits, whitespace, '.', ',' or '-'
    'text_num': _make_cleaner(r'\S*[^a-zA-Z\s\.\,0-9-]+\S*', ''),
    # remove whole tokens containing at least one uppercase ASCII letter
    'caps_words': _make_cleaner(r'\S*[A-Z]+\S*', ''),
    # remove runs of characters that are neither word characters nor whitespace
    'punct': _make_cleaner(r'[^\w\s]+', ''),
    # remove runs of ASCII digits
    'numbers': _make_cleaner(r'[0-9]+', ''),
    # collapse every run of whitespace into a single space
    'spaces': _make_cleaner(r'\s+', ' '),
    # drop one whitespace character sitting directly before a comma
    'space_comma': _make_cleaner(r'\s,', ','),
    # drop one whitespace character sitting directly before a period
    'space_period': _make_cleaner(r'\s\.', '.'),
}

In [7]:
def clean(text):
    """Apply a fixed sequence of entries from ``cleaners`` to ``text``.

    The steps run in this order: 'text_num', 'spaces', 'space_comma',
    'space_period'. Each step receives the output of the previous one.

    Args:
        text: the string to clean.

    Returns:
        The string produced by the last step.
    """
    steps = ('text_num', 'spaces', 'space_comma', 'space_period')
    cleaned = text
    for step in steps:
        cleaned = cleaners[step](cleaned)
    return cleaned

In [8]:
# Run the text read from the input file through the clean() pipeline.
cleaned_input_text = clean(input_text)

In [9]:
# Load the saved model from `<out_dir>/tf_model`, handing AutoKeras'
# CUSTOM_OBJECTS mapping to the Keras loader.
tf_model = keras.models.load_model(f'{out_dir}/tf_model', custom_objects=ak.CUSTOM_OBJECTS)

In [10]:
# Predict on a one-element batch holding the cleaned text and keep the first
# entry of the result. Presumably one score per entry of `classes` -- TODO confirm.
pred = tf_model.predict([cleaned_input_text], verbose=0)[0]

In [11]:
# Boolean mask over `pred`: True where a score, divided by the highest score,
# exceeds 0.95 (i.e. the top score and any score within 5% of it).
mask = pred/pred.max() > 0.95

### Choosing the 5 most similar movies based on the cosine similarity (> 0.9) of their predictions.

In [12]:
def cossim(a, b):
    """Cosine similarity: dot product of ``a`` and ``b`` over the product of their norms."""
    return np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))


# Similarities are stored negated so that an ascending argsort puts the
# most similar stored prediction first.
sim = np.array([-cossim(stored_pred, pred) for stored_pred in aux_data['pred']])

# Positions of the five most similar stored predictions, best first.
top_five = np.argsort(sim)[:5]

# Of those five, keep only the ones whose cosine similarity exceeds 0.9.
idx = top_five[-sim[top_five] > 0.9]

In [13]:
# Scores of the selected classes, two decimals each, separated by '/'.
scores = '/'.join(f'{score:.2f}' for score in pred[mask])

In [14]:
# Header: the selected genre name(s) with their scores, then the list heading.
out_text = f"\nGenre: {'/'.join(classes[mask])} ({scores})\nSimilar Movies:\n"

# One numbered line per similar movie: name, year and '/'-joined genres.
out_text += ''.join(
    f"{rank}. {aux_data['name'][movie]} ({aux_data['year'][movie]}) {'/'.join(aux_data['genres'][movie])}\n"
    for rank, movie in enumerate(idx, start=1)
)

print(out_text)


Genre: Comedy (0.81)
Similar Movies:
1. Josie and the Pussycats (2001) Comedy/Musical
2. Drillbit Taylor (2008) Comedy
3. Jay and Silent Bob Strike Back (2001) Comedy
4. Bio-Dome (1996) Comedy
5. Still Smokin' (1983) Comedy

