In [1]:
import warnings
from IPython.core.interactiveshell import InteractiveShell

# Reference notebook (this code appears to be adapted from it — confirm):
# https://github.com/WillKoehrsen/recurrent-neural-networks/blob/master/notebooks/Deep%20Dive%20into%20Recurrent%20Neural%20Networks.ipynb
# Display the value of every bare expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# Silence RuntimeWarning messages for the rest of the session.
warnings.filterwarnings('ignore', category=RuntimeWarning)

# Run settings. Only TRAINING_LENGTH is referenced by the cells visible in this
# part of the notebook; the remaining values are presumably consumed by later
# model-building / training cells — TODO confirm their exact use there.
RANDOM_STATE = 50
EPOCHS = 100
BATCH_SIZE = 2048
# Number of feature tokens per training example (passed to make_sequences).
TRAINING_LENGTH = 20
TRAIN_FRACTION = 0.7
LSTM_CELLS = 16
VERBOSE = 1
SAVE_MODEL = True

In [2]:
import pandas as pd
import numpy as np
import pickle


In [3]:
# Load the four source datasets (anime, books, movies, K-dramas) from CSV files
# located in the ./data directory relative to the notebook.
anime_data = pd.read_csv("./data/anime_dataset.csv")
book_data = pd.read_csv("./data/book_dataset.csv")
movie_data = pd.read_csv("./data/movie_dataset.csv")
kdrama_data = pd.read_csv("./data/kdrama_dataset.csv")
# Row count of each frame. All four values are displayed because
# InteractiveShell.ast_node_interactivity is set to 'all' in the first cell.
len(anime_data)
len(book_data)
len(movie_data)
len(kdrama_data)
# Preprocessing of these frames is done in the cells that follow.

17002

4657

10178

100

In [4]:
anime_data.head(1)

Unnamed: 0,Anime_id,Title,Genre,Synopsis,Type,Producer,Studio,Rating,ScoredBy,Popularity,Members,Episodes,Source,Aired,Link
0,1,Cowboy Bebop,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...","In the year 2071, humanity has colonized sever...",TV,['Bandai Visual'],['Sunrise'],8.81,363889.0,39.0,704490.0,26.0,Original,"Apr 3, 1998 to Apr 24, 1999",https://myanimelist.net/anime/1/Cowboy_Bebop


In [75]:
book_data.head(1)

Unnamed: 0,index,title,genre,summary
0,0,Drowned Wednesday,fantasy,Drowned Wednesday is the first Trustee among ...


In [5]:
movie_data.head(1)

Unnamed: 0,names,date_x,score,genre,overview,crew,orig_title,status,orig_lang,budget_x,revenue,country
0,Creed III,03/02/2023,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616668.0,AU


In [None]:
movie_data_genre_name = "genre"

In [77]:
kdrama_data.head(1)

Unnamed: 0,Name,Year of release,Aired Date,Aired On,Number of Episode,Network,Duration,Content Rating,Synopsis,Cast,Genre,Tags,Rank,Rating
0,Move to Heaven,2021,"May 14, 2021",Friday,10,Netflix,52 min.,18+ Restricted (violence & profanity),Geu Roo is a young autistic man. He works for ...,"Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju...","Life, Drama, Family","Autism, Uncle-Nephew Relationship, Death, Sava...",#1,9.2


## Preprocess data: drop rows with no synopsis

In [78]:
# Name of the free-text description column in each dataset. The column headers
# differ between the source CSVs (see the .head(1) previews), so each dataset
# gets its own constant.
anime_synopsis_column_name = "Synopsis"
book_synopsis_column_name = "summary"
movie_synopsis_column_name = "overview"
kdrama_synopsis_column_name = "Synopsis"

In [79]:
anime_data[anime_synopsis_column_name].isnull().sum()

1419

In [80]:
book_data[book_synopsis_column_name].isnull().sum()

0

In [81]:
movie_data[movie_synopsis_column_name].isnull().sum()

0

In [82]:
kdrama_data[kdrama_synopsis_column_name].isnull().sum()

0

In [83]:
# Remove every row whose synopsis is missing.
# `dropna(subset=[...])` filters on the null mask of that one column, so only
# the rows that are actually null are removed (dropping by index label would
# also remove non-null rows that happen to share a label with a null row).
# Rebinding the name instead of using `inplace=True` keeps the cell idempotent;
# the original row labels are preserved, exactly as with `drop`.
anime_data = anime_data.dropna(subset=[anime_synopsis_column_name])
book_data = book_data.dropna(subset=[book_synopsis_column_name])
movie_data = movie_data.dropna(subset=[movie_synopsis_column_name])
kdrama_data = kdrama_data.dropna(subset=[kdrama_synopsis_column_name])


In [84]:
print(anime_data[anime_synopsis_column_name].isnull().sum(), book_data[book_synopsis_column_name].isnull().sum(), movie_data[movie_synopsis_column_name].isnull().sum(), kdrama_data[kdrama_synopsis_column_name].isnull().sum())

0 0 0 0


# Gather all synopses into one list

In [85]:
# Pull the description column of each dataset out as a plain Python list,
# in the fixed order anime -> book -> movie -> kdrama.
anime_synopsis, book_synopsis, movie_synopsis, kdrama_synopsis = (
    list(frame[column])
    for frame, column in (
        (anime_data, anime_synopsis_column_name),
        (book_data, book_synopsis_column_name),
        (movie_data, movie_synopsis_column_name),
        (kdrama_data, kdrama_synopsis_column_name),
    )
)

# Single combined corpus; the per-dataset order above is preserved.
original_synopsis = [
    *anime_synopsis,
    *book_synopsis,
    *movie_synopsis,
    *kdrama_synopsis,
]

In [86]:
len(original_synopsis)

30518

# Convert Text to Sequences

In [87]:
from keras.preprocessing.text import Tokenizer

In [88]:
import re


def format_patent(patent):
    """Normalise punctuation spacing and strip numeric references from a text.

    Three regex passes are applied in order:

    1. A space is inserted *before* each of ``. , ; ?`` when the preceding
       character is neither whitespace nor a digit. This makes the punctuation
       mark a separate whitespace-delimited token while leaving numbers such
       as ``3.14`` or ``1,000`` untouched.
    2. Parenthesised integers such as ``(12)`` are removed.
    3. Every run of two or more whitespace characters is collapsed into a
       single space.

    Args:
        patent (str): The raw text to clean.

    Returns:
        str: The cleaned text.
    """

    # Put a space in front of sentence punctuation (not after it)
    patent = re.sub(r'(?<=[^\s0-9])(?=[.,;?])', r' ', patent)

    # Remove parenthesised numeric references, e.g. "(12)"
    patent = re.sub(r'\((\d+)\)', r'', patent)

    # Collapse whitespace runs of any length. Matching exactly two characters
    # (r'\s\s') would only halve a run, leaving e.g. two spaces out of three.
    patent = re.sub(r'\s{2,}', ' ', patent)
    return patent


In [89]:
# Clean every collected synopsis with format_patent, keeping the input order
formatted = [format_patent(synopsis) for synopsis in original_synopsis]

len(formatted)
print(formatted[0])

30518

In the year 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of planet Earth behind . The Inter Solar System Police attempts to keep peace in the galaxy , aided in part by outlaw bounty hunters , referred to as "Cowboys ." The ragtag team aboard the spaceship Bebop are two such individuals .   
Mellow and carefree Spike Spiegel is balanced by his boisterous , pragmatic partner Jet Black as the pair makes a living chasing bounties and collecting rewards . Thrown off course by the addition of new members that they meet in their travels—Ein , a genetically engineered , highly intelligent Welsh Corgi ; femme fatale Faye Valentine , an enigmatic trickster with memory loss ; and the strange computer whiz kid Edward Wong—the crew embarks on thrilling adventures that unravel each member's dark and mysterious past little by little .   Well-balanced with high density action and light-hearted comedy , Cowboy Bebop is a space W

In [90]:
def make_sequences(texts,
                   training_length=50,
                   lower=True,
                   filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                   min_extra_tokens=20,
                   tokenizer_path='tokenizer.pickle'):
    """Turn a set of texts into next-word training sequences of integers.

    A Keras ``Tokenizer`` is fitted on ``texts``, every text is converted to a
    list of integer token ids, short texts are discarded, and a sliding window
    of ``training_length`` tokens is moved over each remaining text. Each
    window becomes one feature list and the token right after it becomes the
    label.

    Args:
        texts: Sequence of strings to tokenize.
        training_length (int): Number of tokens in each feature window.
        lower (bool): Passed to ``Tokenizer``; lower-case the text if True.
        filters (str): Passed to ``Tokenizer``; characters stripped from the
            text before splitting.
        min_extra_tokens (int): A text is kept only if it has strictly more
            than ``training_length + min_extra_tokens`` tokens. The default of
            20 reproduces the previously hard-coded threshold.
        tokenizer_path (str or None): File the fitted tokenizer is pickled to
            so it can be reused later (e.g. to tokenize user input). Pass
            ``None`` to skip writing the file.

    Returns:
        tuple: ``(word_idx, idx_word, num_words, word_counts, new_texts,
        new_sequences, training_seq, labels, tokenizer)`` where

        * ``word_idx`` / ``idx_word`` are the tokenizer's word->index and
          index->word look-ups,
        * ``num_words`` is ``len(word_idx) + 1``,
        * ``word_counts`` is the tokenizer's per-word occurrence count,
        * ``new_texts`` / ``new_sequences`` are the texts (and their token id
          lists) that passed the length filter,
        * ``training_seq`` is the list of feature windows and ``labels`` the
          list of matching next-token ids,
        * ``tokenizer`` is the fitted ``Tokenizer`` itself.
    """

    # Create the tokenizer object and train it on the texts
    tokenizer = Tokenizer(lower=lower, filters=filters)
    tokenizer.fit_on_texts(texts)

    # Look-up dictionaries and reverse look-ups
    word_idx = tokenizer.word_index
    idx_word = tokenizer.index_word
    # Keras word indices start at 1, so +1 leaves room for index 0
    num_words = len(word_idx) + 1
    word_counts = tokenizer.word_counts

    print(f'There are {num_words} unique words.')

    # Convert text to sequences of integers
    sequences = tokenizer.texts_to_sequences(texts)

    # Keep only texts that are comfortably longer than one training window.
    # Pairing texts and sequences positionally avoids re-indexing `texts`.
    min_length = training_length + min_extra_tokens
    new_texts = []
    new_sequences = []
    for text, seq in zip(texts, sequences):
        if len(seq) > min_length:
            new_texts.append(text)
            new_sequences.append(seq)

    training_seq = []
    labels = []

    # Slide a window over every kept sequence to create the training examples
    for seq in new_sequences:
        for i in range(training_length, len(seq)):
            # training_length feature tokens followed by the token to predict
            extract = seq[i - training_length:i + 1]

            # Features are all tokens but the last; the label is the last one
            training_seq.append(extract[:-1])
            labels.append(extract[-1])

    # Persist the fitted tokenizer so later input can be tokenized identically
    if tokenizer_path is not None:
        with open(tokenizer_path, 'wb') as handle:
            pickle.dump(tokenizer, handle)

    print(f'There are {len(training_seq)} training sequences.')

    # Return everything needed for setting up the model
    return word_idx, idx_word, num_words, word_counts, new_texts, new_sequences, training_seq, labels, tokenizer

In [91]:
# Custom filter set: compared with the default of make_sequences it leaves out
# the characters , - . ; ? so those are NOT stripped. Punctuation that
# format_patent separated with a space therefore survives as its own token.
filters = '!"#$%&()*+/:<=>@[\\]^_`{|}~\t\n'
# `features` receives training_seq (the token windows); `abstracts` and
# `sequences` receive the texts / token id lists that passed the length filter.
word_idx, idx_word, num_words, word_counts, abstracts, sequences, features, labels, tokenizer = make_sequences(
    formatted, TRAINING_LENGTH, lower=True, filters=filters)

There are 113835 unique words.
There are 2920608 training sequences.


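Each synopsis is now represented as a sequence of integers. Let's look at an example of a few features and the corresponding labels. Each feature list holds `TRAINING_LENGTH` (20) consecutive words, and the label is the word that immediately follows them. The cell below displays only the first 10 entries of one feature list.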

In [92]:
n = 3
features[n][:10]

[43585, 566, 26, 8023, 273, 7, 1, 2010, 5, 8675]

In [93]:
def find_answer(index):
    """Print the decoded feature words and decoded label of one training example.

    Reads the notebook-level `features`, `labels` and `idx_word` objects and
    prints the result; nothing is returned.
    """

    # Translate every token id of the selected feature window back to its word
    words = [idx_word[token] for token in features[index]]
    feats = ' '.join(words)

    # The label is the single token that follows the feature window
    target_word = idx_word[labels[index]]

    print('Features:', feats)
    print('\nLabel: ', target_word)

In [94]:
find_answer(n)

Features: 2071, humanity has colonized several of the planets and moons of the solar system leaving the now uninhabitable surface of

Label:  planet


In [95]:
find_answer(70)

Features: is balanced by his boisterous , pragmatic partner jet black as the pair makes a living chasing bounties and collecting

Label:  rewards


In [96]:
sorted(word_counts.items(), key=lambda x: x[1], reverse=True)[:15]

[('the', 209902),
 (',', 185326),
 ('.', 156995),
 ('to', 102625),
 ('and', 99748),
 ('a', 98118),
 ('of', 89035),
 ('in', 54201),
 ('is', 48560),
 ('his', 37483),
 ('with', 30348),
 ('he', 29489),
 ('that', 29302),
 ('her', 28845),
 ('by', 23559)]

# Training Data