In [4]:
import numpy as np
import pandas as pd
import re
import itertools
from collections import Counter

In [21]:
# Load the storyline/genre table; the first CSV column becomes the row index.
imdb = pd.read_csv("../../01_Data/Outputs/storyline_with_genres.csv", index_col=0)

In [3]:
# Preview the first rows to check the storyline text and the genre indicator columns.
imdb.head()

Unnamed: 0_level_0,storyline,Drama,Comedy,Thriller,Action,Romance,Adventure,Crime,Sci-Fi,Fantasy,...,Biography,Animation,Music,War,History,Sport,Musical,Documentary,Western,Others
movie_title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Avatar,"When his brother is killed in a robbery, parap...",0,0,0,1,0,1,0,1,1,...,0,0,0,0,0,0,0,0,0,0
Pirates of the Caribbean: At World's End,"After Elizabeth, Will, and Captain Barbossa re...",0,0,0,1,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
Spectre,A cryptic message from the past sends James Bo...,0,0,1,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
The Dark Knight Rises,Despite his tarnished reputation after the eve...,0,0,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Star Wars: Episode VII - The Force Awakens,,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [12]:
def clean_str(string):
    """
    Normalize a raw text string into space-separated, lower-cased tokens.

    Steps, in order:
      1. Every character that is not a letter, a digit, or one of
         ( ) , ! ? ' or a backtick is replaced by a space.
      2. The suffixes 's, 've, n't, 're, 'd and 'll are split off from the
         preceding word with a space.
      3. The punctuation marks , ! ( ) ? are surrounded by spaces so each
         becomes its own token.
      4. Runs of whitespace are collapsed, and the result is stripped and
         lower-cased.

    Parameters
    ----------
    string : str or missing value
        Raw text. Any value for which ``pd.notnull`` is false (for example
        ``None`` or ``NaN``) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or the placeholder ``"NA"`` when the input is missing.
    """
    if not pd.notnull(string):
        return "NA"

    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " 's", string)
    string = re.sub(r"\'ve", " 've", string)
    string = re.sub(r"n\'t", " n't", string)
    string = re.sub(r"\'re", " 're", string)
    string = re.sub(r"\'d", " 'd", string)
    string = re.sub(r"\'ll", " 'll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement text must NOT escape the punctuation: re.sub leaves an
    # unknown escape such as a backslash-parenthesis untouched, which would
    # put a literal backslash into every "(", ")" and "?" token.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()

In [16]:
# Sanity-check clean_str on the first storyline. .iloc keeps both lookups
# positional: the frame is indexed by movie title, so a bare integer key
# would depend on pandas' deprecated positional fallback.
imdb['storyline'].iloc[:5].apply(clean_str).iloc[0]

"when his brother is killed in a robbery , paraplegic marine jake sully decides to take his place in a mission on the distant world of pandora there he learns of greedy corporate figurehead parker selfridge 's intentions of driving off the native humanoid na'vi in order to mine for the precious material scattered throughout their rich woodland in exchange for the spinal surgery that will fix his legs , jake gathers intel for the cooperating military unit spearheaded by gung ho colonel quaritch , while simultaneously attempting to infiltrate the na'vi people with the use of an avatar identity while jake begins to bond with the native tribe and quickly falls in love with the beautiful alien neytiri , the restless colonel moves forward with his ruthless extermination tactics , forcing the soldier to take a stand and fight back in an epic battle for the fate of pandora"

In [11]:
# Raw (uncleaned) first storyline for comparison. .iloc makes the positional
# lookup explicit, since the index holds movie titles rather than integers.
imdb['storyline'].iloc[0]

'When his brother is killed in a robbery, paraplegic Marine Jake Sully decides to take his place in a mission on the distant world of Pandora. There he learns of greedy corporate figurehead Parker Selfridge\'s intentions of driving off the native humanoid "Na\'vi" in order to mine for the precious material scattered throughout their rich woodland. In exchange for the spinal surgery that will fix his legs, Jake gathers intel for the cooperating military unit spearheaded by gung-ho Colonel Quaritch, while simultaneously attempting to infiltrate the Na\'vi people with the use of an "avatar" identity. While Jake begins to bond with the native tribe and quickly falls in love with the beautiful alien Neytiri, the restless Colonel moves forward with his ruthless extermination tactics, forcing the soldier to take a stand - and fight back in an epic battle for the fate of Pandora.'

## Load data and labels

In [12]:
def load_data_and_labels(positive_data_file, negative_data_file):
    """
    Original function from https://github.com/dennybritz/cnn-text-classification-tf/blob/master/data_helpers.py
    Loads MR polarity data from files, cleans each sentence and generates labels.

    Parameters
    ----------
    positive_data_file : path to a text file with one positive example per line.
    negative_data_file : path to a text file with one negative example per line.

    Returns
    -------
    list
        ``[x_text, y]`` where ``x_text`` is the list of cleaned sentences
        (positive examples first, then negative) and ``y`` is an integer array
        of shape ``(n_examples, 2)`` holding ``[0, 1]`` for each positive and
        ``[1, 0]`` for each negative example.
    """
    def _read_stripped_lines(path):
        # One example per line; the context manager closes the file handle.
        with open(path, "r") as handle:
            return [line.strip() for line in handle]

    # Load data from files
    positive_examples = _read_stripped_lines(positive_data_file)
    negative_examples = _read_stripped_lines(negative_data_file)
    # Clean every sentence (sentences stay whole strings, they are not split)
    x_text = [clean_str(sent) for sent in positive_examples + negative_examples]
    # Generate labels
    labels = [[0, 1] for _ in positive_examples] + [[1, 0] for _ in negative_examples]
    # reshape keeps the (n, 2) layout even when one or both files are empty
    y = np.array(labels, dtype=np.int_).reshape(-1, 2)
    return [x_text, y]

In [2]:
def get_labels(df):
    """
    Return the column names of ``df`` with the first 'storyline' entry removed.

    The remaining names keep their original order. Raises ``ValueError`` when
    ``df`` has no 'storyline' column.
    """
    column_names = df.columns.tolist()
    storyline_pos = column_names.index('storyline')
    return column_names[:storyline_pos] + column_names[storyline_pos + 1:]

In [24]:
# List the label columns of the loaded frame (every column except 'storyline').
get_labels(imdb)

['Drama',
 'Comedy',
 'Thriller',
 'Action',
 'Romance',
 'Adventure',
 'Crime',
 'Sci-Fi',
 'Fantasy',
 'Horror',
 'Family',
 'Mystery',
 'Biography',
 'Animation',
 'Music',
 'War',
 'History',
 'Sport',
 'Musical',
 'Documentary',
 'Western',
 'Others']

In [23]:
def load_data_and_gen_labels(file_path):
    """
    Read a storyline CSV and return the cleaned texts with their label values.

    The CSV is read with its first column as the index. Each entry of the
    'storyline' column is passed through ``clean_str``; the label values are
    taken from the columns reported by ``get_labels``.

    Returns
    -------
    list
        ``[x_text, y]``: the list of cleaned storylines and the array of
        label-column values, in row order.
    """
    frame = pd.read_csv(file_path, index_col=0)
    # Normalized storyline text, one entry per row
    cleaned_storylines = list(map(clean_str, frame['storyline']))
    # Values of all label columns, in the frame's column order
    label_values = frame[get_labels(frame)].values
    return [cleaned_storylines, label_values]

In [24]:
# Build the cleaned storyline texts and the label matrix from the CSV on disk.
x_text, y = load_data_and_gen_labels("../../01_Data/Outputs/storyline_with_genres.csv")

In [31]:
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.

    Parameters
    ----------
    data : sequence
        Dataset; it is converted with ``np.array`` and batched along its
        first axis.
    batch_size : int
        Maximum number of items per batch.
    num_epochs : int
        Number of full passes over the dataset.
    shuffle : bool, default True
        When true, the items are re-ordered with a fresh
        ``np.random.permutation`` at the start of every epoch.

    Yields
    ------
    numpy.ndarray
        Consecutive slices of at most ``batch_size`` items; the last batch of
        an epoch may be smaller. An empty dataset yields no batches.
    """
    data = np.array(data)
    data_size = len(data)
    # Integer ceiling division: exact for any size and 0 for an empty dataset
    # (the float form int((n - 1) / batch_size) + 1 gave one empty batch there).
    num_batches_per_epoch = -(-data_size // batch_size)
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            # Clamp so the final batch stops at the end of the data
            end_index = min(start_index + batch_size, data_size)
            yield shuffled_data[start_index:end_index]