In [1]:
import pandas as pd
import numpy as np

## Read Data

In this section, we read in the `movies` dataframe, which contains information about each movie title.

The `movies` dataframe consists of 9 columns:

 * `tconst` - unique id of a movie
 * `primaryTitle` - title of the movie
 * `isAdult` - 0 or 1, indicating whether a movie is rated for adults or not
 * `startYear` - release year of the movie
 * `runtimeMinutes` - runtime of the movie, in minutes
 * `genres` - genres of the movie
 * `primaryName` - names of prominent cast/crew members
 * `category` - categories (roles) of cast/crew members specified in `primaryName`
 * `characters` - characters played by each cast/crew member (in the raw data, a member who does not play a character is marked with `\N`)

In [2]:
# Load the raw movie table, using the first CSV column as the row index, and preview the first rows.
movies = pd.read_csv(filepath_or_buffer='../movies.csv', index_col=0)
movies.head(n=5)

Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,primaryName,category,characters
0,tt0000009,Miss Jerry,0,1894,45,Romance,"['Blanche Bayliss', 'Alexander Black', 'Willia...","['actress', 'director', 'actor', 'actor']","['[""Miss Geraldine Holbrook (Miss Jerry)""]', '..."
1,tt0000502,Bohemios,0,1905,100,\N,"['Ricardo de Baños', 'Antonio del Pozo', 'El M...","['director', 'actor', 'actor']","['\\N', '\\N', '\\N']"
2,tt0000574,The Story of the Kelly Gang,0,1906,70,"Biography,Crime,Drama","['Bella Cola', 'Charles Tait', 'Elizabeth Tait...","['actress', 'director', 'actress', 'actor', 'c...","['\\N', '\\N', '[""Kate Kelly""]', '[""School Mas..."
3,tt0000679,The Fairylogue and Radio-Plays,0,1908,120,"Adventure,Fantasy","['L. Frank Baum', 'Francis Boggs', 'Frank Burn...","['actor', 'director', 'actor', 'composer', 'di...","['[""The Wizard of Oz Man""]', '\\N', '[""His Maj..."
4,tt0001184,Don Juan de Serrallonga,0,1910,58,"Adventure,Drama","['Ricardo de Baños', 'Alberto Marro', 'Dolores...","['director', 'director', 'actress', 'actor']","['\\N', '\\N', '\\N', '\\N']"


## Clean Movie Information

Currently, some rows in `movies` may have punctuation, uppercase letters, and digits in the `primaryTitle`, `genres`, `primaryName`,
`category` and `characters` columns. We must strip or lowercase these as part of the preprocessing step.

As the `genres`, `primaryName`, `category` and `characters` columns may consist of a list of values, we want to ensure
that each item is separated by a `,`. The `characters` column in particular may have multiple character names per
item (as each item in this list refers to one cast/crew member). These characters will be separated by a `/`.
Some cast/crew members may not have a character associated, so for the corresponding item, the value in the list will
be an empty string.

In [24]:
import re
import shared_functions.cleaning as cleaning

def update_movie_with_clean_text(row):
    """
    Updates the given row with clean text for its primaryTitle, genres, primaryName, category
    and characters columns.

    The row is modified in place and also returned. After the update:
      * genres, primaryName and category hold their cleaned items joined by ","
      * genres is NaN when the raw value is the backslash-N placeholder
      * characters holds one item per cast/crew member, joined by ","; the characters played by one
        member are joined by "/", and a member without a character contributes an empty item

    Every piece of text is cleaned with cleaning.clean_text (imported by the notebook).

    Parameters:
    row (Series): row that contains unclean text

    Returns:
    Row with cleaned text for the aforementioned columns.

    """
    # retrieve cleaned version of the title of the movie
    row.primaryTitle = cleaning.clean_text(row.primaryTitle)

    # if there are no genres given, put nan. else, retrieve genres in the form "romance,drama,tragedy"
    if row.genres == '\\N':
        row.genres = np.nan
    else:
        row.genres = ','.join([cleaning.clean_text(g) for g in row.genres.split(',')])

    # retrieve cast & crew in the form "john doe,jane doe,bob smith"
    row.primaryName = ','.join([cleaning.clean_text(n) for n in row.primaryName.split(',')])

    # retrieve jobs of the cast & crew in the form "director,actor,actress"
    row.category = ','.join([cleaning.clean_text(n) for n in row.category.split(',')])

    # One entry per cast/crew member. The first alternative captures the text between '["' and '"]'
    # (the character names of that member); the second matches the backslash-N placeholder, for which
    # the capture group is empty, so members without a character yield an empty string.
    # Raw strings keep the regex escapes from being read as (invalid) string escapes.
    roles_per_member = re.findall(r'\["(.*?)"\]|\\N', row.characters[1:-1])

    cleaned_characters = []
    for roles in roles_per_member:
        # retrieve characters portrayed by this member in the form "lucy/annie/sophie"
        # takes into acct other characters (",") or alternate names (" (" or " - ")
        names = [
            cleaning.clean_text(name).strip()
            for quoted in roles.split(',')
            for name in re.split(r' - | \(', quoted)
        ]
        cleaned_characters.append('/'.join(names))

    row.characters = ','.join(cleaned_characters)
    return row

In [28]:
# The cleaned table is cached on disk. If the cache file is missing it is rebuilt from the full
# `movies` table (not a debugging slice) and written out before being loaded.
CLEAN_MOVIES_CSV = '../clean_movies.csv'

try:
    clean_movies = pd.read_csv(CLEAN_MOVIES_CSV, index_col=0)
except FileNotFoundError:
    pd.DataFrame(
        [update_movie_with_clean_text(row) for _, row in movies.iterrows()]
    ).to_csv(CLEAN_MOVIES_CSV)
    # reload from disk so the frame looks the same whether or not the cache was just built
    # (e.g. empty strings written to the CSV are read back as NaN)
    clean_movies = pd.read_csv(CLEAN_MOVIES_CSV, index_col=0)

clean_movies.head()

Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,primaryName,category,characters
0,tt0000009,miss jerry,0,1894,45,romance,"blanche bayliss,alexander black,william courte...","actress,director,actor,actor","miss geraldine holbrook/miss jerry,,mr hamilto..."
1,tt0000502,bohemios,0,1905,100,,"ricardo de baños,antonio del pozo,el mochuelo","director,actor,actor",",,"
2,tt0000574,the story of the kelly gang,0,1906,70,"biography,crime,drama","bella cola,charles tait,elizabeth tait,john ta...","actress,director,actress,actor,composer,actor",",,kate kelly,school master,,steve hart"
3,tt0000679,the fairylogue and radioplays,0,1908,120,"adventure,fantasy","l frank baum,francis boggs,frank burns,nathani...","actor,director,actor,composer,director,actor,a...","the wizard of oz man,,his majesty the scarecro..."
4,tt0001184,don juan de serrallonga,0,1910,58,"adventure,drama","ricardo de baños,alberto marro,dolores puchol,...","director,director,actress,actor",",,,"


## Create Movie Mapping

As reviews may contain words unique to the movie such as names of the cast/crew or title of the movie, we want to create
a mapping of these terms to more generic words such as "actor" or "actress". We create a dictionary mapping of such words
for each movie below.

In addition to mapping the film title and full name of the cast/crew to a more generic term, we also map the last name
of a cast/crew member to their role, as such individuals are often referred to by their last names only.

In [71]:
def create_mapping(movie_row):
    """
    Creates a mapping between key words (title, cast/crew names, character names) in this movie and
    their replacement.

    For topic modelling purposes, it is preferred that the title of the movie, cast/crew names and character names
    be replaced with a generic word (either "movie" or that cast/crew/character's role in the movie.) By doing this, we
    can more easily detect when a user is referring to an actor/actress's performance, the director's choices, etc.

    Entries are written in this order, so a later entry wins when two keys collide:
    full cast/crew names, then the title, then (member by member) the last name followed by the
    member's character names.

    Blank cast/crew names and a blank or missing title are skipped instead of being stored as
    meaningless keys. Members without a matching category, and movies without usable character
    information, simply contribute fewer entries.

    Parameters:
    movie_row (Series): row of the cleaned movies table, as accepted by get_movie_attributes

    Returns:
    Dictionary with the movie's 'tconst' and its 'mapping' (dictionary of a word and its replacement).

    """
    # get the tconst, title, cast/crew names, categories (actor, director, etc.) and character names from movie
    tconst, title, cast_crew, categories, characters = get_movie_attributes(movie_row)

    # characters is only usable as a list; a non-string cell (e.g. NaN for a missing value) is handed
    # over unsplit, in which case there are no character names to map
    if not isinstance(characters, list):
        characters = []

    # initially map each cast/crew member to their category; blank names are skipped here rather than
    # removed from cast_crew so that indices stay aligned with categories and characters
    movie_mapping = {
        member: category
        for member, category in zip(cast_crew, categories)
        if member.strip()
    }

    # map the title of the movie to "movie" (a blank or missing title gives nothing to replace)
    if isinstance(title, str) and title.strip():
        movie_mapping[title] = 'movie'

    # zip stops at the shorter list: a member without a category cannot be mapped to anything
    for i, (member, category) in enumerate(zip(cast_crew, categories)):
        # map the last name of each cast/crew member to their category
        name_parts = member.split()
        if name_parts:
            movie_mapping[name_parts[-1]] = category

        # an actor/actress may perform multiple roles, separated by a "/"
        # each character they play is mapped to the actor/actress's category
        if i < len(characters):
            for character in characters[i].split('/'):
                if character:
                    movie_mapping[character] = category

    return {'tconst': tconst, 'mapping': movie_mapping}

def get_movie_attributes(movie_row):
    """
    Pulls apart one movie row into its id, title and per-member lists.

    The primaryName, category and characters fields are comma-separated strings; each is split into a
    list on ",". Position i of every list refers to the same cast/crew member. A member who plays
    several characters has them separated by a "/" inside a single item, and a member without a
    character has an empty string at that position.

    If the characters field is not a string, it is returned unchanged rather than split.

    Parameters:
    movie_row (Series): row that provides tconst, primaryTitle, primaryName, category and characters

    Returns:
    tconst of movie, title of movie, list of cast and crew, list of categories, list of characters

    """
    fields = (
        movie_row.tconst,
        movie_row.primaryTitle,
        movie_row.primaryName.split(','),
        movie_row.category.split(','),
    )

    roles = movie_row.characters
    if isinstance(roles, str):
        roles = roles.split(',')

    return (*fields, roles)


In [76]:
# Build one {'tconst', 'mapping'} record per cleaned movie, then pickle the resulting table.
mapping_records = [create_mapping(movie_row) for _, movie_row in clean_movies.iterrows()]
mapping = pd.DataFrame(mapping_records)
mapping.to_pickle('../pickles/movie_mapping.pickle')
