In [2]:
import pandas as pd
import numpy as np

## Read Data

In this section, we read in two csvs that contain data we have webscraped and data we have retrieved and consolidated from https://www.imdb.com/interfaces/. 

The `reviews` dataframe consists of 4 columns:
* `tconst` - unique id of a movie 
* `username` - username of the user who left a rating/review for this movie
* `rating` - user's rating for the movie out of 10 (1 being the lowest, 10 being the highest) 
* `review` - user's textual review of the movie 

The `movies` dataframe consists of 9 columns:

 * `tconst` - unique id of a movie 
 * `primaryTitle` - title of the movie 
 * `isAdult` - 0 or 1, indicating whether a movie is rated for adults or not
 * `startYear` - release year of the movie 
 * `runtimeMinutes` - runtime of the movie, in minutes 
 * `genres` - genres of the movie
 * `primaryName` - names of prominent cast/crew members 
 * `category` - categories (roles) of cast/crew members specified in `primaryName`
 * `characters` - characters played by cast/crew members (in the raw data this is `\N` when a member plays no character; after cleaning it becomes an empty string)

In [6]:
# `number` identifies which raw reviews file is loaded (it is part of the file name)
number = 81551
raw_reviews_path = f'./reviews/raw_reviews/raw_reviews_{number}.csv'
reviews = pd.read_csv(raw_reviews_path)
reviews

Unnamed: 0,tconst,username,rating,review
0,tt0018671,F Gwynplaine MacIntyre,2/10,Even though I've written some historical ficti...
1,tt0018673,Maleejandra,8/10,"Bare Knees is the epitome of the Jazz Age, but..."
2,tt0018673,kidboots,9/10,Sometimes you can get a truer picture of teen ...
3,tt0018673,David-240,9/10,This must be the ultimate flapper comedy - an ...
4,tt0018673,JohnHowardReid,10/10,A comedy-drama I greatly enjoyed was 1928's Ba...
...,...,...,...,...
996338,tt0126816,poolandrews,3/10,Blackjack starts as former FED Jack Devlin (Do...
996339,tt0126816,Jack_Yan,1/10,I caught this on television after the opening ...
996340,tt0126816,Miss K,1/10,This is easily one of the WORST movies i have ...
996341,tt0126816,sutcal,3/10,I have read many of the users comments regardi...


In [5]:
# the first column of movies.csv is used as the row index
movies_path = './movies.csv'
movies = pd.read_csv(movies_path, index_col=0)
movies

Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,primaryName,category,characters
0,tt0000009,Miss Jerry,0,1894,45,Romance,"['Blanche Bayliss', 'Alexander Black', 'Willia...","['actress', 'director', 'actor', 'actor']","['[""Miss Geraldine Holbrook (Miss Jerry)""]', '..."
1,tt0000502,Bohemios,0,1905,100,\N,"['Ricardo de Baños', 'Antonio del Pozo', 'El M...","['director', 'actor', 'actor']","['\\N', '\\N', '\\N']"
2,tt0000574,The Story of the Kelly Gang,0,1906,70,"Biography,Crime,Drama","['Bella Cola', 'Charles Tait', 'Elizabeth Tait...","['actress', 'director', 'actress', 'actor', 'c...","['\\N', '\\N', '[""Kate Kelly""]', '[""School Mas..."
3,tt0000679,The Fairylogue and Radio-Plays,0,1908,120,"Adventure,Fantasy","['L. Frank Baum', 'Francis Boggs', 'Frank Burn...","['actor', 'director', 'actor', 'composer', 'di...","['[""The Wizard of Oz Man""]', '\\N', '[""His Maj..."
4,tt0001184,Don Juan de Serrallonga,0,1910,58,"Adventure,Drama","['Ricardo de Baños', 'Alberto Marro', 'Dolores...","['director', 'director', 'actress', 'actor']","['\\N', '\\N', '\\N', '\\N']"
...,...,...,...,...,...,...,...,...,...
409726,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,0,2015,57,Documentary,"['Angela Gurgel', 'Ana Célia de Oliveira', 'Ol...","['director', 'director', 'actor']","['\\N', '\\N', '[""Rodolpho Teophilo""]']"
409727,tt9916680,De la ilusión al desconcierto: cine colombiano...,0,2007,100,Documentary,['Luis Ospina'],['director'],['\\N']
409728,tt9916692,Teatroteka: Czlowiek bez twarzy,0,2015,66,Drama,"['Zbigniew Zamachowski', 'Andrzej Bartnikowski...","['actor', 'director', 'actor', 'actress', 'com...","['[""Authority""]', '\\N', '[""Rafal""]', '[""Monik..."
409729,tt9916730,6 Gunn,0,2017,116,\N,"['Sunil Barve', 'Kiran Gawade', 'Bhushan Pradh...","['actor', 'director', 'actor', 'actor', 'actor']","['\\N', '\\N', '\\N', '\\N', '\\N']"


## Filter to Include Only English Reviews

Analysis will only be done on English reviews, as later procedures such as lemmatization and topic modelling are English-focused. 

Below, we filter the rows so that only those with English reviews are kept. 

In [7]:
from langdetect import detect_langs

def valid_english(row, min_confidence=0.9):
    """
    Checks if a review is in english (with at least `min_confidence` confidence).

    Parameters:
    row (Series): row which contains one review by a user for one movie 
    min_confidence (float): minimum probability langdetect must give english
    for the review to count as english. Defaults to 0.9 (90%).

    Returns:
    True if the first language reported by langdetect is english and its probability
    is at least min_confidence.
    False otherwise, including when the review is not a string, is blank, or
    langdetect fails to detect a language.

    """
    review = row.review

    # a review that is not text (e.g. a missing value) or is blank has nothing to detect
    if not isinstance(review, str) or not review.strip():
        return False

    try:
        # detect language(s) of review and their confidence
        candidates = detect_langs(review)
    # if the review does not contain letters, langdetect throws an exception -> not english
    # (Exception, not a bare except, so that interrupting a long run is not swallowed)
    except Exception:
        return False

    if not candidates:
        return False

    # TODO: cross validate here for probability?

    # the first entry is treated as the most probable language;
    # compare its probability as a number rather than as text
    best = candidates[0]
    return best.lang == 'en' and best.prob >= min_confidence

# flag each row whose review is valid english, then keep only the flagged rows
is_english = reviews.apply(valid_english, axis=1)
english_reviews = reviews[is_english]
english_reviews

Unnamed: 0,tconst,username,rating,review
0,tt0018671,F Gwynplaine MacIntyre,2/10,Even though I've written some historical ficti...
1,tt0018673,Maleejandra,8/10,"Bare Knees is the epitome of the Jazz Age, but..."
2,tt0018673,kidboots,9/10,Sometimes you can get a truer picture of teen ...
3,tt0018673,David-240,9/10,This must be the ultimate flapper comedy - an ...
4,tt0018673,JohnHowardReid,10/10,A comedy-drama I greatly enjoyed was 1928's Ba...
...,...,...,...,...
996338,tt0126816,poolandrews,3/10,Blackjack starts as former FED Jack Devlin (Do...
996339,tt0126816,Jack_Yan,1/10,I caught this on television after the opening ...
996340,tt0126816,Miss K,1/10,This is easily one of the WORST movies i have ...
996341,tt0126816,sutcal,3/10,I have read many of the users comments regardi...


## Clean Reviews & Movies

Currently, the text in `english_reviews` and `movies` contains uppercase letters, punctuation and digits. 

We want to fix this in the `review` column of `english_reviews`.

Additionally, we want to fix this in the `primaryTitle`, `genres`, `primaryName`, `category` and `characters` columns in `movies`. As the `genres`, `primaryName`, `category` and `characters` columns may consist of a list of values, we want to ensure that each item is separated by a `,`. The `characters` column in particular may have multiple character names per item (as each item in this list refers to one cast/crew member). These characters will be separated by a `/`. Some cast/crew members may not have a character associated, so for the corresponding item, the value in the list will be an empty string.

To more easily apply these fixes, we define helper functions for cleaning text below.

In [8]:
import string 
import re

def clean_text(text):
    """
    Normalises text: lowercases it, deletes punctuation and digits, and trims
    leading/trailing whitespace.

    Parameters:
    text (string): string which contains some unclean text

    Returns:
    The lowercased text without punctuation/digits. Whitespace inside the text is
    left as it is, so deleting a token can leave two spaces next to each other.

    """
    # characters to delete: ASCII punctuation, curly quotes, the em dash and all digits
    unwanted = string.punctuation + '“”’—' + string.digits
    lowered = text.lower()
    return lowered.translate(str.maketrans('', '', unwanted)).strip()

def update_review_with_clean_text(row):
    """
    Replaces the review of the given row with its cleaned version (see clean_text).

    Parameters:
    row (Series): row that contains an unclean review

    Returns:
    The same row object, with its review column overwritten by the cleaned text.

    """
    row.review = clean_text(row.review)
    return row

def update_movie_with_clean_text(row):
    """
    Updates the given row with clean text for its primaryTitle, genres, primaryName, category 
    and character columns.

    After cleaning:
    * genres, primaryName and category are comma-separated strings, e.g. "romance,drama"
    * characters is a comma-separated string with one item per regex match in the raw value; 
      several characters in one item are separated by "/", and a "\\N" marker becomes 
      an empty item

    Parameters:
    row (Series): row that contains unclean text

    Returns:
    The same row object, with cleaned text for the aforementioned columns. 

    """
    # retrieve cleaned version of the title of the movie 
    row.primaryTitle = clean_text(row.primaryTitle)

    # if there are no genres given (the '\N' marker, or a value that is not text such as a
    # missing value), put nan. else, retrieve genres in the form "romance,drama,tragedy"
    if not isinstance(row.genres, str) or row.genres == '\\N':
        row.genres = np.nan
    else:
        row.genres = ','.join([clean_text(g) for g in row.genres.split(',')])

    # retrieve cast & crew in the form "john doe,jane doe,bob smith"
    row.primaryName = ','.join([clean_text(n) for n in row.primaryName.split(',')])

    # retrieve jobs of the cast & crew in the form "director,actor,actress"
    row.category = ','.join([clean_text(n) for n in row.category.split(',')])

    # will give the roles (character names) portrayed by each actor (in a list)
    # a '\N' marker matches the second alternative, which leaves the group as an empty string
    # raw strings keep the regex backslashes from being read as (invalid) string escapes
    actor_roles = re.findall(r'\["(.*?)"\]|\\N', row.characters[1:-1])

    cleaned_characters = []
    for role in actor_roles:
        # retrieve characters portrayed by this actor in the form "lucy/annie/sophie"
        # takes into acct other characters (",") or alternate names ("()" or "-")
        names = []
        for part in role.split(','):
            for char in re.split(r' - | \(', part):
                names.append(clean_text(char))
        cleaned_characters.append('/'.join(names))

    row.characters = ','.join(cleaned_characters)

    return row

In [10]:
# clean every review in a single pass over the review column; rebuilding the frame one row
# at a time with iterrows() creates a new Series per row and is very slow on a frame this size
clean_english_reviews = english_reviews.assign(review=english_reviews['review'].map(clean_text))
clean_english_reviews

KeyboardInterrupt: 

In [None]:
# clean each movie row in turn, then rebuild a DataFrame from the cleaned rows
cleaned_movie_rows = []
for movie_label, movie_row in movies.iterrows():
    cleaned_movie_rows.append(update_movie_with_clean_text(movie_row))

clean_movies = pd.DataFrame(cleaned_movie_rows)
clean_movies

In [None]:
def create_mapping(tconst):
    """
    Creates a mapping between a word in this movie (corresponding to tconst) and its replacement. 
    
    For topic modelling purposes, it is preferred that the title of the movie, cast/crew names and character names
    be replaced with a generic word (either "movie" or that cast/crew/character's role in the movie.) By doing this, we 
    can more easily detect when a user is referring to an actor/actress's performance, the director's choices, etc. 

    Empty strings are never used as keys. Cleaning can leave a title or name empty (for example when 
    it only consisted of digits), and replacing an empty string inserts the replacement between 
    every character of the text.

    Parameters:
    tconst (string): unique id of a movie.

    Returns:
    Mapping (dictionary) of a word and its replacement. Keys are the cast/crew full names, the title, 
    each cast/crew member's last name and each character name. 

    """
    # get the row that corresponds to tconst (read from the global clean_movies dataframe)
    movie = clean_movies[clean_movies.tconst == tconst]
    # get the title, cast/crew names, categories (actor, director, etc.) and character names from movie 
    title, cast_crew, categories, characters = get_movie_attributes(movie)

    mapping = {}

    # initially map each cast/crew member's full name to their category 
    for name, category in zip(cast_crew, categories):
        if name:
            mapping[name] = category

    # map the title of the movie to "movie" (skipped when cleaning left the title empty)
    if title:
        mapping[title] = 'movie'

    for i, name in enumerate(cast_crew):
        # map the last name of each cast/crew member to their category
        # an empty name has no parts, so there is no last name to map
        name_parts = name.split()
        if name_parts:
            mapping[name_parts[-1]] = categories[i]

        # an actor/actress may perform multiple roles, separated by a "/"
        # each character they play is mapped to the actor/actress's category 
        for c in characters[i].split('/'):
            if c:
                mapping[c] = categories[i]

    return mapping

def get_movie_attributes(movie):
    """
    Gives the title, list of cast & crew, list of categories of the cast & crew (e.g. director, actress, etc.) and 
    the characters each cast/crew member played, all read from the first row of movie. 
    
    The three lists are obtained by splitting the corresponding column on ",". 
    A cast/crew member who does not play a character has an empty string at their index. 
    A member who plays several characters has the character names separated by a "/" at their index.

    Parameters:
    movie (DataFrame): dataframe whose first row holds the movie; it must have the primaryTitle, 
    primaryName, category and characters columns. 

    Returns:
    title of movie, list of cast and crew, list of categories, list of characters

    """
    def first_row_value(column):
        # value of the given column in the first row of movie
        return movie.iloc[0, movie.columns.get_loc(column)]

    title = first_row_value('primaryTitle')
    cast_crew, categories, characters = (
        first_row_value(column).split(',')
        for column in ('primaryName', 'category', 'characters')
    )
    return title, cast_crew, categories, characters

def replace_cast_crew(row):
    """
    Replaces any mention of the title of the movie, cast & crew names and character names with their category (role).

    Parameters:
    row (Series): row that contains information on a movie and all of the users who have left a review. Columns 
    consist of a tconst (id of movie, and is the index), usernames list, ratings list and reviews list. Each item
    across all three lists correspond with each other.

    Returns:
    List of dictionaries containing the new rows with the replaced reviews. Each dictionary entry consists of only one
    user's review/rating for a movie corresponding with that tconst. 

    """
    # create the mapping of replacement words 
    mapping = create_mapping(row.name)

    # drop empty keys: replacing '' would insert the replacement between every character
    # replace longer keys first, so a full name is replaced as a whole before a shorter key
    # that it contains (e.g. a last name) can split it up
    ordered_words = sorted((word for word in mapping if word), key=len, reverse=True)

    movie_reviews_list = []

    for username, rating, review in zip(row.username, row.rating, row.review):
        # for each user, replace the review 
        # NOTE(review): this is plain substring replacement, so a short key can also match
        # inside a longer word - confirm this is acceptable for the topic model
        replaced_review = review
        for word in ordered_words:
            replaced_review = replaced_review.replace(word, mapping[word])

        # append this entry of one user's replaced review to the movie_reviews_list
        movie_reviews_list.append(
            {'tconst': row.name, 'username': username, 'rating': rating, 'review': replaced_review}
        )

    return movie_reviews_list

# one row per movie (tconst): the usernames, ratings and reviews of all of its users are 
# each collected into a list, so the replacement mapping is only created once per movie 
grouped_reviews = clean_english_reviews.groupby('tconst').agg(list)

# every movie row expands back into one entry per user
replaced_review_entries = []
for movie_tconst, movie_row in grouped_reviews.iterrows():
    replaced_review_entries.extend(replace_cast_crew(movie_row))

replaced_reviews = pd.DataFrame(replaced_review_entries)
replaced_reviews

## Lemmatize Reviews and Movies

In [None]:
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

def wordnet_pos_tag(word):
    """
    Gives the wordnet "part of speech" (POS) tag to use when lemmatizing a word. The lemmatizer 
    takes this tag because the lemma of a word can depend on whether it is used as, for example, 
    a noun or a verb.
    
    Wordnet accepts only 4 different tags: adjective, noun, verb or adverb. 
    nltk.pos_tag() gives many different tags, so the first letter of the nltk tag decides the 
    wordnet tag: J -> adjective, N -> noun, V -> verb, R -> adverb. Any other tag is treated as a noun.

    Note that the word is tagged on its own, without the rest of its sentence.

    Parameters:
    word (string): a word from a review.

    Returns:
    wordnet POS tag for that word 

    """
    # nltk.pos_tag returns a list such as [('Dog', 'NN')]; take the tag of its only entry
    nltk_tag = nltk.pos_tag([word])[0][1]
    first_letter = nltk_tag[0].upper()

    if first_letter == "J":
        return wordnet.ADJ
    if first_letter == "V":
        return wordnet.VERB
    if first_letter == "R":
        return wordnet.ADV

    # "N" and every tag without a wordnet equivalent are treated as a noun
    return wordnet.NOUN

def lemmatize(text):
    """
    Lemmatizes the text word by word. 

    The text is tokenized with nltk, each token is lemmatized with the global lemmatizer using 
    the POS tag from wordnet_pos_tag, and the lemmas are joined with single spaces.

    Parameters:
    text (string): string with unlemmatized text.

    Returns:
    Lemmatized text.

    """
    lemmas = []
    for token in nltk.word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos_tag(token)))

    # join the lemmas back into a string 
    return ' '.join(lemmas)

def lemmatize_row(row):
    """
    Replaces the review of the given row with its lemmatized version and prints a progress 
    message for the row.

    Parameters:
    row (Series): row that contains an unlemmatized review

    Returns:
    The same row object, with its review column overwritten by the lemmatized text. 

    """
    # '\r' moves back to the start of the line, so each message overwrites the previous one
    progress_message = f'Processing row {row.name}...'
    print(progress_message, end='\r')

    row.review = lemmatize(row.review)
    return row

# shared lemmatizer instance; lemmatize() reads it as a global
lemmatizer = WordNetLemmatizer()

# lemmatize the review of every row, then rebuild a DataFrame from the updated rows
lemmatized_rows = []
for review_label, review_row in replaced_reviews.iterrows():
    lemmatized_rows.append(lemmatize_row(review_row))
lemmatized_reviews = pd.DataFrame(lemmatized_rows)

print('Done processing all rows!', end='\r')
lemmatized_reviews

In [None]:
# the output file name carries the same `number` as the raw reviews file that was read
lemmatized_reviews_path = f'./reviews/lemmatized_reviews/lemmatized_reviews_{number}.csv'
lemmatized_reviews.to_csv(lemmatized_reviews_path)

## Clean and Lemmatize Guide Documents

In [50]:
import glob

# get original filenames of guide docs 
guide_filenames = list(glob.glob("./topics/raw_topics/*.txt"))

# read each file and put it on a single line (its lines are joined with spaces)
guide_documents = []
for raw_path in guide_filenames:
    with open(raw_path, encoding='utf-8') as raw_file:
        raw_lines = raw_file.read().splitlines()
    guide_documents.append(' '.join(raw_lines))

# clean, then lemmatize, each document 
lemm_guide_docs = []
for guide_doc in guide_documents:
    lemm_guide_docs.append(lemmatize(clean_text(guide_doc)))

# write each lemmatized document under ./topics/lemm_topics, reusing the part of the 
# original path that follows 'raw_topics'
for raw_path, lemm_doc in zip(guide_filenames, lemm_guide_docs):
    new_filepath = './topics/lemm_topics' + raw_path.split('raw_topics')[1]
    with open(new_filepath, 'w', encoding='utf-8') as lemm_file:
        lemm_file.write(lemm_doc)

## Lemmatize Guide Words

In [637]:
import os 

# guide words for each topic; one inner list per topic, in the order given by the comments
basic_guide_words = [
    # acting
    ['acting', 'actor', 'actress', 'character', 'performance', 'convincing', 'multidimensional', 'authentic', 'portray'],
    # attraction
    ['attraction', 'premise', 'entertainment', 'interesting', 'pitch', 'amuse', 'enjoy', 'fun'],
    # cinematography
    ['cinematography', 'visual', 'lighting', 'setting', 'wardrobe', 'camera', 'angles', 'view', 'frame', 'shot', 'aesthetic'],
    # dialogue
    ['dialogue', 'storytelling', 'context', 'story', 'monologue', 'speech', 'express'],
    # directing
    ['directing', 'style', 'execution', 'vision', 'creativity', 'perfect'],
    # editing and effects
    ['editing', 'effects', 'tone', 'vfx', 'sfx', 'animation', 'cgi'],
    # it factor
    ['original', 'innovative', 'best', 'beyond', 'amazing', 'memorable', 'unique', 'special', 'experience'],
    # plot
    ['plot', 'story', 'arc', 'plausibility', 'structure', 'world', 'pace'],
    # sound and music
    ['sound', 'music', 'harmony', 'mood', 'song', 'soul', 'volume', 'mix'],
    # theme
    ['theme', 'identity', 'intrigue', 'message', 'powerful', 'meaning', 'emotional', 'thoughtful', 'bond'],
]

# lemmatize all guide words, keeping the one-list-per-topic structure
lemm_basic_guide_words = []
for guide_topic in basic_guide_words:
    lemm_basic_guide_words.append(
        [lemmatizer.lemmatize(guide_word, wordnet_pos_tag(guide_word)) for guide_word in guide_topic]
    )

# write lemmatized guide words to text file: one line per topic, words separated by spaces
lemm_guide_words_filepath = './topics/lemm_guide_words.txt'

# mode 'w' creates the file or empties an existing one, so the file always holds only this 
# run's words without a separate exists/remove step
with open(lemm_guide_words_filepath, 'w', encoding='utf-8') as f: 
    for topic_words in lemm_basic_guide_words:
        f.write(' '.join(topic_words) + '\n')