In [2]:
import pandas as pd
import numpy as np

## Read Data

In this section, we read in two CSV files: one containing data we web-scraped, and one containing data we retrieved and consolidated from https://www.imdb.com/interfaces/. 

The `reviews` dataframe consists of 4 columns:
* `tconst` - unique id of a movie 
* `username` - username of the user who left a rating/review for this movie
* `rating` - user's rating for the movie out of 10 (1 being the lowest, 10 being the highest) 
* `review` - user's textual review of the movie 

The `movies` dataframe consists of 9 columns:

 * `tconst` - unique id of a movie 
 * `primaryTitle` - title of the movie 
 * `isAdult` - 0 or 1, indicating whether a movie is rated for adults or not
 * `startYear` - release year of the movie 
 * `runtimeMinutes` - runtime of the movie, in minutes 
 * `genres` - genres of the movie
 * `primaryName` - names of prominent cast/crew members 
 * `category` - categories (roles) of cast/crew members specified in `primaryName`
 * `characters` - characters played by cast/crew members (if no character played by a member, then will be an empty string)

In [10]:
# batch id: selects which raw reviews file is read and is reused in the output file name
number = 5933
raw_reviews_path = f'../data/reviews/raw_reviews/raw_reviews_{number}.csv'
reviews = pd.read_csv(raw_reviews_path)
reviews.head()

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,THE STORY OF THE KELLY GANG is believed to be ...
1,tt0000574,F Gwynplaine MacIntyre,10/10,"This afternoon at the Barbican, I attended the..."
2,tt0000574,ackstasis,9/10,'The Story of the Kelly Gang' is undoubtedly o...
3,tt0000574,Ziggy5446,10/10,The Story of the Kelly Gang (1906) symbolizes ...
4,tt0000574,Fella_shibby,8/10,This is the original n the first account of Ne...


In [9]:
# the first column of the CSV is used as the row index
movies_path = '../movies.csv'
movies = pd.read_csv(movies_path, index_col=0)
movies.head()

Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,primaryName,category,characters
0,tt0000009,Miss Jerry,0,1894,45,Romance,"['Blanche Bayliss', 'Alexander Black', 'Willia...","['actress', 'director', 'actor', 'actor']","['[""Miss Geraldine Holbrook (Miss Jerry)""]', '..."
1,tt0000502,Bohemios,0,1905,100,\N,"['Ricardo de Baños', 'Antonio del Pozo', 'El M...","['director', 'actor', 'actor']","['\\N', '\\N', '\\N']"
2,tt0000574,The Story of the Kelly Gang,0,1906,70,"Biography,Crime,Drama","['Bella Cola', 'Charles Tait', 'Elizabeth Tait...","['actress', 'director', 'actress', 'actor', 'c...","['\\N', '\\N', '[""Kate Kelly""]', '[""School Mas..."
3,tt0000679,The Fairylogue and Radio-Plays,0,1908,120,"Adventure,Fantasy","['L. Frank Baum', 'Francis Boggs', 'Frank Burn...","['actor', 'director', 'actor', 'composer', 'di...","['[""The Wizard of Oz Man""]', '\\N', '[""His Maj..."
4,tt0001184,Don Juan de Serrallonga,0,1910,58,"Adventure,Drama","['Ricardo de Baños', 'Alberto Marro', 'Dolores...","['director', 'director', 'actress', 'actor']","['\\N', '\\N', '\\N', '\\N']"


## Filter to Include Only English Reviews

Analysis will only be done on English reviews, as later procedures such as lemmatization and topic modelling will be English-focused. 

Below, we will filter the rows such that only the ones with English reviews are kept. 

In [11]:
import shared_functions.cleaning as cleaning

# row-wise boolean mask: True where cleaning.valid_english accepts the row
is_english = reviews.apply(cleaning.valid_english, axis=1)

# keep only the rows flagged by the mask
english_reviews = reviews[is_english]
english_reviews

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,THE STORY OF THE KELLY GANG is believed to be ...
1,tt0000574,F Gwynplaine MacIntyre,10/10,"This afternoon at the Barbican, I attended the..."
2,tt0000574,ackstasis,9/10,'The Story of the Kelly Gang' is undoubtedly o...
3,tt0000574,Ziggy5446,10/10,The Story of the Kelly Gang (1906) symbolizes ...
4,tt0000574,Fella_shibby,8/10,This is the original n the first account of Ne...
...,...,...,...,...
11746,tt0018621,JohnHowardReid,8/10,Paramount groomed super-beautiful Lila Lee as ...
11747,tt0018621,kidboots,8/10,Lila Lee seemed to be in every other movie dur...
11748,tt0018621,F Gwynplaine MacIntyre,7/10,Lila Lee was an attractive and talented actres...
11749,tt0018638,cliffperriam,8/10,ALIAS JIMMY VALENTINE exists as a 4-reel silen...


## Clean Reviews & Movies

Currently, the text in `english_reviews` and `movies` contains uppercase letters, punctuation and digits. 

We want to fix this in the `review` column of `english_reviews`.

Additionally, we want to fix this in the `primaryTitle`, `genres`, `primaryName`, `category` and `characters` columns in `movies`. As the `genres`, `primaryName`, `category` and `characters` columns may consist of a list of values, we want to ensure that each item is separated by a `,`. The `characters` column in particular may have multiple character names per item (as each item in this list refers to one cast/crew member). These characters will be separated by a `/`. Some cast/crew members may not have a character associated, so for the corresponding item, the value in the list will be an empty string.

In [12]:
import re

def update_review_with_clean_text(row):
    """
    Overwrites the review column of the given row with its cleaned text.

    Parameters:
    row (Series): a reviews row whose `review` value has not been cleaned yet

    Returns:
    The same row object, with `review` replaced by the result of
    cleaning.clean_text.

    """
    row.review = cleaning.clean_text(row.review)
    return row

def update_movie_with_clean_text(row):
    """
    Updates the given row with clean text for its primaryTitle, genres, primaryName, category
    and characters columns.

    After the update:
    * `primaryTitle` is the cleaned title
    * `genres` is NaN when no genres are given, else a string such as "romance,drama,tragedy"
    * `primaryName` and `category` are comma-separated strings with one cleaned item per member
    * `characters` is a comma-separated string with one item per cast/crew member; multiple
      characters (or alternate names) of one member are joined with "/", and a member without
      a character contributes an empty item

    Parameters:
    row (Series): row that contains unclean text

    Returns:
    The same row, with cleaned text for the aforementioned columns.

    """
    # Raw strings keep the regex escapes (\[, \(, \\) away from Python's own
    # string-escape processing, which warns about invalid escapes such as "\[".
    # One match per cast/crew member: either a ["..."] group or the \N placeholder
    # (for which the captured group is the empty string).
    role_pattern = r'\["(.*?)"\]|\\N'
    # alternate names are introduced by " - " or " ("
    alternate_name_pattern = r' - | \('

    # retrieve cleaned version of the title of the movie
    row.primaryTitle = cleaning.clean_text(row.primaryTitle)

    # if there are no genres given (the "\N" placeholder, or a non-string value such as NaN),
    # put nan. else, retrieve genres in the form "romance,drama,tragedy"
    if not isinstance(row.genres, str) or row.genres == '\\N':
        row.genres = np.nan
    else:
        row.genres = ','.join(cleaning.clean_text(genre) for genre in row.genres.split(','))

    # retrieve cast & crew in the form "john doe,jane doe,bob smith"
    row.primaryName = ','.join(cleaning.clean_text(name) for name in row.primaryName.split(','))

    # retrieve jobs of the cast & crew in the form "director,actor,actress"
    row.category = ','.join(cleaning.clean_text(job) for job in row.category.split(','))

    # the outer "[" and "]" of the stringified list are dropped before matching
    actor_roles = re.findall(role_pattern, row.characters[1:-1])

    cleaned_characters = []
    for role in actor_roles:
        # retrieve characters portrayed by this member in the form "lucy/annie/sophie",
        # splitting on other characters (",") and alternate names (" - " or " (")
        character_names = [
            cleaning.clean_text(name).strip()
            for part in role.split(',')
            for name in re.split(alternate_name_pattern, part)
        ]
        cleaned_characters.append('/'.join(character_names))

    row.characters = ','.join(cleaned_characters)

    return row

In [13]:
# clean every review row, then assemble the cleaned rows into a new dataframe
cleaned_review_rows = [
    update_review_with_clean_text(review_row)
    for _, review_row in english_reviews.iterrows()
]
clean_english_reviews = pd.DataFrame(cleaned_review_rows)
clean_english_reviews

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,the story of the kelly gang is believed to be ...
1,tt0000574,F Gwynplaine MacIntyre,10/10,this afternoon at the barbican i attended the ...
2,tt0000574,ackstasis,9/10,the story of the kelly gang is undoubtedly one...
3,tt0000574,Ziggy5446,10/10,the story of the kelly gang symbolizes both t...
4,tt0000574,Fella_shibby,8/10,this is the original n the first account of ne...
...,...,...,...,...
11746,tt0018621,JohnHowardReid,8/10,paramount groomed superbeautiful lila lee as a...
11747,tt0018621,kidboots,8/10,lila lee seemed to be in every other movie dur...
11748,tt0018621,F Gwynplaine MacIntyre,7/10,lila lee was an attractive and talented actres...
11749,tt0018638,cliffperriam,8/10,alias jimmy valentine exists as a reel silent ...


In [15]:
# clean every movie row, then assemble the cleaned rows into a new dataframe
cleaned_movie_rows = [
    update_movie_with_clean_text(movie_row)
    for _, movie_row in movies.iterrows()
]
clean_movies = pd.DataFrame(cleaned_movie_rows)
clean_movies

Unnamed: 0,tconst,primaryTitle,isAdult,startYear,runtimeMinutes,genres,primaryName,category,characters
0,tt0000009,miss jerry,0,1894,45,romance,"blanche bayliss,alexander black,william courte...","actress,director,actor,actor","miss geraldine holbrook/miss jerry,,mr hamilto..."
1,tt0000502,bohemios,0,1905,100,,"ricardo de baños,antonio del pozo,el mochuelo","director,actor,actor",",,"
2,tt0000574,the story of the kelly gang,0,1906,70,"biography,crime,drama","bella cola,charles tait,elizabeth tait,john ta...","actress,director,actress,actor,composer,actor",",,kate kelly,school master,,steve hart"
3,tt0000679,the fairylogue and radioplays,0,1908,120,"adventure,fantasy","l frank baum,francis boggs,frank burns,nathani...","actor,director,actor,composer,director,actor,a...","the wizard of oz man,,his majesty the scarecro..."
4,tt0001184,don juan de serrallonga,0,1910,58,"adventure,drama","ricardo de baños,alberto marro,dolores puchol,...","director,director,actress,actor",",,,"
...,...,...,...,...,...,...,...,...,...
409726,tt9916622,rodolpho teóphilo o legado de um pioneiro,0,2015,57,documentary,"angela gurgel,ana célia de oliveira,oldair soa...","director,director,actor",",,rodolpho teophilo"
409727,tt9916680,de la ilusión al desconcierto cine colombiano,0,2007,100,documentary,luis ospina,director,
409728,tt9916692,teatroteka czlowiek bez twarzy,0,2015,66,drama,"zbigniew zamachowski,andrzej bartnikowski,samb...","actor,director,actor,actress,composer,actress","authority,,rafal,monika,,bianka"
409729,tt9916730,gunn,0,2017,116,,"sunil barve,kiran gawade,bhushan pradhan,archi...","actor,director,actor,actor,actor",",,,,"


In [17]:
def create_mapping(tconst):
    """
    Creates a mapping between a word in this movie (corresponding to tconst) and its replacement.

    For topic modelling purposes, it is preferred that the title of the movie, cast/crew names and character names
    be replaced with a generic word (either "movie" or that cast/crew/character's role in the movie.) By doing this, we
    can more easily detect when a user is referring to an actor/actress's performance, the director's choices, etc.

    Blank titles, names and characters are never used as keys: an empty key would make a plain string replacement
    insert the replacement between every character of a review.

    Parameters:
    tconst (string): unique id of a movie.

    Returns:
    Mapping (dictionary) of a word and its replacement. Keys are inserted in this order: full cast/crew names,
    the title, then for each member their last name followed by their characters.

    """
    # get the row that corresponds to tconst
    movie = clean_movies[clean_movies.tconst == tconst]
    # get the title, cast/crew names, categories (actor, director, etc.) and character names from movie
    title, cast_crew, categories, characters = get_movie_attributes(movie)

    # initially map each (non-blank) cast/crew member to their category
    mapping = {
        name: category
        for name, category in zip(cast_crew, categories)
        if name.strip()
    }

    # map the title of the movie to "movie" (a title can be blank once cleaned)
    if title.strip():
        mapping[title] = 'movie'

    # pad with empty entries so a short characters list cannot cut the loop below short
    padded_characters = characters + [''] * (len(cast_crew) - len(characters))

    for name, category, roles in zip(cast_crew, categories, padded_characters):
        # map the last name of each cast/crew member to their category
        name_parts = name.split()
        if name_parts:
            mapping[name_parts[-1]] = category

        # an actor/actress may perform multiple roles, separated by a "/"
        # each character they play is mapped to the actor/actress's category
        for character in roles.split('/'):
            if character.strip():
                mapping[character] = category

    return mapping

def get_movie_attributes(movie):
    """
    Extracts the title, cast & crew, their categories (e.g. director, actress, etc.) and the characters they
    played from the first row of the given movie dataframe.

    The primaryName, category and characters values are comma-separated strings and are split into lists here.
    A cast/crew member without a character has an empty string at their index. When a member plays several
    characters, the names stay together at that index, separated by a "/".

    Parameters:
    movie (DataFrame): dataframe whose first row holds the movie; must have the primaryTitle, primaryName,
    category and characters columns.

    Returns:
    title of movie, list of cast and crew, list of categories, list of characters

    """
    def first_value(column):
        # value of the given column in the first row (by position)
        return movie[column].iloc[0]

    title = first_value('primaryTitle')
    cast_crew, categories, characters = (
        first_value(column).split(',')
        for column in ('primaryName', 'category', 'characters')
    )
    return title, cast_crew, categories, characters

def replace_cast_crew(row):
    """
    Replaces any mention of the title of the movie, cast & crew names and character names with their category (role).

    Only whole-word mentions are replaced, so a short name or title does not alter unrelated words that merely
    contain it (e.g. the last name "lee" inside "sleep"). All terms are replaced in a single pass, with longer
    terms preferred over shorter ones at the same position, so text that was already substituted is never
    replaced again. Empty terms are ignored.

    Parameters:
    row (Series): row that contains information on a movie and all of the users who have left a review. Columns
    consist of a tconst (id of movie, and is the index), usernames list, ratings list and reviews list. Each item
    across all three lists correspond with each other.

    Returns:
    List of dictionaries containing the new rows with the replaced reviews. Each dictionary entry consists of only one
    user's review/rating for a movie corresponding with that tconst.

    """
    # create the mapping of replacement words, dropping empty terms (they would match everywhere)
    mapping = {word: replacement for word, replacement in create_mapping(row.name).items() if word}

    pattern = None
    if mapping:
        # longest terms first so that a full name wins over the last name it ends with
        terms = sorted(mapping, key=len, reverse=True)
        alternation = '|'.join(re.escape(term) for term in terms)
        # the lookarounds require that a match is not glued to other word characters
        pattern = re.compile(rf'(?<!\w)(?:{alternation})(?!\w)')

    movie_reviews_list = []
    for username, rating, review in zip(row.username, row.rating, row.review):
        # for each user, replace every whole-word mention in one pass
        if pattern is not None:
            review = pattern.sub(lambda match: mapping[match.group(0)], review)

        # append this entry of one user's replaced review to the movie_reviews_list
        movie_reviews_list.append(
            {'tconst': row.name, 'username': username, 'rating': rating, 'review': review}
        )
    return movie_reviews_list

# one row per movie: usernames, ratings and reviews are each aggregated into a list,
# so the replacement mapping only has to be created once per movie
grouped_reviews = clean_english_reviews.groupby('tconst').agg(list)

# flatten the per-movie lists of replaced entries back into one record per review
replaced_records = []
for _, movie_row in grouped_reviews.iterrows():
    replaced_records.extend(replace_cast_crew(movie_row))

replaced_reviews = pd.DataFrame(replaced_records)
replaced_reviews

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,movie is believed to be the worlds first featu...
1,tt0000574,F Gwynplaine MacIntyre,10/10,this afternoon at the barbican i attended the ...
2,tt0000574,ackstasis,9/10,movie is undoubtedly one of the cinemas most s...
3,tt0000574,Ziggy5446,10/10,movie symbolizes both the birth of the austra...
4,tt0000574,Fella_shibby,8/10,this is the original n the first account of ne...
...,...,...,...,...
11743,tt0018621,JohnHowardReid,8/10,paramount groomed superbeautiful actress as a ...
11744,tt0018621,kidboots,8/10,actress seemed to be in every other movie duri...
11745,tt0018621,F Gwynplaine MacIntyre,7/10,actress was an attractive and talented actress...
11746,tt0018638,cliffperriam,8/10,movie exists as a reel silent french mm releas...


## Lemmatize Reviews

In [18]:
from nltk.stem import WordNetLemmatizer

def lemmatize_row(row):
    """
    Overwrites the review column of the given row with its lemmatized text, printing a progress
    message for the row first.

    Uses the module-level `lemmatizer`, which must be defined before this function is called.

    Parameters:
    row (Series): row that contains unlemmatized reviews

    Returns:
    The same row, with its review lemmatized.

    """
    # "\r" returns to the start of the line so the next message overwrites this one
    progress_message = f'Processing row {row.name}...'
    print(progress_message, end='\r')

    lemmatized_text = cleaning.lemmatize(row.review, lemmatizer)
    row.review = lemmatized_text
    return row

# a single lemmatizer instance is shared by every lemmatize_row call
lemmatizer = WordNetLemmatizer()

lemmatized_rows = []
for _, review_row in replaced_reviews.iterrows():
    lemmatized_rows.append(lemmatize_row(review_row))

lemmatized_reviews = pd.DataFrame(lemmatized_rows)
print(f'Done processing all rows!', end='\r')
lemmatized_reviews

Done processing all rows!

Unnamed: 0,tconst,username,rating,review
0,tt0000574,David-240,10/10,movie be believe to be the world first feature...
1,tt0000574,F Gwynplaine MacIntyre,10/10,this afternoon at the barbican i attend the uk...
2,tt0000574,ackstasis,9/10,movie be undoubtedly one of the cinema most si...
3,tt0000574,Ziggy5446,10/10,movie symbolizes both the birth of the austral...
4,tt0000574,Fella_shibby,8/10,this be the original n the first account of ne...
...,...,...,...,...
11743,tt0018621,JohnHowardReid,8/10,paramount groom superbeautiful actress a a rep...
11744,tt0018621,kidboots,8/10,actress seem to be in every other movie during...
11745,tt0018621,F Gwynplaine MacIntyre,7/10,actress be an attractive and talented actress ...
11746,tt0018638,cliffperriam,8/10,movie exists a a reel silent french mm release...


In [None]:
# the output file name carries the same batch number as the raw reviews file
lemmatized_reviews_path = f'./reviews/lemmatized_reviews/lemmatized_reviews_{number}.csv'
lemmatized_reviews.to_csv(lemmatized_reviews_path)