In [61]:
import os
import sys
import pandas as pd
import re
import ast
import contractions
import nltk
from nltk.corpus import names
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import numpy as np
from datetime import datetime

nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('names')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\MULTIVISION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MULTIVISION\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\MULTIVISION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\MULTIVISION\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package names to
[nltk_data]     C:\Users\MULTIVISION\AppData\Roaming\nltk_data...
[nltk_data]   Package names is already up-to-date!


True

# Data Collection

Source: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset/data

In [27]:
# Get data directories
current_dir = os.getcwd()
parent_dir = os.path.dirname(current_dir)
sys.path.append(parent_dir)

movies_metadata_path = '/data/raw/movies_metadata.csv'
keywords_path = '/data/raw/keywords.csv'

In [28]:
# Read data
try:
    df = pd.read_csv(parent_dir + movies_metadata_path, engine='python', on_bad_lines='skip')
    keywords_df = pd.read_csv(parent_dir + keywords_path, engine='python', on_bad_lines='skip')
except pd.errors.ParserError as e:
    print(f"ParserError: {e}")

print('MOVIES_METADATA')
print(df.iloc[0])
print('\nKEYWORDS')
print(keywords_df.iloc[0])

MOVIES_METADATA
adult                                                                False
belongs_to_collection    {'id': 10194, 'name': 'Toy Story Collection', ...
budget                                                            30000000
genres                   [{'id': 16, 'name': 'Animation'}, {'id': 35, '...
homepage                              http://toystory.disney.com/toy-story
id                                                                     862
imdb_id                                                          tt0114709
original_language                                                       en
original_title                                                   Toy Story
overview                 Led by Woody, Andy's toys live happily in his ...
popularity                                                       21.946943
poster_path                               /rhIRbceoE9lR4veEXuwCC2wARtG.jpg
production_companies        [{'name': 'Pixar Animation Studios', 'id': 3}]
productio

In [29]:
df.describe()

Unnamed: 0,revenue,runtime,vote_average,vote_count
count,45460.0,45203.0,45460.0,45460.0
mean,11209350.0,94.128199,5.618207,109.897338
std,64332250.0,38.40781,1.924216,491.310374
min,0.0,0.0,0.0,0.0
25%,0.0,85.0,5.0,3.0
50%,0.0,95.0,6.0,10.0
75%,0.0,107.0,6.8,34.0
max,2787965000.0,1256.0,10.0,14075.0


In [30]:
df = df[['id', 'title', 'overview', 'genres', 'popularity', 'release_date', 'tagline', 'vote_average']]
df.head()

Unnamed: 0,id,title,overview,genres,popularity,release_date,tagline,vote_average
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",21.946943,1995-10-30,,7.7
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",17.015539,1995-12-15,Roll the dice and unleash the excitement!,6.9
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",11.7129,1995-12-22,Still Yelling. Still Fighting. Still Ready for...,6.5
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.859495,1995-12-22,Friends are the people who let you be yourself...,6.1
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",8.387519,1995-02-10,Just When His World Is Back To Normal... He's ...,5.7


In [31]:
df = df.dropna(subset=['title'])
df = df.dropna(subset=['overview', 'tagline'], how='all')

df.isnull().sum()

id                  0
title               0
overview            8
genres              0
popularity          0
release_date       71
tagline         24102
vote_average        0
dtype: int64

In [32]:
df.dtypes

id               object
title            object
overview         object
genres           object
popularity       object
release_date     object
tagline          object
vote_average    float64
dtype: object

# Pre-Processing Numeric Data

In [33]:
# Convert id to Int64
df['id'] = df['id'].astype(pd.Int64Dtype())

# Convert popularity to Float64
df['popularity'] = df['popularity'].astype(pd.Float64Dtype())

# Extract release_year and convert to Int64
df['release_year'] = df['release_date'].apply(lambda cell: int(cell[0:4]) if pd.notna(cell) and len(str(cell)) == 10 else None)
df['release_year'] = df['release_year'].astype(pd.Float64Dtype())

# Drop release_date column
if 'release_date' in df.columns:
    df = df.drop(columns='release_date')

num_cols = ['popularity', 'release_year', 'vote_average']

# Impute none values with mean
for col in num_cols:
    df[col].fillna(df[col].mean(), inplace=True)

df[num_cols].isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mean(), inplace=True)


popularity      0
release_year    0
vote_average    0
dtype: int64

In [34]:
df[num_cols].describe()

Unnamed: 0,popularity,release_year,vote_average
count,44514.0,44514.0,44514.0
mean,2.971567,1991.798393,5.638813
std,6.057626,24.118876,1.896881
min,0.0,1874.0,0.0
25%,0.404121,1978.0,5.0
50%,1.159858,2001.0,6.0
75%,3.801981,2010.0,6.8
max,547.488298,2020.0,10.0


### Merge datasets

In [35]:
df = pd.merge(df, keywords_df, on='id', how='left')
df.head()

Unnamed: 0,id,title,overview,genres,popularity,tagline,vote_average,release_year,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",21.946943,,7.7,1995.0,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",17.015539,Roll the dice and unleash the excitement!,6.9,1995.0,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",11.7129,Still Yelling. Still Fighting. Still Ready for...,6.5,1995.0,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",3.859495,Friends are the people who let you be yourself...,6.1,1995.0,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,"[{'id': 35, 'name': 'Comedy'}]",8.387519,Just When His World Is Back To Normal... He's ...,5.7,1995.0,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


# Text Wrangling

In [36]:
# Function to extract names
def extract_names(cell):
    if isinstance(cell, str):
        # Convert string representation of list to actual list
        cell = ast.literal_eval(cell)
    if isinstance(cell, list):
        return [item['name'] for item in cell if isinstance(item, dict) and 'name' in item]
    return []

# Extract 'name' in json
df['genres'] = df['genres'].apply(extract_names)
df['keywords'] = df['keywords'].apply(extract_names)
df.head()

Unnamed: 0,id,title,overview,genres,popularity,tagline,vote_average,release_year,keywords
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ...","[Animation, Comedy, Family]",21.946943,,7.7,1995.0,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...,"[Adventure, Fantasy, Family]",17.015539,Roll the dice and unleash the excitement!,6.9,1995.0,"[board game, disappearance, based on children'..."
2,15602,Grumpier Old Men,A family wedding reignites the ancient feud be...,"[Romance, Comedy]",11.7129,Still Yelling. Still Fighting. Still Ready for...,6.5,1995.0,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...","[Comedy, Drama, Romance]",3.859495,Friends are the people who let you be yourself...,6.1,1995.0,"[based on novel, interracial relationship, sin..."
4,11862,Father of the Bride Part II,Just when George Banks has recovered from his ...,[Comedy],8.387519,Just When His World Is Back To Normal... He's ...,5.7,1995.0,"[baby, midlife crisis, confidence, aging, daug..."


In [37]:
# Convert None values into ' '
df['tagline'] = df['tagline'].apply(lambda x: '' if pd.isna(x) else ' ' + x)

# Concatenate overview and tagline
df['overview'] = df['overview'] + ' ' + df['tagline'] #.astype(str)
df = df.drop(columns='tagline')

# add title_wrangled
df['title_wrangled'] = df['title']

In [38]:
df.isnull().sum()

id                0
title             0
overview          9
genres            0
popularity        0
vote_average      0
release_year      0
keywords          0
title_wrangled    0
dtype: int64

In [39]:
cols_to_wrangle = ['overview', 'keywords', 'genres', 'title_wrangled']

### Fix contractions and possessive apostrophes

Make conversions
 ("ain't", 'are not'),
 ("aren't", 'are not'),
 ("can't", 'can not'),
 ("'cause", 'because'), ...


Example:

  id: 54, overview: "Sadie looks up to her older sister Georgia, a successful folk singer who's happily married with children, but **can't** break out of the bar-band circuit and hit the big time she desperately covets. **It's** in part due to her attraction to drugs and booze, and also to her own unwise choice in men. Finally, though, **Sadie's** Achilles heel is a rough, unlovely voice very different than her **sister's** crowd-pleasing singing."

In [40]:
df.dtypes

id                  Int64
title              object
overview           object
genres             object
popularity        Float64
vote_average      float64
release_year      Float64
keywords           object
title_wrangled     object
dtype: object

In [41]:
# Function definition (assuming you have contractions and re imported and defined)
def fix_contractions_and_possessives(cell):
    if isinstance(cell, str):
        if pd.notna(cell):
            try:
                # Expand contractions
                text = contractions.fix(cell)
            except Exception as e:
                print('Warning: Could not fix contractions', e, text)
                text = cell

            try:
                # Remove possessive apostrophes (e.g., "sister's" -> "sister")
                if re.search(r"(\b\w+)'s\b", text):
                    text = re.sub(r"(\b\w+)'s\b", r"\1", text)
            except Exception as e:
                print('Warning: Could not remove possessive apostrophes', e, text)
                text = cell
            return text

    elif isinstance(cell, list):
        list_of_words = cell
        list_of_words = [contractions.fix(text) for text in list_of_words]
        return [re.sub(r"(\b\w+)'s\b", r"\1", text) for text in list_of_words]

    else:
        return cell

for col in cols_to_wrangle:
  df[col] = df[col].apply(fix_contractions_and_possessives)

In [42]:
df['overview'].iloc[54]

'Set in modern times, Alex finds King Arthur sword Excalibur and must prove himself worthy of it. '

In [43]:
# Function to lower case words
def lower_case(cell):
    if isinstance(cell, str):
      return cell.lower()
    elif isinstance(cell, list):
      list_of_words = cell
      return [text.lower() for text in list_of_words]
    else:
      return cell

for col in cols_to_wrangle:
  df[col] = df[col].apply(lower_case)

In [44]:
df.head()

Unnamed: 0,id,title,overview,genres,popularity,vote_average,release_year,keywords,title_wrangled
0,862,Toy Story,"led by woody, andy toys live happily in his ro...","[animation, comedy, family]",21.946943,7.7,1995.0,"[jealousy, toy, boy, friendship, friends, riva...",toy story
1,8844,Jumanji,when siblings judy and peter discover an encha...,"[adventure, fantasy, family]",17.015539,6.9,1995.0,"[board game, disappearance, based on children ...",jumanji
2,15602,Grumpier Old Men,a family wedding reignites the ancient feud be...,"[romance, comedy]",11.7129,6.5,1995.0,"[fishing, best friend, duringcreditsstinger, o...",grumpier old men
3,31357,Waiting to Exhale,"cheated on, mistreated and stepped on, the wom...","[comedy, drama, romance]",3.859495,6.1,1995.0,"[based on novel, interracial relationship, sin...",waiting to exhale
4,11862,Father of the Bride Part II,just when george banks has recovered from his ...,[comedy],8.387519,5.7,1995.0,"[baby, midlife crisis, confidence, aging, daug...",father of the bride part ii


In [45]:
df['overview'].iloc[54]

'set in modern times, alex finds king arthur sword excalibur and must prove himself worthy of it. '

In [46]:
# Function to remove special characters
def remove_special_characters(cell, remove_digits=False):
    # Define the regex pattern to remove non-alphanumeric characters (preserve hyphen in certain cases)
    pattern = r'[^a-zA-Z0-9\s-]' if not remove_digits else r'[^a-zA-Z\s-]'

    if isinstance(cell, str):
      # Substitute the pattern with an empty string in the text
      text = re.sub(pattern, '', cell)

      # Handle specific exceptions like "crowd-pleasing" becoming "crowd pleasing"
      return re.sub(r'\b(\w+)-(\w+)\b', r'\1 \2', text)

    elif isinstance(cell, list):
      words_list = cell

      # Substitute the pattern with an empty string in the text
      text = [re.sub(pattern, '', cell) for cell in words_list]

      # Handle specific exceptions like "crowd-pleasing" becoming "crowd pleasing"
      return [re.sub(r'\b(\w+)-(\w+)\b', r'\1 \2', text) for text in words_list]

    else:
      return cell

for col in cols_to_wrangle:
  df[col] = df[col].apply(lambda cell: remove_special_characters(cell, remove_digits=True))

In [47]:
df.head()

Unnamed: 0,id,title,overview,genres,popularity,vote_average,release_year,keywords,title_wrangled
0,862,Toy Story,led by woody andy toys live happily in his roo...,"[animation, comedy, family]",21.946943,7.7,1995.0,"[jealousy, toy, boy, friendship, friends, riva...",toy story
1,8844,Jumanji,when siblings judy and peter discover an encha...,"[adventure, fantasy, family]",17.015539,6.9,1995.0,"[board game, disappearance, based on children ...",jumanji
2,15602,Grumpier Old Men,a family wedding reignites the ancient feud be...,"[romance, comedy]",11.7129,6.5,1995.0,"[fishing, best friend, duringcreditsstinger, o...",grumpier old men
3,31357,Waiting to Exhale,cheated on mistreated and stepped on the women...,"[comedy, drama, romance]",3.859495,6.1,1995.0,"[based on novel, interracial relationship, sin...",waiting to exhale
4,11862,Father of the Bride Part II,just when george banks has recovered from his ...,[comedy],8.387519,5.7,1995.0,"[baby, midlife crisis, confidence, aging, daug...",father of the bride part ii


In [48]:
df['overview'].iloc[54]

'set in modern times alex finds king arthur sword excalibur and must prove himself worthy of it '

### Tokenize

In [49]:
# Function to safely tokenize words
def safe_word_tokenize(text):
    if isinstance(text, str):
      return word_tokenize(text)
    else:
      return text

for col in cols_to_wrangle:
    df[col] = df[col].apply(safe_word_tokenize)

In [50]:
df.head()

Unnamed: 0,id,title,overview,genres,popularity,vote_average,release_year,keywords,title_wrangled
0,862,Toy Story,"[led, by, woody, andy, toys, live, happily, in...","[animation, comedy, family]",21.946943,7.7,1995.0,"[jealousy, toy, boy, friendship, friends, riva...","[toy, story]"
1,8844,Jumanji,"[when, siblings, judy, and, peter, discover, a...","[adventure, fantasy, family]",17.015539,6.9,1995.0,"[board game, disappearance, based on children ...",[jumanji]
2,15602,Grumpier Old Men,"[a, family, wedding, reignites, the, ancient, ...","[romance, comedy]",11.7129,6.5,1995.0,"[fishing, best friend, duringcreditsstinger, o...","[grumpier, old, men]"
3,31357,Waiting to Exhale,"[cheated, on, mistreated, and, stepped, on, th...","[comedy, drama, romance]",3.859495,6.1,1995.0,"[based on novel, interracial relationship, sin...","[waiting, to, exhale]"
4,11862,Father of the Bride Part II,"[just, when, george, banks, has, recovered, fr...",[comedy],8.387519,5.7,1995.0,"[baby, midlife crisis, confidence, aging, daug...","[father, of, the, bride, part, ii]"


### Stopword Removal
Removing words that are present in a list of unmeaningfull words like
'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', ... , or a person's name.

In [51]:
stopwords = nltk.corpus.stopwords.words('english')
print(stopwords[0:20])

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his']


In [52]:
# Get lists of male and female names
male_names = [name.lower() for name in names.words('male.txt')]
female_names = [name.lower() for name in names.words('female.txt')]

# add names to stopwords list
stopwords.extend(male_names)
stopwords.extend(female_names)

In [53]:
# Function to remove stopwords
def remove_stopwords(words, stopwords):
    try:
        if isinstance(words, list):
            words = [w.strip() for w in words if w.lower() not in stopwords]
        else:
            print("Warning: The input is not a list:", words)
    except Exception as e:
        print('Error:', e, 'Words:', words)
    return words

# Apply the function to each column in cols_to_wrangle
for col in cols_to_wrangle:
    df[col] = df[col].apply(lambda cell: remove_stopwords(cell, stopwords))



In [54]:
df.head()

Unnamed: 0,id,title,overview,genres,popularity,vote_average,release_year,keywords,title_wrangled
0,862,Toy Story,"[led, toys, live, happily, room, birthday, bri...","[animation, comedy, family]",21.946943,7.7,1995.0,"[jealousy, toy, boy, friendship, friends, riva...","[toy, story]"
1,8844,Jumanji,"[siblings, discover, enchanted, board, game, o...","[adventure, fantasy, family]",17.015539,6.9,1995.0,"[board game, disappearance, based on children ...",[jumanji]
2,15602,Grumpier Old Men,"[family, wedding, reignites, ancient, feud, ne...","[romance, comedy]",11.7129,6.5,1995.0,"[fishing, best friend, duringcreditsstinger, o...","[grumpier, old, men]"
3,31357,Waiting to Exhale,"[cheated, mistreated, stepped, women, holding,...","[comedy, drama, romance]",3.859495,6.1,1995.0,"[based on novel, interracial relationship, sin...","[waiting, exhale]"
4,11862,Father of the Bride Part II,"[banks, recovered, daughter, wedding, receives...",[comedy],8.387519,5.7,1995.0,"[baby, midlife crisis, confidence, aging, daug...","[father, part, ii]"


### Stemming

In [55]:
# Function for stemming words
def stem_words(word_list):
    ps = PorterStemmer()
    if isinstance(word_list, list):
        try:
            return [ps.stem(word) for word in word_list]
        except Exception as e:
            print('error: ', e, word_list)
            return word_list
    return word_list

for col in cols_to_wrangle:
    df[col] = df[col].apply(stem_words)

In [56]:
df.head()

Unnamed: 0,id,title,overview,genres,popularity,vote_average,release_year,keywords,title_wrangled
0,862,Toy Story,"[led, toy, live, happili, room, birthday, brin...","[anim, comedi, famili]",21.946943,7.7,1995.0,"[jealousi, toy, boy, friendship, friend, rival...","[toy, stori]"
1,8844,Jumanji,"[sibl, discov, enchant, board, game, open, doo...","[adventur, fantasi, famili]",17.015539,6.9,1995.0,"[board gam, disappear, based on children book,...",[jumanji]
2,15602,Grumpier Old Men,"[famili, wed, reignit, ancient, feud, next, do...","[romanc, comedi]",11.7129,6.5,1995.0,"[fish, best friend, duringcreditssting, old men]","[grumpier, old, men]"
3,31357,Waiting to Exhale,"[cheat, mistreat, step, women, hold, breath, w...","[comedi, drama, romanc]",3.859495,6.1,1995.0,"[based on novel, interracial relationship, sin...","[wait, exhal]"
4,11862,Father of the Bride Part II,"[bank, recov, daughter, wed, receiv, news, pre...",[comedi],8.387519,5.7,1995.0,"[babi, midlife crisi, confid, age, daughter, m...","[father, part, ii]"


In [57]:
print(df['overview'].iloc[54])

['set', 'modern', 'time', 'find', 'sword', 'excalibur', 'must', 'prove']


In [63]:
# Get today's date
today = datetime.today()

# Format the date as YYYY-MM-DD
formatted_date = today.strftime('%Y-%m-%d')

# Save cleaned movie data
df.to_csv(parent_dir + f'/data/processed/movies_cleaned_data_{formatted_date}.csv')