In [1]:
#import dependencies
import pandas as pd
import numpy as np
import json
import re

In [2]:
file_dir="Data/"

In [3]:
with open(f'{file_dir}wikipedia-movies.json', mode='r') as file:
        wiki_movies_raw=json.load(file)

In [4]:
len(wiki_movies_raw)

7311

In [5]:
kaggle_metadata=pd.read_csv(f'{file_dir}movies_metadata.csv', low_memory=False)
ratings=pd.read_csv(f'{file_dir}ratings.csv')

In [6]:
kaggle_metadata[kaggle_metadata["original_language"]=='en'].sample(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
45372,False,,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,42247,tt0478737,en,Treasure Raiders,"Michael, an American professor teaching histor...",...,2007-04-20,0.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Speed is part of the game...,Treasure Raiders,False,2.2,3.0
15598,False,,0,"[{'id': 12, 'name': 'Adventure'}, {'id': 18, '...",,83360,tt0032850,en,North West Mounted Police,"Texas Ranger Dusty Rivers (""Isn't that a contr...",...,1940-10-22,0.0,126.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,DeMille's first film in Technicolor! 10 Great ...,North West Mounted Police,False,6.1,4.0
21008,False,,0,"[{'id': 99, 'name': 'Documentary'}]",http://www.withgreatpower.biz/,86843,tt1091863,en,With Great Power: The Stan Lee Story,"At 89 years old, Stan Lee's name appears on mo...",...,2010-07-24,0.0,80.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Incredible Story Behind the REAL Supehero,With Great Power: The Stan Lee Story,False,5.3,6.0
5487,False,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 10749, '...",,44635,tt0245479,en,Just a Kiss,A group of thirtysomethings having problems wi...,...,2002-06-15,0.0,89.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Every kiss has a story.,Just a Kiss,False,0.0,0.0
17499,False,,0,[],http://www.reparandomovie.com,61724,tt1664825,en,Reparando,"On the morning of June 18. 1954, the US CIA dr...",...,2010-01-01,0.0,0.0,[],Released,,Reparando,False,7.0,2.0


In [7]:
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
len(wiki_movies_df)

7311

In [8]:
wiki_movies = [movie for movie in wiki_movies_raw
              if ('Director' in movie or 'Directed by' in movie) 
                and 'imdb_link' in movie]

In [9]:
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    alt_titles = {}
    # combine alternate titles into one list
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune-Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key] = movie[key]
            movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    # merge column names
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')

    return movie

In [10]:
clean_movies = [clean_movie(movie) for movie in wiki_movies]
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

['Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Camera setup',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'McCune–Reischauer',
 'Narrated by',
 'No. of episodes',
 'Opening theme',
 'Original language(s)',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [11]:
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')

In [12]:
[[column,wiki_movies_df[column].isna().sum()] for column in wiki_movies_df]

[['url', 0],
 ['year', 0],
 ['imdb_link', 0],
 ['title', 1],
 ['Based on', 4889],
 ['Starring', 185],
 ['Narrated by', 6797],
 ['Cinematography', 700],
 ['Release date', 32],
 ['Running time', 140],
 ['Country', 239],
 ['Language', 248],
 ['Budget', 2315],
 ['Box office', 1563],
 ['Director', 0],
 ['Distributor', 362],
 ['Editor(s)', 551],
 ['Composer(s)', 520],
 ['Producer(s)', 203],
 ['Production company(s)', 1688],
 ['Writer(s)', 201],
 ['Genre', 6966],
 ['Original language(s)', 6918],
 ['Original network', 6951],
 ['Executive producer(s)', 6980],
 ['Production location(s)', 7030],
 ['Picture format', 7012],
 ['Audio format', 7016],
 ['Voices of', 7078],
 ['Followed by', 7071],
 ['Created by', 7069],
 ['Opening theme', 7079],
 ['No. of episodes', 7076],
 ['alt_titles', 7058],
 ['Preceded by', 7070],
 ['Suggested by', 7079],
 ['Recorded', 7078],
 ['Venue', 7079],
 ['Label', 7078],
 ['Animation by', 7078],
 ['Color process', 7079],
 ['Camera setup', 7079],
 ['McCune–Reischauer', 7078]

In [13]:
wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [14]:
wiki_movies_df.dtypes

url                      object
year                      int64
imdb_link                object
title                    object
Based on                 object
Starring                 object
Cinematography           object
Release date             object
Running time             object
Country                  object
Language                 object
Budget                   object
Box office               object
Director                 object
Distributor              object
Editor(s)                object
Composer(s)              object
Producer(s)              object
Production company(s)    object
Writer(s)                object
imdb_id                  object
dtype: object

In [15]:
box_office = wiki_movies_df['Box office'].dropna()

In [16]:
box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)
box_office

0          $21.4 million
1           $2.7 million
2            $57,718,089
3             $7,331,647
4       $6,939,946 (USA)
              ...       
7074       $19.4 million
7075       $41.9 million
7076       $76.1 million
7077       $38.4 million
7078        $5.5 million
Name: Box office, Length: 5517, dtype: object

In [17]:
form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'

box_office.str.contains(form_one, flags=re.IGNORECASE, na=False).sum()

3920

In [18]:
box_office.str.contains(form_two, flags=re.IGNORECASE, na=False).sum()

1574

In [19]:
box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [20]:
matches_form_one=box_office.str.contains(form_one, flags=re.IGNORECASE, na=False)
matches_form_two=box_office.str.contains(form_two, flags=re.IGNORECASE, na=False)
box_office[~matches_form_one & ~matches_form_two]

602                      $5000 (US)
1072                     35,254,617
1483                     £3 million
1868                   ¥1.1 billion
2036                            N/A
2095                           $309
2669    926,423 admissions (France)
3635                            TBA
3883       CN¥3.650 million (China)
4120                     £7,385,434
4310                            $30
4565        $45.2k (only in Turkey)
5451                          £2.56
5788                       413 733$
6017                        Unknown
6373                          $111k
6374                           $588
6597                 less than $372
6847                        8 crore
Name: Box office, dtype: object

In [21]:
box_office.str.extract(f'({form_one}|{form_two})')

Unnamed: 0,0
0,$21.4 million
1,$2.7 million
2,"$57,718,089"
3,"$7,331,647"
4,"$6,939,946"
...,...
7074,$19.4 million
7075,$41.9 million
7076,$76.1 million
7077,$38.4 million


In [22]:
def parse_dollars(s):

    # if s is not a string, return NaN
    if type(s) != str:
        return np.nan
    
    # if input is of the form $###.# million
    if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):
    
        # remove dollar sign and " million"
        s = re.sub('\$|\s*|[a-zA-Z]','',s)
        
        # convert to float and multiply by a million
        value= float(s)*10**6
        
        # return value
        return value
        
    # if input is of the form $###.# billion
    elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):
    
        # remove dollar sign and " billion"
        s = re.sub(r'\$|\s*|[a-zA-Z]','',s)
        
        # convert to float and multiply by a billion
        value=float(s)*10**9
        
        # return value
        return value
        
    # if input is of the form $###,###,###
    elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)', s, flags=re.IGNORECASE):
        
        # remove dollar sign and commas
        s = re.sub('\$|,','', s)
        
        # convert to float
        value=float(s)
        
        # return value
        return value
        
    # otherwise, return NaN
    else:
        return np.nan
    

In [23]:
wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
wiki_movies_df['box_office']

0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
7075    41900000.0
7076    76100000.0
7077    38400000.0
7078     5500000.0
7079           NaN
Name: box_office, Length: 7080, dtype: float64

In [24]:
wiki_movies_df.drop('Box office', axis=1, inplace=True)

In [25]:
budget=wiki_movies_df['Budget'].dropna()
budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)

In [26]:
budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [27]:
matches_form_one=budget.str.contains(form_one, flags=re.IGNORECASE, na=False)
matches_form_two=budget.str.contains(form_two, flags=re.IGNORECASE, na=False)
budget[~matches_form_one & ~matches_form_two]

137                         Unknown
205     60 million Norwegian Kroner
479                         Unknown
622     60 million Norwegian Kroner
975             $34 [3] [4] million
1129               $120 [4] million
1229                        Unknown
1281                            HBO
1377                     £6,000,000
1400                     13 million
1483                   £2.8 million
1737                   CAD2,000,000
1916     PHP 85 million (estimated)
1951                    102,888,900
1956                   3,500,000 DM
1976                     ₤2,300,874
2285                     $14 milion
2455                     ₤6,350,000
3148                   € 40 million
3364               $150 [6] million
3422                        $218.32
3806                   £4.2 million
3910                            N/A
3963                    760,000 USD
4474                       19 crore
4645                    £17 million
5038              $$200 [4] million
5059           $155 [2] [3] 

In [28]:
budget = budget.str.replace(r'\[\d+\]\s*', '')
budget[~matches_form_one & ~matches_form_two]

  """Entry point for launching an IPython kernel.


137                         Unknown
205     60 million Norwegian Kroner
479                         Unknown
622     60 million Norwegian Kroner
975                     $34 million
1129                   $120 million
1229                        Unknown
1281                            HBO
1377                     £6,000,000
1400                     13 million
1483                   £2.8 million
1737                   CAD2,000,000
1916     PHP 85 million (estimated)
1951                    102,888,900
1956                   3,500,000 DM
1976                     ₤2,300,874
2285                     $14 milion
2455                     ₤6,350,000
3148                   € 40 million
3364                   $150 million
3422                        $218.32
3806                   £4.2 million
3910                            N/A
3963                    760,000 USD
4474                       19 crore
4645                    £17 million
5038                  $$200 million
5059                   $155 

In [29]:
wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)


In [30]:
wiki_movies_df.drop('Budget', axis=1, inplace=True)

In [31]:
wiki_movies_df.columns

Index(['url', 'year', 'imdb_link', 'title', 'Based on', 'Starring',
       'Cinematography', 'Release date', 'Running time', 'Country', 'Language',
       'Director', 'Distributor', 'Editor(s)', 'Composer(s)', 'Producer(s)',
       'Production company(s)', 'Writer(s)', 'imdb_id', 'box_office',
       'budget'],
      dtype='object')

In [32]:
release_date=wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x)==list else x)
release_date.sample(30)

1602                                      August 14, 1996
5254                         June 24, 2011 ( 2011-06-24 )
3100    January 16, 2002 ( 2002-01-16 ) (Sundance Film...
6889    March 11, 2018 ( 2018-03-11 ) ( SXSW ) [2] Mar...
2757                        April 28, 2000 ( 2000-04-28 )
4741                       October 3, 2008 ( 2008-10-03 )
6741    May 5, 2017 ( 2017-05-05 ) (United Kingdom) Se...
3144                      October 25, 2002 ( 2002-10-25 )
612     2 August 1991 ( 1991-08-02 ) (United Kingdom) ...
2099                      December 5, 1997 ( 1997-12-05 )
2823                          July 4, 2001 ( 2001-07-04 )
3472                      November 8, 2004 ( 2004-11-08 )
5860    September 7, 2012 ( 2012-09-07 ) ( TIFF ) July...
1765        November 1, 1996 ( 1996-11-01 ) (US; limited)
5080                         July 30, 2010 ( 2010-07-30 )
3108    November 2001 ( 2001-11 ) (Spain) April 26, 20...
913                          March 4, 1994 ( 1994-03-04 )
6699    June 1

In [33]:
date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
date_form_two = r'\d{4}.[01]\d.[0123]\d'
date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
date_form_four = r'\d{4}'

In [34]:
release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)

Unnamed: 0,0
0,"July 11, 1990"
1,"May 17, 1990"
2,"August 10, 1990"
3,"December 25, 1990"
4,"December 19, 1990"
...,...
7075,"December 25, 2018"
7076,"December 11, 2018"
7077,"November 8, 2018"
7078,"August 31, 2018"


In [35]:
wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], infer_datetime_format=True)

In [36]:
running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

In [37]:
running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')
running_time_extract

Unnamed: 0,0,1,2
0,,,102
1,,,114
2,,,113
3,,,106
4,,,95
...,...,...,...
7075,,,90
7076,,,132
7077,,,120
7078,,,123


In [38]:
running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

In [39]:
wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

In [40]:
wiki_movies_df.drop('Running time', axis=1, inplace=True)

In [50]:
kaggle_metadata[~kaggle_metadata['adult'].isin(['True','False'])]

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,/ff9qCepilowshEtG2GYWwzt2bs4.jpg,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,/zV8bHuSL6WXoD6FWogP9j4x80bL.jpg,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [51]:
kaggle_metadata = kaggle_metadata[kaggle_metadata['adult'] == 'False'].drop('adult',axis='columns')

In [52]:
kaggle_metadata['video'].value_counts()

False    45358
True        93
Name: video, dtype: int64

In [53]:
kaggle_metadata['video'] = kaggle_metadata['video'] == 'True'

In [54]:
kaggle_metadata['video'].value_counts()

False    45454
Name: video, dtype: int64

In [55]:
kaggle_metadata['budget'] = kaggle_metadata['budget'].astype(int)
kaggle_metadata['id'] = pd.to_numeric(kaggle_metadata['id'], errors='raise')
kaggle_metadata['popularity'] = pd.to_numeric(kaggle_metadata['popularity'], errors='raise')

In [56]:
kaggle_metadata.dtypes

belongs_to_collection     object
budget                     int64
genres                    object
homepage                  object
id                         int64
imdb_id                   object
original_language         object
original_title            object
overview                  object
popularity               float64
poster_path               object
production_companies      object
production_countries      object
release_date              object
revenue                  float64
runtime                  float64
spoken_languages          object
status                    object
tagline                   object
title                     object
video                       bool
vote_average             float64
vote_count               float64
dtype: object

In [57]:
kaggle_metadata['release_date'] = pd.to_datetime(kaggle_metadata['release_date'])