In [1]:
#import dependencies
import pandas as pd
import numpy as np
import json
import re

In [2]:
file_dir="Data/"

In [3]:
with open(f'{file_dir}wikipedia-movies.json', mode='r') as file:
        wiki_movies_raw=json.load(file)

In [4]:
len(wiki_movies_raw)

7311

In [5]:
kaggle_metadata=pd.read_csv(f'{file_dir}movies_metadata.csv', low_memory=False)
ratings=pd.read_csv(f'{file_dir}ratings.csv')

In [6]:
kaggle_metadata[kaggle_metadata["original_language"]=='en'].sample(n=5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
26070,False,,0,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,38020,tt0078089,en,Planet of Dinosaurs,A space-ship gets lost and is forced to make a...,...,1977-11-18,0.0,84.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Trapped On A Lost World of Prehistoric Monsters,Planet of Dinosaurs,False,4.4,8.0
17435,False,"{'id': 118474, 'name': 'Truth in 24 Collection...",0,"[{'id': 99, 'name': 'Documentary'}]",http://www.truthin24.com/,16900,tt1320302,en,Truth In 24,"It's 24 hours of pure exhilaration, complete e...",...,2008-11-06,0.0,95.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Truth In 24,False,6.6,8.0
12915,False,,0,"[{'id': 99, 'name': 'Documentary'}]",,22319,tt1149405,en,Trouble the Water,"""Trouble the Water"" takes you inside Hurricane...",...,2008-01-20,0.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,It's not about a hurricane. It's about America.,Trouble the Water,False,6.4,7.0
14322,False,,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,30506,tt0037193,en,The Princess and the Pirate,Princess Margaret is travelling incognito to e...,...,1944-11-10,0.0,94.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Princess and the Pirate,False,6.5,4.0
1327,False,"{'id': 151, 'name': 'Star Trek: The Original S...",24000000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,168,tt0092007,en,Star Trek IV: The Voyage Home,Fugitives of the Federation for their daring r...,...,1986-11-25,133000000.0,119.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The key to saving the future can only be found...,Star Trek IV: The Voyage Home,False,6.9,490.0


In [8]:
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
len(wiki_movies_df)

7311

In [9]:
wiki_movies = [movie for movie in wiki_movies_raw
              if ('Director' in movie or 'Directed by' in movie) 
                and 'imdb_link' in movie]

In [10]:
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    alt_titles = {}
    # combine alternate titles into one list
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune-Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key] = movie[key]
            movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    # merge column names
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')

    return movie

In [11]:
clean_movies = [clean_movie(movie) for movie in wiki_movies]
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

['Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Camera setup',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'McCune–Reischauer',
 'Narrated by',
 'No. of episodes',
 'Opening theme',
 'Original language(s)',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [12]:
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')

In [13]:
[[column,wiki_movies_df[column].isna().sum()] for column in wiki_movies_df]

[['url', 0],
 ['year', 0],
 ['imdb_link', 0],
 ['title', 1],
 ['Based on', 4889],
 ['Starring', 185],
 ['Narrated by', 6797],
 ['Cinematography', 700],
 ['Release date', 32],
 ['Running time', 140],
 ['Country', 239],
 ['Language', 248],
 ['Budget', 2315],
 ['Box office', 1563],
 ['Director', 0],
 ['Distributor', 362],
 ['Editor(s)', 551],
 ['Composer(s)', 520],
 ['Producer(s)', 203],
 ['Production company(s)', 1688],
 ['Writer(s)', 201],
 ['Genre', 6966],
 ['Original language(s)', 6918],
 ['Original network', 6951],
 ['Executive producer(s)', 6980],
 ['Production location(s)', 7030],
 ['Picture format', 7012],
 ['Audio format', 7016],
 ['Voices of', 7078],
 ['Followed by', 7071],
 ['Created by', 7069],
 ['Opening theme', 7079],
 ['No. of episodes', 7076],
 ['alt_titles', 7058],
 ['Preceded by', 7070],
 ['Suggested by', 7079],
 ['Recorded', 7078],
 ['Venue', 7079],
 ['Label', 7078],
 ['Animation by', 7078],
 ['Color process', 7079],
 ['Camera setup', 7079],
 ['McCune–Reischauer', 7078]

In [14]:
wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [15]:
wiki_movies_df.dtypes

url                      object
year                      int64
imdb_link                object
title                    object
Based on                 object
Starring                 object
Cinematography           object
Release date             object
Running time             object
Country                  object
Language                 object
Budget                   object
Box office               object
Director                 object
Distributor              object
Editor(s)                object
Composer(s)              object
Producer(s)              object
Production company(s)    object
Writer(s)                object
imdb_id                  object
dtype: object

In [16]:
box_office = wiki_movies_df['Box office'].dropna()

In [17]:
box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)
box_office

0          $21.4 million
1           $2.7 million
2            $57,718,089
3             $7,331,647
4       $6,939,946 (USA)
              ...       
7074       $19.4 million
7075       $41.9 million
7076       $76.1 million
7077       $38.4 million
7078        $5.5 million
Name: Box office, Length: 5517, dtype: object

In [18]:
form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'
form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)'

box_office.str.contains(form_one, flags=re.IGNORECASE, na=False).sum()

3920

In [19]:
box_office.str.contains(form_two, flags=re.IGNORECASE, na=False).sum()

1574

In [20]:
box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [21]:
matches_form_one=box_office.str.contains(form_one, flags=re.IGNORECASE, na=False)
matches_form_two=box_office.str.contains(form_two, flags=re.IGNORECASE, na=False)
box_office[~matches_form_one & ~matches_form_two]

602                      $5000 (US)
1072                     35,254,617
1483                     £3 million
1868                   ¥1.1 billion
2036                            N/A
2095                           $309
2669    926,423 admissions (France)
3635                            TBA
3883       CN¥3.650 million (China)
4120                     £7,385,434
4310                            $30
4565        $45.2k (only in Turkey)
5451                          £2.56
5788                       413 733$
6017                        Unknown
6373                          $111k
6374                           $588
6597                 less than $372
6847                        8 crore
Name: Box office, dtype: object

In [22]:
box_office.str.extract(f'({form_one}|{form_two})')

Unnamed: 0,0
0,$21.4 million
1,$2.7 million
2,"$57,718,089"
3,"$7,331,647"
4,"$6,939,946"
...,...
7074,$19.4 million
7075,$41.9 million
7076,$76.1 million
7077,$38.4 million


In [23]:
def parse_dollars(s):

    # if s is not a string, return NaN
    if type(s) != str:
        return np.nan
    
    # if input is of the form $###.# million
    if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):
    
        # remove dollar sign and " million"
        s = re.sub('\$|\s*|[a-zA-Z]','',s)
        
        # convert to float and multiply by a million
        value= float(s)*10**6
        
        # return value
        return value
        
    # if input is of the form $###.# billion
    elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):
    
        # remove dollar sign and " billion"
        s = re.sub(r'\$|\s*|[a-zA-Z]','',s)
        
        # convert to float and multiply by a billion
        value=float(s)*10**9
        
        # return value
        return value
        
    # if input is of the form $###,###,###
    elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illi?on)', s, flags=re.IGNORECASE):
        
        # remove dollar sign and commas
        s = re.sub('\$|,','', s)
        
        # convert to float
        value=float(s)
        
        # return value
        return value
        
    # otherwise, return NaN
    else:
        return np.nan
    

In [24]:
wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)
wiki_movies_df['box_office']

0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
7075    41900000.0
7076    76100000.0
7077    38400000.0
7078     5500000.0
7079           NaN
Name: box_office, Length: 7080, dtype: float64

In [25]:
wiki_movies_df.drop('Box office', axis=1, inplace=True)

In [27]:
budget=wiki_movies_df['Budget'].dropna()
budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)

4294               $65 million
3634               $23 million
456                $14 million
3179                  $500,000
566                $40 million
2952               $21 million
4376               $25 million
3483               $13 million
309                $18 million
3846               $12 million
2476               $30 million
6648              $110 million
2663               $44 million
3685               $20 million
1493                  $125,000
3430               $80 million
3426              $9.5 million
4274               $16 million
657                $35 million
2036    $4,000,000 (estimated)
5408               $25 million
2421                $7 million
5369               $15 million
6918             $10.5 million
1134               $34 million
5050           $40–$45 million
1660               $42 million
5972             $13.5 million
755                $40 million
6742               $35 million
Name: Budget, dtype: object

In [31]:
budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [38]:
matches_form_one=budget.str.contains(form_one, flags=re.IGNORECASE, na=False)
matches_form_two=budget.str.contains(form_two, flags=re.IGNORECASE, na=False)
budget[~matches_form_one & ~matches_form_two]

  


137                         Unknown
205     60 million Norwegian Kroner
479                         Unknown
622     60 million Norwegian Kroner
975             $34 [3] [4] million
1129               $120 [4] million
1229                        Unknown
1281                            HBO
1377                     £6,000,000
1400                     13 million
1483                   £2.8 million
1737                   CAD2,000,000
1916     PHP 85 million (estimated)
1951                    102,888,900
1956                   3,500,000 DM
1976                     ₤2,300,874
2285                     $14 milion
2455                     ₤6,350,000
3148                   € 40 million
3364               $150 [6] million
3422                        $218.32
3806                   £4.2 million
3910                            N/A
3963                    760,000 USD
4474                       19 crore
4645                    £17 million
5038              $$200 [4] million
5059           $155 [2] [3] 

In [39]:
budget = budget.str.replace(r'\[\d+\]\s*', '')
budget[~matches_form_one & ~matches_form_two]

  """Entry point for launching an IPython kernel.


137                         Unknown
205     60 million Norwegian Kroner
479                         Unknown
622     60 million Norwegian Kroner
975                     $34 million
1129                   $120 million
1229                        Unknown
1281                            HBO
1377                     £6,000,000
1400                     13 million
1483                   £2.8 million
1737                   CAD2,000,000
1916     PHP 85 million (estimated)
1951                    102,888,900
1956                   3,500,000 DM
1976                     ₤2,300,874
2285                     $14 milion
2455                     ₤6,350,000
3148                   € 40 million
3364                   $150 million
3422                        $218.32
3806                   £4.2 million
3910                            N/A
3963                    760,000 USD
4474                       19 crore
4645                    £17 million
5038                  $$200 million
5059                   $155 

In [41]:
wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)


In [50]:
wiki_movies_df.drop('Budget', axis=1, inplace=True)

KeyError: "['Budget'] not found in axis"

In [51]:
wiki_movies_df.columns

Index(['url', 'year', 'imdb_link', 'title', 'Based on', 'Starring',
       'Cinematography', 'Release date', 'Running time', 'Country', 'Language',
       'Director', 'Distributor', 'Editor(s)', 'Composer(s)', 'Producer(s)',
       'Production company(s)', 'Writer(s)', 'imdb_id', 'box_office',
       'budget'],
      dtype='object')

In [54]:
release_date=wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x)==list else x)
release_date.sample(30)

1721                     September 3, 1996 ( 1996-09-03 )
5911    April 14, 2013 ( 2013-04-14 ) ( Grand Rex ) Ma...
6667    September 11, 2016 ( 2016-09-11 ) ( TIFF ) Apr...
5162    September 14, 2009 ( 2009-09-14 ) ( TIFF ) May...
3221                         March 1, 2002 ( 2002-03-01 )
1162                          June 2, 1995 ( 1995-06-02 )
4905    September 4, 2008 ( 2008-09-04 ) ( Venice ) Ju...
3489                     November 24, 2004 ( 2004-11-24 )
1462                                    December 15, 1996
4166    3 December 2006 ( 2006-12-03 ) (London) 5 Janu...
6980    May 14, 2018 ( 2018-05-14 ) ( Cannes ) August ...
538                           July 1, 1992 ( 1992-07-01 )
2418                     December 10, 1999 ( 1999-12-10 )
4350    July 25, 2007 ( 2007-07-25 ) ( ArcLight Hollyw...
380                        August 16, 1991 ( 1991-08-16 )
4984    March 11, 2009 ( 2009-03-11 ) (Egypt) March 13...
5838    June 27, 2013 ( 2013-06-27 ) ( VOD ) August 2,...
4179          

In [55]:
date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
date_form_two = r'\d{4}.[01]\d.[0123]\d'
date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
date_form_four = r'\d{4}'

In [56]:
release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)

Unnamed: 0,0
0,"July 11, 1990"
1,"May 17, 1990"
2,"August 10, 1990"
3,"December 25, 1990"
4,"December 19, 1990"
...,...
7075,"December 25, 2018"
7076,"December 11, 2018"
7077,"November 8, 2018"
7078,"August 31, 2018"


In [57]:
wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], infer_datetime_format=True)

In [58]:
wiki_movies_df['release_date']

0      1990-07-11
1      1990-05-17
2      1990-08-10
3      1990-12-25
4      1990-12-19
          ...    
7075   2018-12-25
7076   2018-12-11
7077   2018-11-08
7078   2018-08-31
7079   2018-12-01
Name: release_date, Length: 7080, dtype: datetime64[ns]

In [65]:
running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')
running_time_extract

Unnamed: 0,0,1,2
0,,,102
1,,,114
2,,,113
3,,,106
4,,,95
...,...,...,...
7075,,,90
7076,,,132
7077,,,120
7078,,,123


In [63]:
running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE, na=False).sum()

6923

In [66]:
running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

In [67]:
wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

In [68]:
wiki_movies_df.drop('Running time', axis=1, inplace=True)