In [2]:
# Dependencies
import json
import pandas as pd
import numpy as np

In [3]:
# Location of the Wikipedia movies JSON, relative to the notebook.
# file_dir keeps its trailing slash because other cells splice it directly
# in front of a relative path.
file_dir = "./"
filename = "Data/wikipedia-movies.json"
filepath = file_dir + filename

'./wikipedia-movies.json'

In [4]:
# Load raw JSON file
# Decode explicitly as UTF-8: without an encoding argument open() falls back
# to the platform's locale encoding, which can fail on (or garble) the
# non-ASCII titles present in this data set.
with open(filepath, mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [47]:
# Some checks
# Spot-check the raw records: the total count, then samples from the start,
# the end, and the middle of the list.
# NOTE(review): a notebook cell renders only its final bare expression, so the
# first three lines are evaluated but never shown; only the [3500:3505] slice
# appears in the output.
len(wiki_movies_raw)
wiki_movies_raw[:5]
wiki_movies_raw[-5:]
wiki_movies_raw[3500:3505]

[{'url': 'https://en.wikipedia.org/wiki/Milwaukee,_Minnesota',
  'year': 2003,
  'imdb_link': 'https://www.imdb.com/title/tt0285727/',
  'title': 'Milwaukee, Minnesota',
  'Directed by': 'Allan Mindel',
  'Produced by': ['Michael J. Brody', 'Jeff Kirshbaum'],
  'Written by': 'R.D. Murphy',
  'Starring': ['Troy Garity', 'Alison Folland', 'Bruce Dern', 'Randy Quaid'],
  'Music by': ['Michael Convertino', 'Bobby Muzingo'],
  'Cinematography': 'Bernd Heinl',
  'Edited by': 'David Rawlins',
  'Release date': ['January 24, 2003',
   '(',
   '2003-01-24',
   ')',
   '(',
   'Slamdance Film Festival',
   ')'],
  'Running time': '95 minutes',
  'Country': 'United States',
  'Language': 'English'},
 {'url': 'https://en.wikipedia.org/wiki/The_Missing_(2003_film)',
  'year': 2003,
  'imdb_link': 'https://www.imdb.com/title/tt0338188/',
  'title': 'The Missing',
  'Directed by': 'Ron Howard',
  'Produced by': ['Brian Grazer', 'Ron Howard', 'Daniel Ostroff'],
  'Written by': 'Ken Kaufman',
  'Based 

In [26]:
# Kaggle movie metadata. low_memory=False makes pandas parse the file in a
# single pass rather than in chunks, which avoids mixed-type column inference.
kaggle_path = f'{file_dir}Data/movies_metadata.csv'
kaggle_metadata = pd.read_csv(kaggle_path, low_memory=False)
# Column names as a plain list
list(kaggle_metadata.columns)


['adult',
 'belongs_to_collection',
 'budget',
 'genres',
 'homepage',
 'id',
 'imdb_id',
 'original_language',
 'original_title',
 'overview',
 'popularity',
 'poster_path',
 'production_companies',
 'production_countries',
 'release_date',
 'revenue',
 'runtime',
 'spoken_languages',
 'status',
 'tagline',
 'title',
 'video',
 'vote_average',
 'vote_count']

In [96]:
# Ratings table; summarise its columns, dtypes and memory footprint.
ratings_path = f'{file_dir}Data/ratings.csv'
ratings = pd.read_csv(ratings_path)
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
 #   Column     Dtype  
---  ------     -----  
 0   userId     int64  
 1   movieId    int64  
 2   rating     float64
 3   timestamp  int64  
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [51]:
# Step 1.  Module 8.3.3
# Keep only the records that:
#   * name a director under either label ('Director' or 'Directed by'),
#   * carry an IMDb link, and
#   * have no 'No. of episodes' field (presumably TV series rather than films).
wiki_movies = [
    movie for movie in wiki_movies_raw
    if ('Director' in movie or 'Directed by' in movie)
    and 'imdb_link' in movie
    and 'No. of episodes' not in movie
]
wiki_movies_df = pd.DataFrame(wiki_movies)
# Number of distinct columns before any clean-up. This is the cell's only
# displayed value, so nothing else is evaluated after building the frame.
len(wiki_movies_df.columns)

75

In [52]:
# Alphabetical list of every column in the filtered Wikipedia frame
sorted(wiki_movies_df.columns)

['Adaptation by',
 'Also known as',
 'Animation by',
 'Arabic',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cantonese',
 'Chinese',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'French',
 'Genre',
 'Hangul',
 'Hebrew',
 'Hepburn',
 'Japanese',
 'Label',
 'Language',
 'Length',
 'Literally',
 'Mandarin',
 'McCune–Reischauer',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Original title',
 'Picture format',
 'Polish',
 'Preceded by',
 'Produced by',
 'Producer',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Recorded',
 'Release date',
 'Released',
 'Revised Romanization',
 'Romanized',
 'Running time',
 'Russian',
 'Screen story by',
 'Screenplay by',
 'Simplifie

In [86]:
# Non-null entries of the 'Revised Romanization' column
wiki_movies_df['Revised Romanization'].dropna()

3413    Wondeopul deijeu
4238         Pai seutori
Name: Revised Romanization, dtype: object

In [53]:
# Show the first row of the Wikipedia frame to inspect its column layout.
wiki_movies_df.head(1)

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,


In [90]:
# Function to clean each movie
def clean_movie(movie):
    """Return a copy of ``movie`` with its alternate-title fields grouped together.

    Every key listed in ``title_keys`` that is present in the record is removed
    from the top level and collected, with its value, into a single nested dict
    stored under ``'alt_titles'``. Records without any of those keys come back
    as a plain copy with no ``'alt_titles'`` entry.

    Parameters
    ----------
    movie : dict
        One movie record. It is copied (shallowly) and never modified.

    Returns
    -------
    dict
        The copied record with alternate-title keys folded into ``'alt_titles'``.
    """
    # Keys that hold an alternate or foreign-language title.
    title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
        'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
        'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
        'Revised Romanization', 'Romanized', 'Russian', 'Simplified',
        'Traditional', 'Yiddish',
    )

    cleaned = dict(movie)  # work on a copy so the caller's record is untouched

    # pop() both removes the key from the record and yields the value to keep.
    alt_titles = {name: cleaned.pop(name) for name in title_keys if name in cleaned}

    if alt_titles:
        cleaned['alt_titles'] = alt_titles

    return cleaned

In [54]:
# Rows that have a value in the 'Arabic' column
wiki_movies_df.dropna(subset=['Arabic'])

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
6834,https://en.wikipedia.org/wiki/The_Insult_(film),2018,https://www.imdb.com/title/tt7048622/,The Insult,Ziad Doueiri,"[Rachid Bouchareb, Jean Bréhat, Julie Gayet, A...",,,,"[Adel Karam, Kamel El Basha]",...,,Case No. 23,,,,قضية رقم ٢٣,Qadiyya raqm 23,,,
7058,https://en.wikipedia.org/wiki/Capernaum_(film),2018,https://www.imdb.com/title/tt8267604/,Capernaum,Nadine Labaki,"[Michel Merkt, Khaled Mouzanar]","[Nadine Labaki, Jihad Hojaily, Michelle Keserw...","[Georges Khabbaz, Nadine Labaki, Michelle Kese...",,"[Zain Al Rafeea, Yordanos Shiferaw, Boluwatife...",...,,,,,,کفرناحوم‎,,,,


In [95]:
# Run every filtered record through clean_movie and rebuild the frame from the
# cleaned dicts. This rebinds wiki_movies_df, so the individual alternate-title
# columns are no longer present in it from this point on.
clean_movies = list(map(clean_movie, wiki_movies))
wiki_movies_df = pd.DataFrame(clean_movies)
# Alphabetical list of the remaining columns
sorted(wiki_movies_df.columns)

['Adaptation by',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Director',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'Length',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Picture format',
 'Preceded by',
 'Produced by',
 'Producer',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Recorded',
 'Release date',
 'Released',
 'Running time',
 'Screen story by',
 'Screenplay by',
 'Starring',
 'Story by',
 'Suggested by',
 'Theme music composer',
 'Venue',
 'Voices of',
 'Written by',
 'imdb_link',
 'title',
 'url',
 'year']