In [1]:
import time 
import json
import pandas as pd
import numpy as np

import re
import dateutil
from sqlalchemy import create_engine
import psycopg2

# from config import db_password

In [2]:
def parse_dollars(s):  
    s = str(s)
    match = re.search(r"\$([\d,.]+)[^\d,.]?.*([mb])il",s) 
    if not match: 
        return np.nan 
    digit = float(match[1].replace(",",""))
    try: 
        multiplier=match[2]  
    except IndexError: 
        multiplier = 1.0 
    else: 
        multiplier = {"m":1e6, "b":1e9}[multiplier] 
    return digit*multiplier 

def parse_date(s): 
    if not isinstance(s,list): 
        s=[s] 
    for i in s: 
        try: 
            date = pd.to_datetime(i) 
        except: 
            continue 
        else: 
            return date 
        return np.nan 
    
def parse_time(s): 
    s = str(s) 
    m = re.search(r"(\d+)", s) 
    if not m: 
        return np.nan 
    else: 
        return pd.to_timedelta(float(m[1]),"min") 

In [3]:
#  Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):

    
    
    

    return movie

In [13]:
def wkm():
    # 2. Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    kaggle_metadata = pd.read_csv(kaggle_file) 
    ratings = pd.read_csv(ratings_file) 
    
    # 3. Open the read the Wikipedia data JSON file.
    with open (wiki_file,'r') as f: 
        wiki_movies = json.load(f) 
    wiki_movies = [entry for entry in wiki_movies if "No. of episodes" not in entry] 
    wiki_movies = [clean_movie(entry) for entry in wiki_movies]

    wiki_movies_with_id = []
    imdb_ids = set()
    for entry in wiki_movies:  
        try: 
            m = re.search(r"title/(tt\d{7,8})/", entry["imdb_link"]) 
            imdb_id = m[1]
        except Exception as e:  
            print(f"{entry.get('title', 'N/A')}: {e}")
            continue
        if imdb_id in imdb_ids: 
            continue 
        entry["imdb_id"] = imdb_id
        imdb_ids.add(imdb_id)
        wiki_movies_with_id.append(entry)
        
    wiki_movies_df = pd.DataFrame(wiki_movies_with_id)  
    wiki_movies_df = wiki_movies_df.dropna(how='all',axis=1)
    #box_office = wiki_movies_df["Box office"].dropna()
    wiki_movies_df["Box office"] = wiki_movies_df["Box office"].apply(parse_dollars) 
    wiki_movies_df["Release date"] = wiki_movies_df["Release date"].apply(parse_date) 
    wiki_movies_df["Running time"] = wiki_movies_df["Running time"].apply(parse_time)  
    
    kaggle_metadata = kaggle_metadata[kaggle_metadata['adult'] == 'False'].drop('adult',axis='columns') 
    kaggle_metadata['video'] = kaggle_metadata['video'] == 'True' 
    kaggle_metadata['budget'] = kaggle_metadata['budget'].astype(int)
    kaggle_metadata['id'] = pd.to_numeric(kaggle_metadata['id'], errors='raise')
    kaggle_metadata['popularity'] = pd.to_numeric(kaggle_metadata['popularity'], errors='raise') 
    kaggle_metadata['release_date'] = pd.to_datetime(kaggle_metadata['release_date']) 
    # 5. Return the three DataFrames
    # 5. Return the three DataFrames
    
    movies_df = pd.merge(wiki_movies_df, kaggle_metadata, on='imdb_id', suffixes=['_wiki','_kaggle'], how= "inner")  
    #Drop useless columns that clutter
    movies_df.drop(columns=
        [ "Polish", "Chinese", "Yiddish", "Arabic", 'Hebrew', "Russian", "Cantonese"
        , "Japanese", "McCune–Reischauer", "Revised Romanization", "Hangul", "French", "Mandarin"
        ], inplace=True)
    movies_df.drop(columns=["Hepburn","Species"], inplace=True) 
    #Drop columns required by step 5 
    movies_df.drop(columns=["Original language(s)","Original language","Language"], inplace=True)
    movies_df.drop(columns=['Production company(s)', 'Productioncompanies ', 'Productioncompany '], inplace=True)
    movies_df.drop(columns=['Release date'], inplace=True) 
    movies_df.drop(columns=['Original title', 'title_wiki'], inplace=True) 

    def fill_missing_kaggle_data(df, kaggle_column, wiki_column):
        df[kaggle_column] = df.apply(
            lambda row: row[wiki_column] if row[kaggle_column] == 0 else row[kaggle_column]
            , axis=1)
        df.drop(columns=[wiki_column], inplace=True)

    fill_missing_kaggle_data(movies_df, 'revenue', 'Box office')
    fill_missing_kaggle_data(movies_df, 'budget', 'Budget') 
    fill_missing_kaggle_data(movies_df, 'runtime', 'Running time')



    movies_df = (movies_df.loc[:, 
        ["imdb_id",'id','title_kaggle','original_title','tagline','belongs_to_collection', 'url','imdb_link','runtime', 
         'budget','revenue','release_date','popularity','vote_average', 'vote_count','genres','original_language', 
         'overview','spoken_languages','Country','production_companies','production_countries','Distributor','Producer(s)',
        'Director','Starring','Cinematography','Editor(s)','Written by','Composer(s)','Based on']])

    movies_df.rename({'id':'kaggle_id',
                      'title_kaggle':'title',
                      'url':'wikipedia_url',
                      'Country':'country',
                      'Distributor':'distributor',
                      'Producer(s)':'producers',
                      'Director':'director',
                      'Starring':'starring',
                      'Cinematography':'cinematography',
                      'Editor(s)':'editors',
                      'Written by':'writers',
                      'Composer(s)':'composers',
                      'Based on':'based_on'
                     }, axis='columns', inplace=True)

    rating_counts = ratings.groupby(['movieId','rating'], as_index=False).count() \
                    .rename({'userId':'count'}, axis=1) \
                    .pivot(index='movieId',columns='rating', values='count')

    rating_counts.columns = ['rating_' + str(col) for col in rating_counts.columns]  
    movies_with_ratings_df = pd.merge(movies_df, rating_counts, left_on='kaggle_id', right_index=True, how='left')
    movies_with_ratings_df[rating_counts.columns] = movies_with_ratings_df[rating_counts.columns].fillna(0)

    
    #return wiki_movies_df, kaggle_metadata, ratings
    #return wiki_movies_df, movies_with_ratings_df, rating_counts
    
    # Return
    #  * wiki file (cleaned, un-merged)
    #  * wiki file (cleaned) + kaggle file (cleaned) + ratings file (cleaned)
    #  * wiki file (cleaned) + kaggle file (cleaned)
    return wiki_movies_df, movies_with_ratings_df, movies_df

    # Return
    #  * wiki file (cleaned, un-merged)
    #  * wiki file (cleaned) + kaggle file (cleaned)
    #  * wiki file (cleaned) + kaggle file (cleaned) + ratings file (cleaned)
    #  * kaggle_file (cleaned)
    #  * ratings file (cleaned)
    #return wiki_movies_df, movies_df, movies_with_ratings_df, kaggle_metadata, rating_counts

In [14]:
file_dir = "//Users/johncurran/Desktop/Rutgers Data Sci Bootcamp/Challenges/ETL Challenge/archive"
# Wikipedia data
wiki_file = f'{file_dir}/wikipedia-movies.json'
# Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

# 7. Set the three variables in Step 6 equal to the function created in Step 1.
wiki_file, kaggle_file, ratings_file = wkm() #=extract_transform_load() 
#wiki_file, wiki_kaggle_file, wiki_kaggle_ratings_file, kaggle_file, ratings_file = wkm()

  wiki_file, kaggle_file, ratings_file = wkm() #=extract_transform_load()


N/A: 'imdb_link'
N/A: 'imdb_link'
A Man Called Sarge: 'imdb_link'
N/A: 'imdb_link'
Side Out: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
Frank Sinatra: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
Beethoven's 2nd: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
The Music of Chance: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
World and Time Enough: 'NoneType' object is not subscriptable
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
Transcriptions: 'imdb_link'
Go Now: 'imdb_link'
Man of the Year: 'NoneType' object is not subscriptable
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_

In [15]:
# 12. Set the DataFrames from the return statement equal to the file names in Step 11. 
wiki_movies_df = wiki_file
movies_with_ratings_df = kaggle_file
movies_df = ratings_file

In [16]:
# 13. Check the wiki_movies_df DataFrame. 
wiki_movies_df.head()

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,,,,,,,,,,
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,,,,,,,,,,
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,,,,,,,,,,
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,,,,,,,,,,


In [17]:
# 14. Check the movies_with_ratings_df DataFrame.
movies_with_ratings_df.head()

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,rating_0.5,rating_1.0,rating_1.5,rating_2.0,rating_2.5,rating_3.0,rating_3.5,rating_4.0,rating_4.5,rating_5.0
0,tt0098987,9548,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,104.0,49000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,tt0098994,25501,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,114.0,$6 million,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,tt0099005,11856,Air America,Air America,The few. The proud. The totally insane.,,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,112.0,$35 million,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,tt0099012,8217,Alice,Alice,,,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,102.0,$12 million,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,tt0099018,25943,Almost an Angel,Almost an Angel,Who does he think he is?,,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,95.0,$25 million,...,3.0,0.0,3.0,2.0,5.0,26.0,37.0,46.0,16.0,11.0


In [18]:
# 15. Check the movies_df DataFrame. 
movies_df.head()

Unnamed: 0,imdb_id,kaggle_id,title,original_title,tagline,belongs_to_collection,wikipedia_url,imdb_link,runtime,budget,...,production_countries,distributor,producers,director,starring,cinematography,editors,writers,composers,based_on
0,tt0098987,9548,The Adventures of Ford Fairlane,The Adventures of Ford Fairlane,Kojak. Columbo. Dirty Harry. Wimps.,,https://en.wikipedia.org/wiki/The_Adventures_o...,https://www.imdb.com/title/tt0098987/,104.0,49000000,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",,,,"[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",Oliver Wood,,,,"[Characters, by Rex Weiner]"
1,tt0098994,25501,"After Dark, My Sweet","After Dark, My Sweet",All they risked was everything.,,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",https://www.imdb.com/title/tt0098994/,114.0,$6 million,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",,,,"[Jason Patric, Rachel Ward, Bruce Dern, George...",Mark Plummer,,,,"[the novel, After Dark, My Sweet, by, Jim Thom..."
2,tt0099005,11856,Air America,Air America,The few. The proud. The totally insane.,,https://en.wikipedia.org/wiki/Air_America_(film),https://www.imdb.com/title/tt0099005/,112.0,$35 million,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",,,,"[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",Roger Deakins,,,,"[Air America, by, Christopher Robbins]"
3,tt0099012,8217,Alice,Alice,,,https://en.wikipedia.org/wiki/Alice_(1990_film),https://www.imdb.com/title/tt0099012/,102.0,$12 million,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",Carlo Di Palma,,Woody Allen,,
4,tt0099018,25943,Almost an Angel,Almost an Angel,Who does he think he is?,,https://en.wikipedia.org/wiki/Almost_an_Angel,https://www.imdb.com/title/tt0099018/,95.0,$25 million,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",Russell Boyd,,Paul Hogan,,
