In [1]:
import time 
import json
import pandas as pd
import numpy as np

import re
import dateutil
from sqlalchemy import create_engine
import psycopg2
from config import db_password

In [2]:
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/movie_data" 



In [None]:
# Do Not run yet! STEP 1
rows_imported = 0
for data in pd.read_csv(f'{file_dir}ratings.csv', chunksize=1000000):

    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='ratings', con=engine, if_exists='append')
    rows_imported += len(data)

    print(f'Done.') 
# STEP 2 Print Elapsed Time
rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
for data in pd.read_csv(f'{file_dir}ratings.csv', chunksize=1000000):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='ratings', con=engine, if_exists='append')
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

In [None]:
# STEP 2 Print Elapsed Time
rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
for data in pd.read_csv(f'{file_dir}ratings.csv', chunksize=1000000):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='ratings', con=engine, if_exists='append')
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

In [3]:
def parse_dollars(s):  
    s = str(s)
    match = re.search(r"\$([\d,.]+)[^\d,.]?.*([mb])il",s) 
    if not match: 
        return np.nan 
    digit = float(match[1].replace(",",""))
    try: 
        multiplier=match[2]  
    except IndexError: 
        multiplier = 1.0 
    else: 
        multiplier = {"m":1e6, "b":1e9}[multiplier] 
    return digit*multiplier 

def parse_date(s): 
    if not isinstance(s,list): 
        s=[s] 
    for i in s: 
        try: 
            date = pd.to_datetime(i) 
        except: 
            continue 
        else: 
            return date 
        return np.nan 
    
def parse_time(s): 
    s = str(s) 
    m = re.search(r"(\d+)", s) 
    if not m: 
        return np.nan 
    else: 
        return pd.to_timedelta(float(m[1]),"min") 

In [4]:
#  Add the clean movie function that takes in the argument, "movie".
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    alt_titles = {}
    # combine alternate titles into one list
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune-Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key] = movie[key]
            movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    # merge column names
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')
    return movie

In [5]:
def wkm():
    # 2. Read in the kaggle metadata and MovieLens ratings CSV files as Pandas DataFrames.
    kaggle_metadata = pd.read_csv(kaggle_file) 
    ratings = pd.read_csv(ratings_file) 
    
    # 3. Open the read the Wikipedia data JSON file.
    with open (wiki_file,'r') as f: 
        wiki_movies = json.load(f) 
    wiki_movies = [entry for entry in wiki_movies if "No. of episodes" not in entry] 
    wiki_movies = [clean_movie(entry) for entry in wiki_movies]

    wiki_movies_with_id = []
    imdb_ids = set()
    for entry in wiki_movies:  
        try: 
            m = re.search(r"title/(tt\d{7,8})/", entry["imdb_link"]) 
            imdb_id = m[1]
        except Exception as e:  
            print(f"{entry.get('title', 'N/A')}: {e}")
            continue
        if imdb_id in imdb_ids: 
            continue 
        entry["imdb_id"] = imdb_id
        imdb_ids.add(imdb_id)
        wiki_movies_with_id.append(entry)
        
    wiki_movies_df = pd.DataFrame(wiki_movies_with_id)  
    wiki_movies_df = wiki_movies_df.dropna(how='all',axis=1)
    #box_office = wiki_movies_df["Box office"].dropna()
    wiki_movies_df["Box office"] = wiki_movies_df["Box office"].apply(parse_dollars) 
    wiki_movies_df["Release date"] = wiki_movies_df["Release date"].apply(parse_date) 
    wiki_movies_df["Running time"] = wiki_movies_df["Running time"].apply(parse_time)  
    
    kaggle_metadata = kaggle_metadata[kaggle_metadata['adult'] == 'False'].drop('adult',axis='columns') 
    kaggle_metadata['video'] = kaggle_metadata['video'] == 'True' 
    kaggle_metadata['budget'] = kaggle_metadata['budget'].astype(int)
    kaggle_metadata['id'] = pd.to_numeric(kaggle_metadata['id'], errors='raise')
    kaggle_metadata['popularity'] = pd.to_numeric(kaggle_metadata['popularity'], errors='raise') 
    kaggle_metadata['release_date'] = pd.to_datetime(kaggle_metadata['release_date']) 
    # 5. Return the three DataFrames
    # 5. Return the three DataFrames
    
    movies_df = pd.merge(wiki_movies_df, kaggle_metadata, on='imdb_id', suffixes=['_wiki','_kaggle'], how= "inner")  
    #Drop useless columns that clutter
    movies_df.drop(columns=
        [ "Polish", "Chinese", "Yiddish", "Arabic", 'Hebrew', "Russian", "Cantonese"
        , "Japanese", "McCune–Reischauer", "Revised Romanization", "Hangul", "French", "Mandarin"
        ], inplace=True)
    movies_df.drop(columns=["Hepburn","Species"], inplace=True) 
    #Drop columns required by step 5 
    movies_df.drop(columns=["Original language(s)","Original language","Language"], inplace=True)
    movies_df.drop(columns=['Production company(s)', 'Productioncompanies ', 'Productioncompany '], inplace=True)
    movies_df.drop(columns=['Release date'], inplace=True) 
    movies_df.drop(columns=['Original title', 'title_wiki'], inplace=True) 

    def fill_missing_kaggle_data(df, kaggle_column, wiki_column):
        df[kaggle_column] = df.apply(
            lambda row: row[wiki_column] if row[kaggle_column] == 0 else row[kaggle_column]
            , axis=1)
        df.drop(columns=[wiki_column], inplace=True)

    fill_missing_kaggle_data(movies_df, 'revenue', 'Box office')
    fill_missing_kaggle_data(movies_df, 'budget', 'Budget') 
    fill_missing_kaggle_data(movies_df, 'runtime', 'Running time')



    movies_df = (movies_df.loc[:, 
        ["imdb_id",'id','title_kaggle','original_title','tagline','belongs_to_collection', 'url','imdb_link','runtime', 
         'budget','revenue','release_date','popularity','vote_average', 'vote_count','genres','original_language', 
         'overview','spoken_languages','Country','production_companies','production_countries','Distributor','Producer(s)',
        'Director','Starring','Cinematography','Editor(s)','Written by','Composer(s)','Based on']])

    movies_df.rename({'id':'kaggle_id',
                      'title_kaggle':'title',
                      'url':'wikipedia_url',
                      'Country':'country',
                      'Distributor':'distributor',
                      'Producer(s)':'producers',
                      'Director':'director',
                      'Starring':'starring',
                      'Cinematography':'cinematography',
                      'Editor(s)':'editors',
                      'Written by':'writers',
                      'Composer(s)':'composers',
                      'Based on':'based_on'
                     }, axis='columns', inplace=True)

    rating_counts = ratings.groupby(['movieId','rating'], as_index=False).count() \
                    .rename({'userId':'count'}, axis=1) \
                    .pivot(index='movieId',columns='rating', values='count')

    rating_counts.columns = ['rating_' + str(col) for col in rating_counts.columns]  
    movies_with_ratings_df = pd.merge(movies_df, rating_counts, left_on='kaggle_id', right_index=True, how='left')
    movies_with_ratings_df[rating_counts.columns] = movies_with_ratings_df[rating_counts.columns].fillna(0)

    
    #return wiki_movies_df, kaggle_metadata, ratings
    #return wiki_movies_df, movies_with_ratings_df, rating_counts
    
    # Return
    #  * wiki file (cleaned, un-merged)
    #  * wiki file (cleaned) + kaggle file (cleaned) + ratings file (cleaned)
    #  * wiki file (cleaned) + kaggle file (cleaned)
    #return wiki_movies_df, movies_with_ratings_df, movies_df

    # Return
    #  * wiki file (cleaned, un-merged)
    #  * wiki file (cleaned) + kaggle file (cleaned)
    #  * wiki file (cleaned) + kaggle file (cleaned) + ratings file (cleaned)
    #  * kaggle_file (cleaned)
    #  * ratings file (cleaned)
    #return wiki_movies_df, movies_df, movies_with_ratings_df, kaggle_metadata, rating_counts

In [6]:
file_dir = "//Users/johncurran/Desktop/Rutgers Data Sci Bootcamp/Challenges/ETL Challenge/archive"
# Wikipedia data
wiki_file = f'{file_dir}/wikipedia-movies.json'
# Kaggle metadata
kaggle_file = f'{file_dir}/movies_metadata.csv'
# MovieLens rating data.
ratings_file = f'{file_dir}/ratings.csv'

# 7. Set the three variables in Step 6 equal to the function created in Step 1.
wiki_file, kaggle_file, ratings_file = wkm() #=extract_transform_load() 
#wiki_file, wiki_kaggle_file, wiki_kaggle_ratings_file, kaggle_file, ratings_file = wkm()

  wiki_file, kaggle_file, ratings_file = wkm() #=extract_transform_load()


N/A: 'imdb_link'
N/A: 'imdb_link'
A Man Called Sarge: 'imdb_link'
N/A: 'imdb_link'
Side Out: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
Frank Sinatra: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
Beethoven's 2nd: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
The Music of Chance: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
World and Time Enough: 'NoneType' object is not subscriptable
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
Transcriptions: 'imdb_link'
Go Now: 'imdb_link'
Man of the Year: 'NoneType' object is not subscriptable
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_link'
N/A: 'imdb_

KeyError: "['Polish' 'Chinese' 'Yiddish' 'Arabic' 'Hebrew' 'Russian' 'Cantonese'\n 'Japanese' 'Revised Romanization' 'Hangul' 'French' 'Mandarin'] not found in axis"

In [11]:
# Do Not run yet! STEP 1
rows_imported = 0
for data in pd.read_csv(f'{file_dir}ratings.csv', chunksize=1000000):

    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='ratings', con=engine, if_exists='append')
    rows_imported += len(data)

    print(f'Done.') 
# STEP 2 Print Elapsed Time
rows_imported = 0
# get the start_time from time.time()
start_time = time.time()
for data in pd.read_csv(f'{file_dir}ratings.csv', chunksize=1000000):
    print(f'importing rows {rows_imported} to {rows_imported + len(data)}...', end='')
    data.to_sql(name='ratings', con=engine, if_exists='append')
    rows_imported += len(data)

    # add elapsed time to final print out
    print(f'Done. {time.time() - start_time} total seconds elapsed')

FileNotFoundError: [Errno 2] No such file or directory: '//Users/johncurran/Desktop/Rutgers Data Sci Bootcamp/Challenges/ETL Challenge/archiveratings.csv'

In [7]:
# 12. Set the DataFrames from the return statement equal to the file names in Step 11. 
wiki_movies_df = wiki_file
movies_with_ratings_df = kaggle_file
movies_df = ratings_file

In [8]:
# 13. Check the wiki_movies_df DataFrame. 
wiki_movies_df.head()

AttributeError: 'str' object has no attribute 'head'

In [9]:
# 14. Check the movies_with_ratings_df DataFrame.
movies_with_ratings_df.head()

AttributeError: 'str' object has no attribute 'head'

In [10]:
# 15. Check the movies_df DataFrame. 
movies_df.head()

AttributeError: 'str' object has no attribute 'head'