In [None]:
import json
import pandas as pd
import numpy as np
import re

In [None]:
# Folder that holds the raw data files.
# Written as a raw string so every backslash is taken literally: in a plain
# literal, sequences such as "\k", "\g" and "\M" are unrecognised escapes that
# Python flags as invalid (a warning now, planned to become an error).
# NOTE(review): this is an absolute, machine-specific path - consider making it
# configurable or relative to the notebook.
file_dir = r'C:\Users\keyto\git\Modules\Module 8\Movies_ETL'

In [None]:
f'{file_dir}filename'

In [None]:
# Load the scraped Wikipedia records into a Python object.
# The encoding is stated explicitly so decoding does not depend on the platform
# locale (for example cp1252 on Windows), which could fail on or mis-decode
# non-ASCII titles.
# NOTE(review): assumes the file is UTF-8, the standard JSON encoding - confirm.
with open(f'{file_dir}/wikipedia-movies.json', mode='r', encoding='utf-8') as file:
    wiki_movies_raw = json.load(file)

In [None]:
len(wiki_movies_raw)

In [None]:
# First 5 records
wiki_movies_raw[:5]

In [None]:
# Last 5 records
wiki_movies_raw[-5:]

In [None]:
# Some records in the middle
wiki_movies_raw[3600:3605]

In [None]:
# Read the two Kaggle CSV files that sit in file_dir.
# low_memory=False asks pandas to parse the metadata file in a single pass
# instead of in chunks, which avoids mixed-type inference within a column.
kaggle_metadata = pd.read_csv(f'{file_dir}\\movies_metadata.csv', low_memory=False)
ratings = pd.read_csv(f'{file_dir}\\ratings.csv')

In [None]:
kaggle_metadata.head()

In [None]:
ratings.head()

In [None]:
kaggle_metadata.sample(n=5)

In [None]:
ratings.sample(n=5)

In [None]:
# [Inspect]
# --- Begin Initial Investigation --- 
wiki_movies_df = pd.DataFrame(wiki_movies_raw)

In [None]:
# Review top
wiki_movies_df.head()

In [None]:
# Review list of column names
wiki_movies_df.columns.tolist()

In [None]:
# [Plan]
# We can identify column names that don't relate to movie data,
# such as "Dewey Decimal," "Headquarters," and "Number of employees."
# (There may be other examples that jumped out at you as well.)
# Let's modify our JSON data by restricting it to only those entries
# that have a director and an IMDb link.
# We can do this with a list comprehension.

# We've used list comprehensions previously as a compact way to apply
# a function to every element in a list. 

# So far, we've used list comprehensions in the following form to compress
# code that would otherwise have been written as a for loop:

# [expression for element in source_list]

# We can also filter out results using a conditional filter expression,
# as shown below:

# [expression for element in source_list if filter_expression]

# The resulting list will only have elements where the filter expression
# evaluates to True.

In [None]:
# [Plan]
# To create a filter expression for only movies with a director and an 
# IMDb link, keep in mind that there are two columns in the data for 
# director information. We'll need to check if either "Director" or
# "Directed by" are keys in the current dict. If there is a director listed,
# we also want to check that the dict has an IMDb link.
# Luckily, that information is only in one column, imdb_link,
# so our filter expression will look like the following:

# if ('Director' in movie or 'Directed by' in movie) and 'imdb_link' in movie

In [None]:
# [Execute]
# Create a list comprehension with the filter expression we created
# and save that to an intermediate variable wiki_movies.
# See how many movies are in wiki_movies with the len() function.

In [None]:
# Use a list comprehension to filter the data
# Keep a record only when it names a director (under either key) and has an IMDb link.
wiki_movies = [
    record
    for record in wiki_movies_raw
    if ('Director' in record or 'Directed by' in record)
    and 'imdb_link' in record
]
len(wiki_movies)

In [None]:
# [Inspect]
# 78 columns are still a lot of columns, so let's keep investigating.
wiki_movies_df = pd.DataFrame(wiki_movies)
wiki_movies_df

In [None]:
# [Inspect]
# There sure are a lot of languages—we'll get to those shortly. 
# For now, one of the columns that stands out is "No. of episodes."

# [Plan]
# It looks like we've got some TV shows in our data instead of movies.
# We'll want to get rid of those, too.

#[Execute]
# We'll add that filter to our list comprehension.

In [None]:
#[Execute]
# Add filter to our list comprehension.
# Same director / IMDb-link filter as before, now also rejecting any record
# that has a 'No. of episodes' key.
wiki_movies = [
    record
    for record in wiki_movies_raw
    if ('Director' in record or 'Directed by' in record)
    and 'imdb_link' in record
    and 'No. of episodes' not in record
]

In [None]:
# Beginning of Module 8.3.4

# Define the function clean_movie and have it take movie as a parameter
def clean_movie(movie):
    """Return a shallow copy of ``movie``.

    This is the skeleton of the cleaning step: it only guarantees that any
    edits made to the result never touch the dict that was passed in.

    Parameters
    ----------
    movie : dict
        One movie record.

    Returns
    -------
    dict
        A new dict with the same keys and values as ``movie``.
    """
    # dict() builds a new top-level mapping; the values themselves are shared.
    return dict(movie)

In [None]:
wiki_movies_df[wiki_movies_df['Arabic'].notnull()]

In [None]:
wiki_movies_df[wiki_movies_df['Arabic'].notnull()]['url']

In [None]:
sorted(wiki_movies_df.columns.tolist())

In [None]:
# Define the function clean_movie and have it take movie as a parameter
def clean_movie(movie):
    """Return a copy of ``movie`` with its alternative titles grouped together.

    Every alternative-title key that is present in the record is removed from
    the copy and collected into one nested dict stored under ``'alt_titles'``.
    The ``'alt_titles'`` key is only added when at least one such key was found.
    The dict passed in is left untouched.

    Parameters
    ----------
    movie : dict
        One movie record.

    Returns
    -------
    dict
        The cleaned shallow copy.
    """
    record = dict(movie)  # shallow copy so the caller's dict keeps its keys

    # Keys that hold an alternative title, checked in this order.
    title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
        'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
        'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
        'Revised Romanization', 'Romanized', 'Russian',
        'Simplified', 'Traditional', 'Yiddish',
    )

    # pop() removes the key from the copy and hands back its value in one step.
    found_titles = {key: record.pop(key) for key in title_keys if key in record}

    if found_titles:
        record['alt_titles'] = found_titles

    return record



In [None]:
# Make a list of cleaned movies with a list comprehension:
clean_movies = [clean_movie(movie) for movie in wiki_movies]

In [None]:
# Set wiki_movies_df to be the DataFrame created from clean_movies, and print out a list of columns.
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

In [None]:
# Define the function clean_movie and have it take movie as a parameter
def clean_movie(movie):
    """Return a cleaned copy of one movie record.

    The dict passed in is never modified. Two normalisations are applied to
    the copy:

    1. Every alternative-title key that is present is removed and collected
       into one nested dict stored under ``'alt_titles'`` (added only when at
       least one alternative title was found).
    2. Synonymous column names are renamed to a single canonical name, for
       example ``'Directed by'`` becomes ``'Director'``.

    Parameters
    ----------
    movie : dict
        One movie record.

    Returns
    -------
    dict
        The cleaned shallow copy.
    """
    record = dict(movie)  # shallow copy so the caller's dict keeps its keys

    # --- 1. gather alternative titles -------------------------------------
    title_keys = (
        'Also known as', 'Arabic', 'Cantonese', 'Chinese', 'French',
        'Hangul', 'Hebrew', 'Hepburn', 'Japanese', 'Literally',
        'Mandarin', 'McCune–Reischauer', 'Original title', 'Polish',
        'Revised Romanization', 'Romanized', 'Russian',
        'Simplified', 'Traditional', 'Yiddish',
    )
    found_titles = {key: record.pop(key) for key in title_keys if key in record}
    if found_titles:
        record['alt_titles'] = found_titles

    # --- 2. merge synonymous column names ---------------------------------
    # (old name, canonical name) pairs, applied strictly in this order.
    # 'Released' is first renamed to 'Release Date', which the very next pair
    # then folds into 'Release date'.
    # The two 'Productioncompan...' keys end with a space on purpose; they are
    # reproduced exactly (presumably that is how the source data spells them -
    # verify against the raw records).
    # NOTE(review): when one record holds several old names that share a
    # canonical name, the pair applied last overwrites the earlier value.
    renames = (
        ('Adaptation by', 'Writer(s)'),
        ('Country of origin', 'Country'),
        ('Directed by', 'Director'),
        ('Distributed by', 'Distributor'),
        ('Edited by', 'Editor(s)'),
        ('Length', 'Running time'),
        ('Original release', 'Release date'),
        ('Music by', 'Composer(s)'),
        ('Produced by', 'Producer(s)'),
        ('Producer', 'Producer(s)'),
        ('Productioncompanies ', 'Production company(s)'),
        ('Productioncompany ', 'Production company(s)'),
        ('Released', 'Release Date'),
        ('Release Date', 'Release date'),
        ('Screen story by', 'Writer(s)'),
        ('Screenplay by', 'Writer(s)'),
        ('Story by', 'Writer(s)'),
        ('Theme music composer', 'Composer(s)'),
        ('Written by', 'Writer(s)'),
    )
    for old_name, new_name in renames:
        if old_name in record:
            record[new_name] = record.pop(old_name)

    return record



In [None]:
clean_movies = [clean_movie(movie) for movie in wiki_movies]
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

In [None]:
# "(tt\d{7})" — The parentheses mark a capture group: the one piece of text we want to extract.
# "(tt\d{7})" — The "tt" in the string simply says to match two lowercase Ts.
# "(tt\d{7})" — The "\d" says to match a numerical digit.
# "(tt\d{7})" — The "{7}" says to match the last thing (numerical digits) exactly seven times.

In [None]:
# Code to extract the IMDb ID
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')

In [None]:
# Now we can drop any duplicates of IMDb IDs by using the drop_duplicates() method.
# To specify that we only want to consider the IMDb ID, use the subset argument,
# and set inplace equal to True so that the operation is performed on the selected DataFrame.
# Otherwise, the operation would return an edited DataFrame that would need to be saved to a new variable.
# We also want to see the new number of rows and how many rows were dropped.

# Pull the "tt" + seven-digit IMDb identifier out of each link into a new imdb_id column.
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
# Row count before de-duplication.
print(len(wiki_movies_df))
# Drop rows that repeat an imdb_id, modifying wiki_movies_df in place.
# NOTE(review): links that do not match the pattern presumably get a missing imdb_id,
# and pandas appears to treat missing values as duplicates of each other - confirm
# that such rows are handled as intended.
wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
# Row count after de-duplication; the difference is the number of rows dropped.
print(len(wiki_movies_df))
wiki_movies_df.head()

In [None]:
# Get the count of null values for each column by using list comprehension
# Could also use a for loop and a print statement
[[column,wiki_movies_df[column].isnull().sum()] for column in wiki_movies_df.columns]

In [None]:
# Make a list of columns that have less than 90% null values and use those to trim down our dataset.
[column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]

In [None]:
# The above gave us the columns that we want to keep, which we can select from our pandas DataFrame as follows:
# Count the nulls in every column in one pass, keep the names of the columns
# whose null count is below 90% of the row count, then trim the frame to them.
wiki_columns_to_keep = (
    wiki_movies_df.isnull().sum()
    .loc[lambda null_counts: null_counts < len(wiki_movies_df) * 0.9]
    .index.tolist()
)
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [None]:
# You may have noticed that the "alt_titles" column we created earlier was deleted by this bit of code.
# It might feel like all that work we did was futile, but it's not.
# It's possible that all of the alternate title columns individually had less than 10% non-null values,
# but collectively had enough data to keep. We wouldn't know that unless we put in that work.
# This is normal for data cleaning because it's an iterative process.
# Sometimes the hard work you put in doesn't seem to make it to the final product, but don't worry, it's in there.

In [None]:
# Identify which columns need to be converted
wiki_movies_df.dtypes

In [None]:
box_office = wiki_movies_df['Box office'].dropna()

In [None]:
# Regular expressions only work on strings, so we'll need to make sure all of the box office data is entered as a string.
# By using the map() method, we can see which values are not strings.
# First, make an is_not_a_string() function:

def is_not_a_string(x):
    """Return True when ``x`` is not a string.

    Uses ``isinstance`` rather than an exact type comparison, so instances of
    ``str`` subclasses are recognised as strings too; an exact comparison
    would report them as non-strings even though string operations accept
    them.

    Parameters
    ----------
    x : object
        Any value, e.g. one cell of a pandas Series.

    Returns
    -------
    bool
        ``False`` for ``str`` (and subclasses of ``str``), ``True`` otherwise.
    """
    return not isinstance(x, str)

box_office[box_office.map(is_not_a_string)]

In [None]:
# Having to create a new function every time we want to use the map() method is cumbersome
# and interrupts the readability of our code. What we want is a stripped-down, one-line way of writing our functions.
# Also, we don't need to use it ever again outside of our map() call, so we don't need to give it a name.
# If you think we're talking about types of functions that will work here, you're right.

In [None]:
# Remember, this is what lambda functions are made for.
# Instead of creating a new function with a block of code and the def keyword,
# we can create an anonymous lambda function right inside the map() call.
# They use the following syntax:
# lambda arguments: expression

In [None]:
# So the lambda function version of is_not_a_string() is:
# lambda x: type(x) != str

In [None]:
# We can update our map() call to use the lambda function directly instead of using is_not_a_string():

In [None]:
box_office[box_office.map(lambda x: type(x) != str)]

In [None]:
# From the output, we can see that there are quite a few data points that are stored as lists.
# There is a join() string method that concatenates list items into one string;
# however, we can't just type join(some_list) because the join() method belongs to string objects.
# We need to make a separator string and then call the join() method on it. For example, the code would be:

# some_list = ['One','Two','Three']
# 'Mississippi'.join(some_list)

# The outputs would be:
# 'OneMississippiTwoMississippiThree'

# We'll use a simple space as our joining character and apply the join()
# function only when our data points are lists.
# The code looks like the following:

# box_office = box_office.apply(lambda x: ' '.join(x) if type(x) == list else x)

In [None]:
# There is a built-in Python module for regular expressions: re.
# We'll need to import that library, so add the line below to the first cell,
# with the other import statements, and rerun the cell.

import re

In [None]:
# Module 8.3.9