# Job Thomas Thekkekara
# R00195427

## Data Cleaning Function

In [1]:
def _fill_missing_with_mode(data, column):
    """Replace missing values in ``column`` with its most frequent value."""
    # value_counts() skips NaN; idxmax() picks the label with the highest count
    most_common = data[column].value_counts().idxmax()
    # Assign back to the frame: a chained ``data[column].replace(..., inplace=True)``
    # is deprecated and does not update the frame under pandas Copy-on-Write.
    data[column] = data[column].fillna(most_common)


def _fill_missing_with_floor_mean(data, column):
    """Replace missing values in ``column`` with its mean rounded down."""
    floored_mean = math.floor(data[column].mean())
    data[column] = data[column].fillna(floored_mean)


def data_cleaning(data):
    """Clean the movie data frame in place and return it.

    Steps, in order (row drops change the means/modes computed afterwards,
    so the order is significant):

    1. Strip whitespace from column names, drop fully duplicated rows and
       rows repeating an earlier ``movie_title``.
    2. Fill ``color`` with its mode, then drop rows without ``director_name``.
    3. Fill review count, duration and actor like counts with the floored
       column mean; fill ``actor_2_name`` / ``actor_3_name`` with the mode;
       drop rows without ``actor_1_name``.
    4. Fill the remaining numeric columns with the floored mean and the
       remaining categorical columns with the mode; drop rows without
       ``gross``.
    5. Remove the last character of every ``movie_title``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw movie data. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, cleaned.
    """
    # Normalise the header first so every later column lookup sees clean names
    data.columns = data.columns.str.strip()

    # Keep the first of any fully duplicated rows; a movie may appear only once
    data.drop_duplicates(keep='first', inplace=True)
    data.drop_duplicates(subset='movie_title', keep='first', inplace=True)

    _fill_missing_with_mode(data, 'color')

    data.dropna(subset=['director_name'], axis=0, inplace=True)
    data.reset_index(drop=True, inplace=True)

    # Each column is filled with its OWN floored mean
    for column in (
        'num_critic_for_reviews',
        'duration',
        'actor_3_facebook_likes',
        'actor_2_facebook_likes',
        'actor_1_facebook_likes',
    ):
        _fill_missing_with_floor_mean(data, column)

    for column in ('actor_2_name', 'actor_3_name'):
        _fill_missing_with_mode(data, column)

    data.dropna(subset=['actor_1_name'], axis=0, inplace=True)
    data.reset_index(drop=True, inplace=True)

    for column in ('facenumber_in_poster', 'num_user_for_reviews', 'budget'):
        _fill_missing_with_floor_mean(data, column)

    for column in (
        'plot_keywords',
        'language',
        'country',
        'content_rating',
        'aspect_ratio',
    ):
        _fill_missing_with_mode(data, column)

    data.dropna(subset=['gross'], axis=0, inplace=True)
    data.reset_index(drop=True, inplace=True)

    # Drops the final character of every title.
    # NOTE(review): presumably a trailing non-printing character in the raw
    # CSV -- confirm against the data file.
    data['movie_title'] = data['movie_title'].str[:-1]

    return data

## Function to find top directors

In [2]:
def top_directors(data):
    """Interactively plot the N directors with the highest total gross.

    Repeatedly asks for N, and while N lies between 1 and the number of
    distinct directors, draws a horizontal bar chart of summed ``gross``
    per ``director_name``. The loop continues for as long as the user
    answers ``y`` to the follow-up prompt.
    """
    answer = 'y'
    while answer.lower() == 'y':
        clear_output(wait=True)  # wipe the output of the previous pass
        try:
            wanted = int(input("Please enter the top number of directors you would like to see"))
            # nunique() counts distinct directors, ignoring missing values
            if wanted <= 0 or wanted > data['director_name'].nunique():
                answer = input("\n sorry that choice is not available. would you like to try again y or n ?  \n")
                continue

            # Total gross per director, then keep the N largest totals
            gross_by_director = data.groupby('director_name')['gross'].sum().reset_index()
            ranking = gross_by_director.nlargest(wanted, 'gross')

            plt.figure(figsize=(10, 7))
            sns.barplot(data=ranking, x='gross', y='director_name')
            plt.xlabel('Gross Earnings', size=12, fontweight='semibold')
            plt.ylabel('Director Name', size=12, fontweight='semibold')
            plt.title("Top Directors with Gross Earnings", size=14, fontweight='bold')
            plt.show()
            answer = input("\n would you like to try again y or n ? \n")
        except ValueError:
            # Raised by int() when the entry is not a whole number
            answer = input("\nI don't understand that choice, would you like to try again y or n ?  \n")
    

## Function to find top actors

In [3]:
def top_actors(data):
    """Interactively plot the N lead actors with the highest total gross.

    Repeatedly asks for N, and while N lies between 1 and the number of
    distinct ``actor_1_name`` values, draws a horizontal bar chart of
    summed ``gross`` per actor. The loop continues for as long as the user
    answers ``y`` to the follow-up prompt.
    """
    answer = 'y'
    while answer.lower() == 'y':
        clear_output(wait=True)  # wipe the output of the previous pass
        try:
            wanted = int(input("Please enter the top number of actors you would like to see"))
            # nunique() counts distinct lead actors, ignoring missing values
            if wanted <= 0 or wanted > data['actor_1_name'].nunique():
                answer = input("\n sorry that choice is not available. would you like to try again y or n ?  \n")
                continue

            # Total gross per lead actor, then keep the N largest totals
            gross_by_actor = data.groupby('actor_1_name')['gross'].sum().reset_index()
            ranking = gross_by_actor.nlargest(wanted, 'gross')

            plt.figure(figsize=(10, 7))
            sns.barplot(data=ranking, x='gross', y='actor_1_name')
            plt.xlabel('Gross Earnings', size=12, fontweight='semibold')
            plt.ylabel('Actor Name', size=12, fontweight='semibold')
            plt.title("Top Actors with Gross Earnings", size=14, fontweight='bold')
            plt.show()
            answer = input("\n would you like to try again y or n ? \n")
        except ValueError:
            # Raised by int() when the entry is not a whole number
            answer = input("\nI don't understand that choice, would you like to try again y or n ?  \n")

## Menu function for top directors and actors

In [4]:
def best_director_actor(data):
    """Sub-menu that routes to the top-directors or top-actors report.

    Option 1 calls ``top_directors(data)`` and option 2 calls
    ``top_actors(data)``; afterwards the menu is shown again. Any other
    entry asks whether to retry, and the loop ends only when that question
    is answered with something other than ``y``.
    """
    menu_text = ("Please select one of the following options: \n\n"
                 "1. Top Directors\n"
                 "2. Top Actors\n")
    retry_text = "\nI don't understand that choice, would you like to try again y or n? \n"

    answer = 'y'
    while answer.lower() == 'y':
        clear_output(wait=True)
        try:
            selected = int(input(menu_text))
        except ValueError:
            # Entry was not a whole number
            answer = input(retry_text)
            continue

        if selected == 1:
            top_directors(data)
        elif selected == 2:
            top_actors(data)
        else:
            answer = input(retry_text)

## Function for film comparison

In [5]:
def film_comparison(data):
    """Interactively compare two movies on one metric with a bar chart.

    The user enters two movie titles (matched case-insensitively against
    ``data['movie_title']``) and then picks IMDB score, gross earnings or
    Facebook likes. A bar chart of the chosen metric for the two movies is
    shown and the user is asked whether to go again.

    The data frame passed in is not modified, and the bar colours are set
    per plot rather than through seaborn's global palette, so later plots
    keep their default colours.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``movie_title``, ``imdb_score``, ``gross`` and
        ``movie_facebook_likes``.
    """
    # option -> (column plotted, y-axis label, plot title, bar palette)
    comparisons = {
        1: ("imdb_score", "IMDB Score", "Movie Name vs IMDB Score", None),
        2: ("gross", "Gross Earnings", "Movie Name vs Gross Earnings", "rocket"),
        3: ("movie_facebook_likes", "Facebook Likes",
            "Movie Name vs Facebook Likes", ["#FF0B04", "#4374B3"]),
    }
    not_found_prompt = "Sorry movie not found. Would you like to try again y or n \n"
    done_prompt = "Thank you!. would you like to try again y or n \n"

    # Lower-cased copy of the titles; the caller's column is left untouched
    titles = data['movie_title'].str.lower()
    known_titles = set(titles.dropna())

    choice = 'y'
    while choice.lower() == 'y':
        clear_output(wait=True)  # clears the previous output on every pass

        movie_name1 = input("please enter the first movie name \n").lower()
        if movie_name1 not in known_titles:
            choice = input(not_found_prompt)
            continue

        movie_name2 = input("please enter the second movie name \n").lower()
        if movie_name2 not in known_titles:
            choice = input(not_found_prompt)
            continue

        try:
            print("#################################################")
            option = int(input("please select from following options \n\n"
                               "1.IMDB Scores \n"
                               "2.Gross Earnings \n"
                               "3.Movie Facebook Like \n"))
        except ValueError:
            choice = input("\nI don't understand that choice, would you like to try again y or n? \n")
            continue

        if option not in comparisons:
            choice = input("Sorry wrong choice given. Would you like to try again y or n \n")
            continue

        column, y_label, plot_title, palette = comparisons[option]

        # Rows for either of the two requested movies, labelled by lower-case title
        chosen = titles.isin([movie_name1, movie_name2])
        plot_frame = data.loc[chosen, [column]].assign(movie_title=titles[chosen])

        sns.barplot(data=plot_frame, x='movie_title', y=column, palette=palette)
        plt.xlabel('Movie Name', size=12, fontweight='semibold')
        plt.ylabel(y_label, size=12, fontweight='semibold')
        plt.title(plot_title, size=14, fontweight='bold')
        plt.show()
        choice = input(done_prompt)
                

## Function to analyze distribution of gross earnings

In [6]:
def gross_distribution(data):
    """Interactively plot yearly min, max and mean gross for a year range.

    The per-year minimum, maximum and mean of ``gross`` are computed once.
    The user then enters a start and an end year; both must be years that
    occur in the data. The three series are drawn as line plots over the
    inclusive range. If the start year is later than the end year the two
    are swapped rather than producing an empty plot.

    Rows with a missing ``title_year`` are ignored, and the data frame
    passed in is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``title_year`` and ``gross``.
    """
    retry_prompt = "Would you like to try again y or n "

    # Whole-number years for every row that has one
    has_year = data['title_year'].notna()
    years = data.loc[has_year, 'title_year'].astype(np.int64).to_numpy()

    # One row per year (sorted ascending) with columns 'min', 'max', 'mean'
    yearly = data.loc[has_year, 'gross'].groupby(years).agg(['min', 'max', 'mean'])
    available_years = set(yearly.index.tolist())
    first_year, last_year = yearly.index.min(), yearly.index.max()

    choice = 'y'
    while choice.lower() == 'y':
        clear_output(wait=True)
        print("please enter an year from ", first_year, " to ", last_year)
        try:
            print("#################################################")
            start_year = int(input("Please enter the start year \n"))
            end_year = int(input("Please enter the end year \n"))
        except ValueError:
            choice = input("\nI don't understand that choice, would you like to try again y or n? \n")
            continue

        if start_year not in available_years or end_year not in available_years:
            print("Sorry year not available")
            choice = input(retry_prompt)
            continue

        # Accept the two years in either order
        low, high = sorted((start_year, end_year))
        window = yearly.loc[low:high]  # label slice, inclusive at both ends

        plt.figure(figsize=(10, 7))
        plt.plot(window.index, window['min'], label='min', color="yellow")
        plt.plot(window.index, window['max'], label='max')
        plt.plot(window.index, window['mean'], label='avg')

        plt.xlabel('Year', size=12, fontweight='semibold')
        plt.ylabel('Gross Earnings', size=12, fontweight='semibold')
        plt.grid(linestyle='--', color='#888888', alpha=0.4)
        plt.title('Minimum Maximum and Average Gross Over the Years', size=14, fontweight='semibold')
        plt.legend(loc='best')
        plt.show()
        choice = input(retry_prompt)
        

## Function to get mean IMDB score by genre

In [7]:
def imdb_genre(data):
    """Interactively report the mean IMDB score for a chosen genre.

    ``genres`` holds ``|``-separated genre names. The set of distinct
    genres (lower-cased) is shown, the user picks one, and the mean
    ``imdb_score`` of the movies tagged with exactly that genre is
    printed, rounded to two decimals.

    A movie counts for a genre only when the genre is one of its
    ``|``-separated entries; a genre whose name merely contains the entry
    as a substring does not count. Rows with a missing ``genres`` value
    are ignored, and the data frame passed in is not modified.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``genres`` and ``imdb_score``.
    """
    BOLD = '\033[1m'  # ANSI escape: start bold text
    END = '\033[0m'   # ANSI escape: reset formatting

    # One list of lower-case genre names per movie; computed once, not per pass
    genre_lists = data['genres'].dropna().str.lower().str.split('|')
    result = {name for names in genre_lists for name in names}
    scores = data.loc[genre_lists.index, 'imdb_score']

    choice = "y"
    while choice.lower() == 'y':
        clear_output(wait=True)
        print("###############################")
        print("Please find the genres available below:")
        print("###############################")
        print(result)
        genre = input("Please select one of the genre from the above options \n").lower()
        if genre not in result:
            choice = input("Sorry, genre entered is not available. Would you like to try again? y or n")
            continue

        # Exact membership test against each movie's genre list
        tagged = genre_lists.apply(lambda names: genre in names)
        mean_imdb_score = round(scores[tagged].mean(), 2)
        print("Mean IMDB score for all movies in ", genre, " genre is " + BOLD + str(mean_imdb_score) + END)
        choice = input("Thank you! Would you like to try again? y or n")

## Function to get correlation of IMDB Score

In [8]:
def imdb_score(data):
    """Plot how ``imdb_score`` correlates with the other numeric columns.

    Shows, in order: a bar chart of correlation coefficients against
    ``imdb_score``, a one-row heatmap of the same coefficients, a
    regression scatter plot of ``imdb_score`` vs ``num_critic_for_reviews``,
    and finally the grid produced by ``imdb_scatter_plots``.
    """
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_data = data.select_dtypes(include=numeric_dtypes)

    # Pairwise correlation matrix of the numeric columns (a data frame)
    correlations = numeric_data.corr()

    # Bar chart: the imdb_score column of the correlation matrix
    score_column = correlations[['imdb_score']]
    score_column.plot(kind='bar', color='green')
    plt.xlabel('Variables Names', size=12, fontweight='semibold')
    plt.ylabel('Correlation Coefficient', size=12, fontweight='semibold')
    plt.title("Bar chart showing Corelation coefficient between numerical variables", size=14, fontweight='bold')
    plt.show()

    # Heatmap: the imdb_score row of the correlation matrix
    score_row = correlations.loc[['imdb_score'], :]
    sns.heatmap(score_row)
    plt.title("Correlation Matrix Heatmap", size=12, fontweight='semibold')
    plt.show()

    # Scatter plot with a fitted regression line
    sns.lmplot(x="imdb_score", y="num_critic_for_reviews", data=data)
    plt.title("Scatter plot with a regression line", size=14, fontweight='bold')
    plt.show()

    imdb_scatter_plots(numeric_data)
    

## Function to generate correlation scatter plots

In [9]:
def imdb_scatter_plots(numeric_data):
    """Draw a 4x4 grid of scatter plots of ``imdb_score`` vs each variable.

    Every panel has ``imdb_score`` on the x-axis and one column of
    ``numeric_data`` on the y-axis; panels are filled row by row in the
    order listed below.
    """
    # (y-axis column, panel title), in row-major grid order
    panels = [
        ("gross", 'Gross'),
        ("num_voted_users", 'Num voted users '),
        ("num_critic_for_reviews", 'Num critic for reviews'),
        ("num_user_for_reviews", 'Num user for reviews'),
        ("duration", 'Duration'),
        ("movie_facebook_likes", 'Movie facebook likes '),
        ("budget", 'Budget'),
        ("title_year", 'Title Year'),
        ("director_facebook_likes", 'Director facebook likes'),
        ("actor_1_facebook_likes", 'Actor 1 facebook likes'),
        ("actor_2_facebook_likes", 'Actor 2 facebook likes'),
        ("actor_3_facebook_likes", 'Actor 3 facebook likes'),
        ("aspect_ratio", 'Aspect Ratio'),
        ("cast_total_facebook_likes", 'Cast total facebook likes'),
        ("facenumber_in_poster", 'Facenumber in Poster'),
        ("imdb_score", 'IMDB Score'),
    ]

    fig, axs = plt.subplots(4, 4)

    # axs.flat walks the grid row by row, matching the order of ``panels``
    for ax, (column, panel_title) in zip(axs.flat, panels):
        ax.scatter("imdb_score", column, data=numeric_data)
        ax.set_title(panel_title)

    for ax in axs.flat:
        ax.set(xlabel='IMDB Score')
        # Keep axis labels and tick labels only on the outer edge of the grid
        ax.label_outer()

    plt.tight_layout()
    fig.set_figheight(8)
    fig.set_figwidth(8)
    fig.suptitle("Scatter plot of IMDB Score vs all Numerical  Variables", fontsize=14, fontweight='bold')
    plt.show()

# Main Function definition

In [10]:
def main():
    """Load and clean the movie data, then run the top-level menu loop.

    Reads ``movie_metadata.csv`` from the working directory, cleans it
    with ``data_cleaning`` and repeatedly shows the main menu. Options 1-5
    run the matching analysis; option 6 leaves the loop. An entry that is
    not one of the options asks whether to retry. A closing message is
    printed when the loop ends, and the function then returns normally
    without terminating the interpreter or kernel.
    """
    data = pd.read_csv("movie_metadata.csv")
    data = data_cleaning(data)

    # menu option -> analysis function, each taking the cleaned data frame
    actions = {
        1: best_director_actor,  # most successful directors or actors
        2: film_comparison,      # compare two films
        3: gross_distribution,   # distribution of gross earnings over years
        4: imdb_genre,           # mean IMDB score by genre
        5: imdb_score,           # IMDB score vs other numeric variables
    }
    exit_option = 6
    separator = "###################################################"
    menu_text = ("Please select one of the following options: \n\n"
                 "1. Most successful directors or actors\n"
                 "2. Film comparison\n"
                 "3. Analyse the distribution of gross earnings\n"
                 "4. Genre Analysis\n"
                 "5. Earnings and IMDB scores\n"
                 "6. Exit\n")
    retry_text = "\nI don't understand that choice, would you like to try again y or n? \n"

    choice = 'y'
    while choice.lower() == 'y':
        try:
            print(separator)
            x = int(input(menu_text))
            print(separator)
        except ValueError:
            # Entry was not a whole number
            choice = input(retry_text)
            continue

        if x == exit_option:
            break

        action = actions.get(x)
        if action is None:
            choice = input(retry_text)
        else:
            action(data)

    print("Thank you for using the menu!")


# Main Function Call

In [12]:
import math

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import clear_output

# Show every column when a data frame is displayed
pd.options.display.max_columns = None

# Start the interactive menu
main()

Please select one of the following options: 

1. Top Directors
2. Top Actors
3

I don't understand that choice, would you like to try again y or n? 
n
###################################################
Please select one of the following options: 

1. Most successful directors or actros
2. Film comparison
3. Analyse the distribution of gross earnings
4. Genre Analysis
5. Earnings and IMDB scores
6. Exit
6
###################################################
Thank you for using the menu!
