Import the necessary libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from matplotlib import figure

%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (12,8)

Read in the movies.csv data

In [None]:
# Load the movies dataset into a DataFrame and preview the first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file location exists.
movies_csv_path = 'C:/JPEM_Git_Main/JPEM/JPEM_DAPP/data/movies.csv'
df = pd.read_csv(movies_csv_path)
df.head()

Looking for missing data

In [None]:
# Report, for every column, the percentage of rows whose value is missing.
for col in df.columns:
    # isnull().mean() is the *fraction* of missing rows (0..1); scale it by 100
    # so the number printed next to the '%' sign really is a percentage.
    percent_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, percent_missing))

Data Types for our Columns
- Budget and Gross do not need to be float64

In [None]:

# Rows lacking a budget or gross value are removed first: NaN cannot be cast
# to an integer dtype, so the conversion below would fail on them.
money_columns = ['budget', 'gross']
df.dropna(subset=money_columns, inplace=True)

# Convert each of the two columns to int64.
for money_col in money_columns:
    df[money_col] = df[money_col].astype('int64')

# Show the resulting dtypes to confirm the conversion.
df.dtypes

The "year" and "released" columns don't always match. Let's fix that by extracting the year from "released".

In [None]:
# Function to extract the year
def extract_year(date_str):
    """Return the four-digit year contained in a release-date string.

    The year is looked up after the first comma when there is one
    (e.g. 'June 13, 1980 (United States)' -> '1980'). Strings without a comma
    (e.g. '1985 (Taiwan)') or without a year after the comma are searched in
    full instead of being sliced at a fixed offset.

    Parameters
    ----------
    date_str : str
        Release-date text. Non-string values (such as NaN for a missing date)
        are accepted and treated as "no year available".

    Returns
    -------
    str or None
        The year as a four-character string, or None when no standalone
        four-digit number is present.
    """
    import re  # local import keeps this cell self-contained

    if not isinstance(date_str, str):
        return None

    # A run of exactly four digits that is not part of a longer number.
    year_pattern = r'(?<!\d)\d{4}(?!\d)'

    # find() returns -1 when there is no comma, so comma_pos + 1 == 0 and the
    # "tail" is simply the whole string in that case.
    comma_pos = date_str.find(',')
    match = re.search(year_pattern, date_str[comma_pos + 1:])
    if match is None:
        # Nothing after the comma: fall back to the full text.
        match = re.search(year_pattern, date_str)

    return match.group(0) if match else None

# Build the 'released_year' column by running extract_year on every
# entry of the 'released' column, then preview the result.
release_dates = df['released']
df['released_year'] = release_dates.apply(extract_year)

df.head()

In [None]:
# df.sort_values(by=['gross'], inplace=False, ascending=False)

In [None]:
# pd.set_option('display.max_rows', None)

List the distinct companies (duplicates are dropped from the displayed Series only; df is unchanged)

In [None]:
# Display each distinct company value once, sorted in descending order.
# Only the displayed Series is de-duplicated; df itself is not modified.
unique_companies = df['company'].drop_duplicates()
unique_companies.sort_values(ascending=False)

Visualizing Relationship Between Budget and Gross Revenue

In [None]:
# Scatter plot: one point per movie, budget on x and gross revenue on y.
fig, ax = plt.subplots()
ax.scatter(df['budget'], df['gross'])
ax.set_xlabel('Budget')
ax.set_ylabel('Gross Revenue')
ax.set_title('Budget versus Gross Revenue')
plt.show()

In [None]:
# Seaborn regression plot: red scatter points with a blue fitted line.
reg_ax = sns.regplot(
    data=df,
    x=df['budget'],
    y=df['gross'],
    scatter_kws={"color": "red"},
    line_kws={"color": "blue"},
)
# Label the axes and title on the Axes object returned by regplot.
reg_ax.set(title='Gross Revenue vs Budget', xlabel='Budget', ylabel="Gross Revenue")


In [None]:
# Restrict the frame to its float64/int64 columns before correlating.
numeric_dtypes = ['float64', 'int64']
numeric_df = df.select_dtypes(include=numeric_dtypes)

# Pairwise Pearson correlation (corr() also supports 'kendall' and 'spearman').
correlation_matrix = numeric_df.corr(method='pearson')

# Plain-text dump of the matrix.
print(correlation_matrix)

Visualizing the Correlation Matrix using a Heatmap

In [None]:
# Recompute the Pearson correlation matrix over the float64/int64 columns of df.
numeric_df = df.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numeric_df.corr(method='pearson')

# Draw the matrix as an annotated heatmap (two-decimal cell labels, thin
# separators between cells) on an explicitly created 10x8 figure.
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap of Movie Database')

# Render the figure.
plt.show()

Looking at Company Information (currently non-numeric)

In [None]:
# Build an all-numeric version of the data by replacing every text column
# with integer category codes.
# Work on a copy: a plain `df_numeric = df` only creates a second name for the
# same DataFrame, so the loop below would overwrite the original string
# columns of df as well.
df_numeric = df.copy()

for col_name in df_numeric.columns:
    if df_numeric[col_name].dtype == 'object':
        # Each distinct value gets an integer code; missing values become -1.
        df_numeric[col_name] = df_numeric[col_name].astype('category').cat.codes

df_numeric

In [None]:
# Correlate every column of df_numeric, including the category-encoded ones.
# No dtype filter is applied here: a float64/int64-only selection would drop
# the encoded columns, whose codes are stored in smaller integer dtypes.
correlation_matrix = df_numeric.corr(method='pearson')

# Create a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, fmt=".2f", linewidths=0.5)

# Display the heatmap
plt.title('Correlation Matrix Heatmap of Movie Database')
plt.show()

In [102]:
# Flatten the correlation matrix into a Series indexed by (column, column)
# pairs, then order the pairs from the most negative to the most positive.
correlation_matrix = df_numeric.corr()
corr_pairs = correlation_matrix.unstack()
sorted_pairs = corr_pairs.sort_values()

sorted_pairs

genre          budget          -0.368900
budget         genre           -0.368900
genre          gross           -0.244380
gross          genre           -0.244380
rating         budget          -0.186892
budget         rating          -0.186892
rating         gross           -0.168814
gross          rating          -0.168814
votes          genre           -0.136457
genre          votes           -0.136457
company        rating          -0.082189
rating         company         -0.082189
country        released_year   -0.075073
released_year  country         -0.075073
company        genre           -0.073293
genre          company         -0.073293
year           genre           -0.073026
genre          year            -0.073026
year           country         -0.066662
country        year            -0.066662
genre          runtime         -0.060186
runtime        genre           -0.060186
released_year  genre           -0.059733
genre          released_year   -0.059733
score          c

In [101]:
# Keep only the pairs whose correlation lies strictly between 0.5 and 0.9.
# The upper bound removes each column's correlation with itself (1.0).
above_floor = sorted_pairs > 0.5
below_ceiling = sorted_pairs < 0.9
high_corr = sorted_pairs[above_floor & below_ceiling]
high_corr

# Based on these results the company has little to no correlation to the gross of the film.

gross   votes     0.614904
votes   gross     0.614904
budget  gross     0.740395
gross   budget    0.740395
dtype: float64