In [1]:
# ********************** Python Data Correlation Project **************************
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


# Import libraries

# YT video: https://www.youtube.com/watch?v=iPYVYBtUTyE&list=PLUaB-1hjhk8H48Pj32z4GZgGWyylqv85f&index=4
import pandas as pd
import seaborn as sns
import numpy as np

import matplotlib
import matplotlib.pyplot as plt
# Use the 'ggplot' style sheet for every figure created below.
plt.style.use('ggplot')
# NOTE(review): `figure` is not referenced in the visible cells — confirm it is still needed.
from matplotlib.pyplot import figure

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default figure size as (width, height) in inches for the plots created below.
matplotlib.rcParams['figure.figsize'] = (12 , 8)

In [2]:
# Load the movies dataset and preview the first three rows.
movies_csv = '../input/movies/movies.csv'
df = pd.read_csv(movies_csv)
df.head(n=3)

In [3]:
# Dimensions of the dataset as a (rows, columns) tuple.
df.shape

In [4]:
# Summary of the frame: column names, dtypes and non-null counts.
# A non-null count lower than the row count means that column has missing values.
df.info()

In [5]:
# Report the percentage of missing values in every column.
# isnull().mean() is a fraction between 0 and 1, so scale it by 100 to make
# the printed "%" sign accurate. Any non-zero value means the column has nulls.
for col in df.columns:
    pct_missing = df[col].isnull().mean() * 100
    print('{} - {:.2f}%'.format(col, pct_missing))

In [6]:
# Remove every row that contains at least one missing value
# (dropna's defaults are row-wise, how='any').
df = df.dropna()

In [7]:
# Basic data cleaning: inspect the dtype of each column to decide
# which columns need converting.
df.dtypes

In [8]:
# 'released' and 'year' do not always match, so derive the year from the
# release date into a new column 'year_correct' (kept as a string).
# assumes 'released' looks like "June 13, 1980 (United States)" — TODO confirm
# Take the first 4-digit run instead of relying on comma position, so values
# without a "Month D, " prefix or with a comma inside the country still yield
# the year. Rows with no 4-digit run become NaN.
df['year_correct'] = df['released'].astype(str).str.extract(r'(\d{4})', expand = False)

In [9]:
# Preview the frame; 'year_correct' should now appear as the last column.
df.head(n=3)

In [10]:
# Cast the monetary columns (budget, gross) to 64-bit integers.
for money_col in ('budget', 'gross'):
    df[money_col] = df[money_col].astype('int64')

In [11]:
# Preview the cleaned data: rows with nulls were dropped and budget/gross
# were cast to int64 in the cells above.
df.head(n=3)

In [12]:
# Order the movies from highest to lowest gross revenue and show the top ten.
df = df.sort_values('gross', ascending=False)
df.head(n=10)

In [13]:
# optional: display all rows (by default pandas truncates frames longer than 60 rows, per display.max_rows)
# pd.set_option('display.max_rows', None)

In [14]:
# Check for duplicates without modifying df.
# Count the rows that are exact repeats of an earlier row.
n_duplicate_rows = int(df.duplicated().sum())
print('duplicate rows: {}'.format(n_duplicate_rows))

# Distinct company names, sorted Z -> A. The result is kept in a variable
# because a bare expression in the middle of a cell is evaluated and discarded.
unique_companies = df['company'].drop_duplicates().sort_values(ascending = False)
print('distinct companies: {}'.format(len(unique_companies)))

df.head(10)

In [15]:
# Current (rows, columns) of df. The duplicate check above does not reassign
# df, so it removes no rows.
df.shape

In [16]:
# Which column is most correlated with gross revenue?
# Candidates to try: budget vs. gross, company vs. gross.

# Scatter plot of budget (x axis) against gross revenue (y axis).
# The axis labels must follow the data passed as x and y.
plt.scatter(x = df['budget'], y = df['gross'])
plt.title('Budget vs. Gross Earnings')
plt.xlabel('Movie Budget')
plt.ylabel('Gross Earnings')
plt.show()

In [17]:
# Preview the first five rows (the default for head()).
df.head(n=5)

In [18]:
# Seaborn regression plot of gross against budget:
# blue scatter points with a red fitted line.
sns.regplot(
    data=df,
    x='budget',
    y='gross',
    scatter_kws=dict(color='blue'),
    line_kws=dict(color='red'),
)

In [19]:
# Pairwise correlation between the numeric columns.
# corr() is only defined for numeric data, and df still contains string
# columns here, so select numeric/bool columns explicitly. Calling df.corr()
# on the mixed frame raises on pandas >= 2.0, where numeric_only defaults to False.
# Default method is pearson; kendall and spearman are also available.
df.select_dtypes(include = ['number', 'bool']).corr(method = 'pearson')
# df.select_dtypes(include = ['number', 'bool']).corr(method = 'spearman') # close to pearson

In [20]:
# Heatmap of the Pearson correlation matrix; brighter colors mean higher corr.
# Only numeric/bool columns are used: df still has string columns here, and
# corr() on a mixed frame raises on pandas >= 2.0.
correlation_matrix = df.select_dtypes(include = ['number', 'bool']).corr(method = 'pearson')
sns.heatmap(correlation_matrix, annot = True)
plt.title('Correlation Matrix for Numeric Features')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [21]:
# To correlate company (and the other text columns) with gross, encode every
# non-numeric column as integer category codes.
# Work on a copy: a plain `df_numerized = df` only creates a second name for
# the same frame, so the loop would overwrite the original string values in df.
df_numerized = df.copy()
for col_name in df_numerized.columns:
    if df_numerized[col_name].dtype == 'object':
        # Each distinct value gets an integer code; missing values become -1.
        df_numerized[col_name] = df_numerized[col_name].astype('category').cat.codes

df_numerized.head(5)

In [22]:
# Correlation heatmap over the numerized data, i.e. the numeric columns plus
# the text columns encoded as category codes.
correlation_matrix = df_numerized.corr(method = 'pearson')
sns.heatmap(correlation_matrix, annot = True)
# Title distinguishes this figure from the numeric-only heatmap.
plt.title('Correlation Matrix for All Features (Categoricals Encoded)')
plt.xlabel('Movie Features')
plt.ylabel('Movie Features')
plt.show()

In [23]:
# The heatmap of the numerized data is dense, so show the same Pearson
# correlations as a plain table before narrowing them down.
df_numerized.corr(method='pearson')

In [24]:
# Flatten the correlation matrix into a Series indexed by (feature, feature)
# pairs, then order the pairs from lowest to highest correlation.
correlation_mat = df_numerized.corr(method='pearson')
corr_pairs = correlation_mat.unstack()
sorted_pairs = corr_pairs.sort_values(ascending=True)
sorted_pairs

In [25]:
# Find the highly correlated feature pairs.
# seems votes and gross revenue have high correlation
# seems company and gross revenue do not have high correlation (< 0.5)
CORR_THRESHOLD = 0.5

# Every feature correlates perfectly (1.0) with itself; drop those self-pairs
# so they do not crowd out the informative ones.
first_feature = sorted_pairs.index.get_level_values(0)
second_feature = sorted_pairs.index.get_level_values(1)
is_self_pair = first_feature == second_feature

high_corr = sorted_pairs[(sorted_pairs > CORR_THRESHOLD) & ~is_self_pair]
high_corr