# IMDB Movie Analysis

In [None]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Configure seaborn once: "ticks" axes style, with matplotlib's shorthand
# colour codes mapped onto the seaborn palette. Each sns.set call re-applies
# the full theme, so a single call yields the same final state as two.
sns.set(style="ticks", color_codes=True)

# Load the movie metadata table and preview its first rows.
data_frame = pd.read_csv("movie_metadata.csv")
data_frame.head()

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder

# Replace every column of `data_frame` with an encoded representation:
#   * 'plot_keywords' -> one multi-hot indicator vector per row
#   * any other column -> integer class labels from LabelEncoder
#
# NOTE(review): numeric columns (e.g. 'gross', 'imdb_score') are label-encoded
# too, which swaps each value for the rank of its distinct value. That
# behaviour is kept because the cells below were written against it, but it
# discards magnitudes -- confirm this is intended.
for column_name in data_frame.columns:
    column = data_frame[column_name]
    if column_name == 'plot_keywords':
        mlb = MultiLabelBinarizer()
        # Keywords are '|'-separated. pd.isna catches every missing-value
        # marker, unlike an identity comparison against np.nan.
        keywords = [
            ['UNKNOWN'] if pd.isna(entry) else entry.split('|')
            for entry in column
        ]
        binarized = mlb.fit_transform(keywords)
        plt.imshow(binarized)
        # One indicator vector per row, stored in an object column.
        class_values = pd.Series(list(binarized), index=data_frame.index)
    else:
        le = LabelEncoder()
        # Missing values become 0 in numeric columns and 'UNKNOWN' elsewhere,
        # so the encoder sees a single homogeneous type per column.
        fill_value = 0 if column.dtype.kind in 'biufc' else 'UNKNOWN'
        normalized_column = column.fillna(fill_value)
        class_values = pd.Series(le.fit_transform(normalized_column), index=data_frame.index)

    # Assign directly instead of data_frame[col].update(...): the chained
    # in-place update has no effect under pandas Copy-on-Write and would
    # leave text columns with object dtype.
    data_frame[column_name] = class_values

data_frame.head()

In [None]:
# Figures for representing some features
# Joint kernel-density estimate of 'gross' against 'imdb_score'.
# The larger font scale is applied through a context manager so it is limited
# to this figure; calling sns.set(font_scale=...) would re-apply every theme
# default and silently discard the "ticks" style chosen at start-up.
with sns.plotting_context("notebook", font_scale=2):
    # x/y are passed by keyword and the figure size through `height`: newer
    # seaborn releases reject positional x/y and the old `size` argument.
    g = sns.jointplot(x='gross', y='imdb_score', height=12, data=data_frame,
                      kind="kde", color="#10275F")
    # Leave room above the axes for the figure-level title.
    g.fig.subplots_adjust(top=0.95)
    g.fig.suptitle('KDE of Imdb_Score and Gross', size=20, weight='bold')
# NOTE: the shape of this plot differs from the Kaggle kernel's result --
# possibly because the columns were label-encoded earlier in this notebook
# (to be confirmed).

In [None]:
# Showing the pearson correlation of features
# A named context is required here: with context=None seaborn returns the
# current rc values untouched and the font_scale argument is ignored.
with sns.plotting_context("notebook", font_scale=1.25):
    f, ax = plt.subplots(figsize=(20, 20))
    ax.set_title('Pearson Correlation of Movie Features', fontdict={'weight': 'bold', 'size': 20})
    # plot_keywords features are encoded as arrays, which cannot be cast to
    # float for the correlation matrix, so that column is left out here.
    new_dataframe = data_frame.drop('plot_keywords', axis=1)
    # Draw on the axes created above rather than the implicit current axes.
    sns.heatmap(new_dataframe.astype(float).corr(), linewidths=0.25, vmax=1.0,
                square=True, annot=True, ax=ax)