<div style="display: flex;">
<img src="https://conocedores.com/wp-content/uploads/2016/06/netflixn.jpg" alt="Netflix logo" width=500 />
<img src="https://qubefilm.com/wp-content/uploads/2021/03/imdb-logo.png" alt="IMDb logo" width=500 />
</div>

<h1 style="font-size: 80px; font-weight: bold; text-align: center;">PRELIMINARY EDA</h1>

In [1]:
import pandas as pd

# Stage 1: read the combined workbook, restricted to the columns used below.
df = pd.read_excel(
    io='../../data/FULLY_COMBINED_DATASET.xlsx',
    sheet_name='FULLY_COMBINED_DATASET',
    engine='openpyxl',
    usecols=[
        'tconst', 'primaryTitle', 'originalTitle', 'startYear',
        'runtimeMinutes', 'genres', 'averageRating', 'numVotes',
        'nfShowId', 'type', 'title', 'director', 'cast', 'country',
        'dateAdded', 'rating', 'listedIn', 'description',
        'availableGlobally', 'hoursViewed',
    ],
)

# Stage 2: drop exact duplicate rows (renumbering the index), then turn the
# literal "\N" placeholder into an empty string.
df = df.drop_duplicates(ignore_index=True).replace("\\N", '')

# Stage 3: pin explicit dtypes for the numeric and date columns.
df = df.astype({
    'startYear': 'int32',
    'runtimeMinutes': 'int32',
    'averageRating': 'float64',
    'numVotes': 'int32',
    'dateAdded': 'datetime64[ns]',
    'hoursViewed': 'int32',
})

# Per-column dtype / non-null summary, with deep memory accounting.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1858 entries, 0 to 1857
Data columns (total 20 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   tconst             1858 non-null   object        
 1   primaryTitle       1858 non-null   object        
 2   originalTitle      1858 non-null   object        
 3   startYear          1858 non-null   int32         
 4   runtimeMinutes     1858 non-null   int32         
 5   genres             1858 non-null   object        
 6   averageRating      1858 non-null   float64       
 7   numVotes           1858 non-null   int32         
 8   nfShowId           1858 non-null   object        
 9   type               1858 non-null   object        
 10  title              1858 non-null   object        
 11  director           1804 non-null   object        
 12  cast               1770 non-null   object        
 13  country            1776 non-null   object        
 14  dateAdde

In [None]:
# Summary statistics for every column, one row per column.
df.describe(include='all').transpose()

# Let's look at some frequency and density graphs

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns


# Corner pairplot of df coloured by 'availableGlobally'; three-level KDE
# contours are layered over the lower-triangle scatter panels.
sns.pairplot(
    df,
    hue='availableGlobally',
    kind='scatter',
    diag_kind='hist',
    corner=True,
    plot_kws=dict(alpha=0.8, linewidth=3),
    palette=sns.color_palette('husl', n_colors=2),
).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

In [None]:
# The five most frequent 'listedIn' values.
gS = df['listedIn'].value_counts().head(5).index.to_list()

# Rows belonging to those five values; 'listedIn' is cast to object and then
# to category so the categories are rebuilt from the values that remain.
splot_df = (
    df.loc[df['listedIn'].isin(gS), :]
    .astype({'listedIn': 'object'})
    .astype({'listedIn': 'category'})
)

# Show the resulting category labels.
splot_df['listedIn'].cat.categories.tolist()

In [None]:
# Full pairplot of the top-five 'listedIn' subset, one colour per value,
# with three-level KDE contours on the lower-triangle panels.
sns.pairplot(
    data=splot_df,
    kind='scatter',
    hue='listedIn',
    palette=sns.color_palette('husl', n_colors=5),
    plot_kws=dict(alpha=0.5),
).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

## Let's create separate subsets for movie and TV content, respectively.

In [None]:
# Independent per-type copies of df: one for TV shows, one for movies.
tv_df = df.loc[df['type'].eq('tv show')].copy()
mv_df = df.loc[df['type'].eq('movie')].copy()

## We'll run the pairplot again using just the TV data.

In [None]:
# Corner pairplot of TV titles restricted to:
#   40 <= runtimeMinutes <= 180, 1940 <= numVotes <= 1000000, startYear >= 2010
# coloured by 'availableGlobally', with KDE contours on the lower panels.
sns.pairplot(
    data=tv_df.loc[
        tv_df['runtimeMinutes'].between(40, 180)
        & tv_df['numVotes'].between(1940, 1000000)
        & tv_df['startYear'].ge(2010)
    ],
    hue='availableGlobally',
    kind='scatter',
    diag_kind='hist',
    corner=True,
    plot_kws=dict(alpha=0.8, linewidth=3),
    palette=sns.color_palette('husl', n_colors=2),
).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

## Now just films

In [None]:
# Corner pairplot of movie titles restricted to:
#   40 <= runtimeMinutes <= 180, 1940 <= numVotes <= 1000000, startYear >= 2010
# coloured by 'availableGlobally', with KDE contours on the lower panels.
sns.pairplot(
    data=mv_df.loc[
        mv_df['runtimeMinutes'].between(40, 180)
        & mv_df['numVotes'].between(1940, 1000000)
        & mv_df['startYear'].ge(2010)
    ],
    hue='availableGlobally',
    kind='scatter',
    diag_kind='hist',
    corner=True,
    plot_kws=dict(alpha=0.8, linewidth=3),
    palette=sns.color_palette('husl', n_colors=2),
).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

## Hued on 'country'

In [None]:
# Temporary frame holding only the rows whose 'country' is one of the five
# most frequent values. 'country' is cast to object and then to category so
# the category list contains just those five values.
tmpdf = df.loc[
    df['country'].isin(
        df['country'].value_counts().head(5).index
    ),
    :
].astype({
    'country': 'object'
}).astype({
    'country': 'category'
})


# Corner pairplot coloured by country. The row filter is computed from tmpdf
# itself, so the boolean mask has exactly the same index as the frame it
# selects from and does not rely on pandas realigning a mask built on df.
sns.pairplot(
    data=tmpdf.loc[
        (
            tmpdf['runtimeMinutes'] >= 40
        ) & (
            tmpdf['runtimeMinutes'] <= 180
        ) & (
            tmpdf['numVotes'] <= 1000000
        ) & (
            tmpdf['startYear'] >= 2010
        ) & (
            tmpdf['numVotes'] >= 1940
        )
    ],
    kind='scatter',
    hue='country',
    diag_kind='hist',
    plot_kws={
        'alpha': 0.8,
        'linewidth': 3
    },
    corner=True,
    palette=sns.color_palette(
        'husl',
        n_colors=5
        )
    ).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

In [None]:
# TV titles whose 'listedIn' text contains "horror" (case-insensitive;
# missing values are treated as non-matching).
horror_tv_df = tv_df.loc[
    tv_df['listedIn'].str.contains('horror', case=False, na=False)
].copy()

# Number of matching rows.
len(horror_tv_df.index)

In [None]:
# Movie titles whose 'listedIn' text contains "horror" (case-insensitive;
# missing values are treated as non-matching).
horror_mv_df = mv_df.loc[
    mv_df['listedIn'].str.contains('horror', case=False, na=False)
].copy()

# Number of matching rows.
len(horror_mv_df.index)

# Looking at just the movies, how stable is the 'availableGlobally' phenomenon?

In [None]:
# Movies only: votes against rating, coloured by 'availableGlobally'.
mv_df.plot.scatter(
    x='numVotes',
    y='averageRating',
    c='availableGlobally',
    cmap='viridis',
    alpha=0.8,
    figsize=(12, 8),
    title='Number of Votes vs. Average Rating',
)

plt.show();

# And for only TV content?

In [None]:
# TV titles only: votes against rating, coloured by 'availableGlobally'.
tv_df.plot.scatter(
    x='numVotes',
    y='averageRating',
    c='availableGlobally',
    cmap='viridis',
    alpha=0.8,
    figsize=(12, 8),
    title='Number of Votes vs. Average Rating',
)

plt.show();

<hr />

<hr />

# We'll distill the 'listedIn' column into the following macro-categories: _action_, _comedy_, _documentary_, _drama_, _family_, _horror_, _romance_.  There will initially be overlap amongst the subsets, but this will be addressed, as needed, in an ongoing fashion.

In [None]:
def _macro_subset(frame, pattern, label):
    """Return a relabelled copy of the rows whose 'listedIn' matches ``pattern``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Source frame; must have a string-like 'listedIn' column.
    pattern : str
        Pattern passed to ``Series.str.contains`` (case-sensitive, regex
        semantics, as in the original per-genre filters).
    label : str
        Macro-category name written into 'listedIn' for every selected row.

    Returns
    -------
    pandas.DataFrame
        A new frame with the original index labels; ``frame`` is not modified.
    """
    # na=False: a missing 'listedIn' counts as "no match" instead of putting
    # NaN into the boolean mask, which .loc would reject.
    matches = frame['listedIn'].str.contains(pattern, na=False)
    return frame.loc[matches, :].copy().assign(listedIn=label)


# One frame per macro-category. A title can land in several of them when its
# 'listedIn' text matches more than one pattern.
action_df = _macro_subset(df, 'action &', 'action')
comedy_df = _macro_subset(df, 'comedies', 'comedy')
documentary_df = _macro_subset(df, 'documentaries', 'documentary')
drama_df = _macro_subset(df, 'drama', 'drama')
family_df = _macro_subset(df, 'children', 'family')
horror_df = _macro_subset(df, 'horror', 'horror')
romance_df = _macro_subset(df, 'romance', 'romance')

# Stack the seven subsets. The index labels from df are deliberately kept
# (no ignore_index), so a row that matched several categories appears once per
# category under the same index label.
macro_df = pd.concat([
    action_df,
    comedy_df,
    documentary_df,
    drama_df,
    family_df,
    horror_df,
    romance_df
]).astype({'listedIn': 'category'})

macro_df.sample(5)

In [None]:
# Rows of macro_df labelled with the 'drama' macro-category.
macro_df.loc[macro_df['listedIn'].eq('drama')]

# Let's see if this broad brush approach yields any useful insights.

In [None]:
# Corner pairplot of macro_df coloured by macro-category, restricted to
#   40 <= runtimeMinutes <= 180, 1940 <= numVotes <= 1000000, startYear >= 2010.
sns.pairplot(
    data=macro_df.loc[
        (
            macro_df['runtimeMinutes'] >= 40
        ) & (
            macro_df['runtimeMinutes'] <= 180
        ) & (
            macro_df['numVotes'] <= 1000000
        ) & (
            macro_df['startYear'] >= 2010
        ) & (
            macro_df['numVotes'] >= 1940
        )
    ],
    kind='scatter',
    hue='listedIn',
    diag_kind='hist',
    plot_kws={
        'alpha': 0.5,
        'linewidth': 4
    },
    corner=True,
    # Pass the palette by name so seaborn generates one husl colour per hue
    # level. A fixed list of 6 colours is one short for the seven
    # macro-categories and makes two of them share a colour.
    palette='husl'
    ).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

# Let's increase the contrast between genres by removing duplicates among categories.  This is a first, rough pass, and should be refined based on initial examination.

In [None]:
# macro_df without its 'listedIn' label column, with exact duplicate rows removed.
dropped = macro_df.drop('listedIn', axis=1).drop_duplicates()

# Re-attach the remaining columns by joining on the index labels of both sides:
#   left  (l): the first 8 columns of `dropped`
#   right (r): the columns of macro_df from position 8 onward
# The walrus assignments bind `l` and `r` so their indexes can be passed as the
# join keys inside the same call. The join key comes back as a 'key_0' column,
# which is dropped at the end of the chain.
dropped_df = pd.merge(
    left=(l := dropped.iloc[:, :8]),
    right=(r := macro_df.iloc[:, 8:]),
    left_on=l.index,
    right_on=r.index
# keep=False discards EVERY row whose 'title' occurs more than once, then the
# index is renumbered.
# NOTE(review): presumably the repeated titles are those that matched several
# macro-categories; distinct works that share a title would be removed as
# well -- confirm this is acceptable.
).drop_duplicates(
    subset=[
        'title'
    ],
    keep=False,
    ignore_index=True
).drop('key_0',
       axis=1
      )

# Per-column dtype / non-null summary, with deep memory accounting.
dropped_df.info(
    memory_usage='deep'
)

In [None]:
# Count of rows whose 'primaryTitle' repeats an earlier row's value.
dropped_df['primaryTitle'].duplicated().sum()

## Pairplots

### hue = 'type'

In [None]:
# Corner pairplot of dropped_df coloured by 'type', restricted to:
#   40 <= runtimeMinutes <= 180, 1940 <= numVotes <= 1000000, startYear >= 2010
sns.pairplot(
    data=dropped_df.loc[
        dropped_df['runtimeMinutes'].between(40, 180)
        & dropped_df['numVotes'].between(1940, 1000000)
        & dropped_df['startYear'].ge(2010)
    ],
    hue='type',
    kind='scatter',
    diag_kind='hist',
    corner=True,
    plot_kws=dict(alpha=0.5, linewidth=4),
    palette=sns.color_palette('husl', n_colors=2),
).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

### hue = 'availableGlobally'

In [None]:
# Corner pairplot of dropped_df coloured by 'availableGlobally', restricted to:
#   40 <= runtimeMinutes <= 180, 1940 <= numVotes <= 1000000, startYear >= 2010
sns.pairplot(
    data=dropped_df.loc[
        dropped_df['runtimeMinutes'].between(40, 180)
        & dropped_df['numVotes'].between(1940, 1000000)
        & dropped_df['startYear'].ge(2010)
    ],
    hue='availableGlobally',
    kind='scatter',
    diag_kind='hist',
    corner=True,
    plot_kws=dict(alpha=0.5, linewidth=4),
    palette=sns.color_palette('husl', n_colors=2),
).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

### hue = 'listedIn'

In [None]:
# Corner pairplot of dropped_df coloured by macro-category, restricted to
#   40 <= runtimeMinutes <= 180, 1940 <= numVotes <= 1000000, startYear >= 2010.
sns.pairplot(
    data=dropped_df.loc[
        (
            dropped_df['runtimeMinutes'] >= 40
        ) & (
            dropped_df['runtimeMinutes'] <= 180
        ) & (
            dropped_df['numVotes'] <= 1000000
        ) & (
            dropped_df['startYear'] >= 2010
        ) & (
            dropped_df['numVotes'] >= 1940
        )
    ],
    kind='scatter',
    hue='listedIn',
    diag_kind='hist',
    plot_kws={
        'alpha': 0.5,
        'linewidth': 4
    },
    corner=True,
    # Palette given by name: seaborn sizes it to the number of hue levels, so
    # every macro-category gets its own colour. A fixed 6-colour list is one
    # short when all seven categories are present.
    palette='husl'
    ).map_lower(sns.kdeplot, levels=3, color=".2")

plt.show();

# Preliminary Zoom on Dense Clusters

## Movie Titles

In [None]:
# Zoomed corner pairplot of movies in dropped_df, coloured by macro-category.
# Filter: averageRating >= 5, 80 <= runtimeMinutes <= 150,
#         0 < hoursViewed < 2000000, 0 < numVotes <= 15000, startYear >= 2020.
# The constant 'type' column is dropped after filtering.
sns.pairplot(
    data=dropped_df.loc[(
        dropped_df['type'] == 'movie'
    ) & (
        dropped_df['averageRating'] >= 5
    ) & (
        dropped_df['runtimeMinutes'] >= 80
    ) & (
        dropped_df['runtimeMinutes'] <= 150
    ) & (
        dropped_df['hoursViewed'] < 2000000
    ) & (
        dropped_df['hoursViewed'] > 0
    ) & (
        dropped_df['numVotes'] > 0
    ) & (
        dropped_df['numVotes'] <= 15000
    ) & (
        dropped_df['startYear'] >= 2020
    ), :].drop(
        labels='type',
        axis=1
    ),
    kind='scatter',
    hue='listedIn',
    diag_kind='hist',
    plot_kws={
        'alpha': 0.5,
        'linewidth': 4,
        },
    corner=True,
    # Palette given by name so seaborn produces one colour per hue level.
    # A fixed 6-colour list repeats a colour when all seven macro-categories
    # are present.
    palette='husl'
    ).map_lower(
        sns.kdeplot,
        levels=3,
        color=".2",
        # Sparse subsets can have zero variance; silence the KDE warning.
        warn_singular=False
    )

plt.show();

## TV Titles

In [None]:
# Zoomed corner pairplot of TV shows in dropped_df, coloured by macro-category.
# Filter: type == 'tv show', 0 < hoursViewed < 8000000, 0 < numVotes <= 50000.
# All conditions are combined into ONE mask over dropped_df's own index.
# Previously the second mask was applied to an already-filtered frame and
# depended on pandas realigning it. The constant 'type' column is dropped
# after filtering.
sns.pairplot(
    data=dropped_df.astype({'type': 'object'}).loc[
    (
        dropped_df['type'] == 'tv show'
    ) & (
        dropped_df['hoursViewed'] < 8000000
    ) & (
        dropped_df['hoursViewed'] > 0
    ) & (
        dropped_df['numVotes'] > 0
    ) & (
        dropped_df['numVotes'] <= 50000
    ), :].drop(
        labels='type',
        axis=1
    ),
    kind='scatter',
    hue='listedIn',
    diag_kind='hist',
    plot_kws={
        'alpha': 0.5,
        'linewidth': 4,
        },
    corner=True,
    # Palette given by name so seaborn produces one colour per hue level.
    # A fixed 6-colour list repeats a colour when all seven macro-categories
    # are present.
    palette='husl'
    ).map_lower(
        sns.kdeplot,
        levels=3,
        color=".2",
        # Sparse subsets can have zero variance; silence the KDE warning.
        warn_singular=False
    )

plt.show();

<h1>TODO:</h1>
<ul>
    <li><h3>Reconfigure categories for macro_df</h3></li>
    <li><h3>Reconfigure categories for dropped_df</h3></li>
    <li><h3>Investigate density clusters identified by kde mapping</h3></li>
</ul>

<hr />

# Explore and Merge Cast Ratings Data

In [None]:
# Load the cast list workbook, drop exact duplicate rows, and discard every row
# that has a missing value in any column before pinning dtypes.
cast_df = pd.read_excel(
    '../../data/CAST_LIST-firstHalf.xlsx',
    engine='openpyxl'
).drop_duplicates(
    ignore_index=True
).replace(
    # Map the "\N" placeholder to a real missing value. Replacing it with ''
    # hid those rows from dropna() and then broke the int32 casts below.
    '\\N',
    pd.NA
).dropna(
).astype({
    'actor': 'str',
    'nconst': 'str',
    'rating': 'int32',
    'ratingChange': 'int32'
})

# Make the actor column lowercase
cast_df['actor'] = cast_df['actor'].str.lower()

# Check data types and memory usage
cast_df.info(memory_usage='deep')

In [None]:
# Box plots of both cast metrics named in the title. Previously only
# 'ratingChange' was drawn although the title announced rating as well.
cast_df[['rating', 'ratingChange']].plot(
    kind='box',
    figsize=(12, 8),
    title='Rating and Rating Change Boxplot'
);

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

fig, ax = plt.subplots(1, 1)

# Overlay the two distributions on one axis. Each series carries a label so
# the legend below tells the reader which colour is which.
sns.histplot(
    data=cast_df,
    x='rating',
    kde=True,
    bins=20,
    color='skyblue',
    alpha=0.8,
    label='rating',
    ax=ax
)

sns.histplot(
    data=cast_df,
    x='ratingChange',
    kde=True,
    bins=20,
    color='red',
    alpha=0.8,
    label='ratingChange',
    ax=ax
)

# Set the title and labels
ax.set_title('Rating and Rating Change Distribution')
ax.set_xlabel('Rating and Rating Change')
ax.set_ylabel('Frequency')
ax.legend()

plt.show();

In [None]:
# Movies of at most 120 minutes, reduced to the two plotted columns.
short_movies = df.loc[
    (df['type'] == 'movie') & (df['runtimeMinutes'] <= 120),
    ['runtimeMinutes', 'numVotes']
]

# Scatter runtime (x) against votes (y). Passing the two-column frame straight
# to plt.plot drew each column as a line over the row index, which did not
# match the title or axis labels below.
plt.scatter(
    short_movies['runtimeMinutes'],
    short_movies['numVotes'],
    alpha=0.8
)

# Set the title and labels
plt.title('Runtime vs. Number of Votes')
plt.xlabel('Runtime')
plt.ylabel('Number of Votes')

plt.show();

In [None]:
# The five most frequent 'director' values.
top5Directors = df.director.value_counts().head(5).index.to_list()

# Titles by those directors; 'director' becomes a categorical so it can be
# used as the colour dimension of the scatter plot.
director_df = df.loc[df.director.isin(top5Directors), :].copy().astype({'director': 'category'})

# Restrict to runtimes of at most 120 minutes. The mask is computed on
# director_df itself so its index matches the frame being filtered; it used to
# be computed on df and relied on implicit realignment.
director_df.loc[director_df.runtimeMinutes <= 120, :].plot(
    kind='scatter',
    x='runtimeMinutes',
    y='numVotes',
    alpha=0.8,
    c='director',
    cmap='viridis',
    figsize=(12, 8),
    title='Runtime vs. Number of Votes'
)

plt.show();

In [None]:
# Copy of df with 'type' as a categorical so it can drive the point colour.
# astype already returns a new frame, so no extra .copy() is needed.
pltdf = df.astype({'type': 'category'})

# Runtime (x) against votes (y), coloured by title type. The title now matches
# the plotted axes; it was previously copied from the votes-vs-rating plots.
pltdf.plot(
    kind='scatter',
    x='runtimeMinutes',
    y='numVotes',
    alpha=0.8,
    c='type',
    cmap='viridis',
    figsize=(12, 8),
    title='Runtime vs. Number of Votes'
)

plt.show();

In [2]:
from sklearn.decomposition import PCA

# PCA only accepts numeric input, and df also holds text and datetime columns,
# so keep the numeric columns and drop rows with missing values before fitting.
# NOTE(review): features are left unscaled, so columns with large magnitudes
# will dominate the components -- consider standardising first.
numeric_features = df.select_dtypes(include='number').dropna()

# Create a PCA model
pca = PCA()

# Fit the model
pca.fit(numeric_features)

ModuleNotFoundError: No module named 'sklearn'

In [5]:
from sklearn import *

ModuleNotFoundError: No module named 'sklearn'