In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from scipy import stats
import seaborn as sns
from statsmodels.stats.multicomp import pairwise_tukeyhsd


In [None]:
# Load the ratings dataset (comma is read_csv's default separator).
df = pd.read_csv('netflix_ratings_data.csv')

In [None]:
# Turn each comma-separated 'genres' string into a list of genre tokens.
df['splitted_genre'] = df['genres'].str.split(pat=',')
# Single-column frame ('col') with one row per individual genre token.
genre_df = df['splitted_genre'].rename('col').to_frame().explode('col')

In [None]:
# Keep each distinct genre value once, then remove the row(s) labelled 372.
# NOTE(review): 372 is a hard-coded index label; presumably it marks an unwanted
# entry (e.g. a missing genre) -- confirm. It raises KeyError if the label is absent.
genre_df = genre_df.drop_duplicates().drop(index=[372])
# Same rows with a fresh 0..n-1 index.
genre_data = genre_df.reset_index(drop=True)

In [None]:
# Title together with its list of genres.
title_genre = df.loc[:, ['title', 'splitted_genre']]

In [None]:
# explode() returns a new frame, so title_genre itself is left untouched.
title_genre_2 = title_genre.explode('splitted_genre')

In [None]:
# Number of exploded rows per genre, as a two-column frame: splitted_genre, counts.
genre_title_count = (
    title_genre_2
    .groupby('splitted_genre')
    .size()
    .rename('counts')
    .reset_index()
)

In [None]:
# Order genres from the fewest rows to the most (ascending is the default).
genre_title_count = genre_title_count.sort_values('counts')

In [None]:
# Show the per-genre counts table (a bare last expression is rendered as the cell output).
genre_title_count

In [None]:
# Horizontal bar chart of the number of rows per genre, saved to
# 'bar_plot_title_distribution.png'.
# Reference: https://www.geeksforgeeks.org/bar-plot-in-matplotlib/
genre = genre_title_count['splitted_genre']
number_of_titles = genre_title_count['counts']

# Configure tick-label sizes BEFORE the axes are created: rcParams changed after
# the axes exist are not guaranteed to reach them, so on a fresh kernel the
# original ordering left this figure with default-sized tick labels.
# (These are global settings and still apply to figures created later.)
plt.rc('xtick', labelsize=20)
plt.rc('ytick', labelsize=20)

fig, ax = plt.subplots(figsize=(25, 17))
ax.barh(genre, number_of_titles)

# Hide the tick marks and push the tick labels away from the axes.
ax.xaxis.set_ticks_position('none')
ax.yaxis.set_ticks_position('none')
ax.xaxis.set_tick_params(pad=5)
ax.yaxis.set_tick_params(pad=10)

# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in Matplotlib 3.5 and `b=` is rejected by current releases, while
# the positional form works on old and new versions alike.
ax.grid(True, color='blue',
        linestyle='-.', linewidth=0.5,
        alpha=0.4)

# Annotate every bar with its value, just past the end of the bar.
for bar in ax.patches:
    ax.text(bar.get_width() + 0.2, bar.get_y() + 0.5,
            str(round(bar.get_width(), 2)),
            fontsize=14, fontweight='bold',
            color='grey')

ax.set_title('Distribution of movies and TV shows by genre', loc='center', fontsize=22)
plt.savefig('bar_plot_title_distribution.png')
plt.show()



In [None]:
# Genre list, rating and year per row, exploded to one row per individual genre.
title_genre_4 = (
    df.loc[:, ['splitted_genre', 'averageRating', 'year']]
    .explode('splitted_genre')
)
# Number of exploded rows for each value of 'year'.
yearly_groupped = (
    title_genre_4
    .groupby('year')
    .size()
    .rename('count')
    .reset_index()
)

In [None]:
# Keep yearly counts strictly after 2000.
yearly_groupped = yearly_groupped.loc[yearly_groupped['year'].gt(2000)]
# Keep rows strictly after 2008 (2008 itself is excluded) and index them by year.
subset_year = title_genre_4.loc[title_genre_4['year'].gt(2008)].set_index('year')

In [None]:
# Mean averageRating for every (genre, year) pair, flattened back to columns.
subset_year = (
    subset_year
    .groupby(['splitted_genre', 'year'])['averageRating']
    .mean()
    .reset_index()
)


In [None]:
# Index by genre so genres can be selected by label below.
# Note: this consumes the 'splitted_genre' column, so re-running the cell on the
# already-indexed frame raises KeyError.
subset_year = subset_year.set_index(keys='splitted_genre')

In [None]:
# Genres kept for the comparison; .loc with a label list returns the rows in
# this order and raises KeyError if a label is missing from the index.
selected_genres = [
    'Action', 'Comedy', 'Drama', 'Documentary', 'Crime',
    'Romance', 'Thriller', 'Adventure', 'Animation',
]
subset_year_2 = subset_year.loc[selected_genres].reset_index()

In [None]:
list_p = []
genre_list = [
    'Action', 'Comedy', 'Drama', 'Documentary', 'Crime',
    'Romance', 'Thriller', 'Adventure', 'Animation',
]
# Print the D'Agostino-Pearson normality p-value of the yearly mean ratings per genre.
# NOTE(review): each genre presumably has only one value per year after 2008, i.e. a
# small sample; scipy warns that normaltest's kurtosis part is only valid for
# n >= 20 -- confirm the test is appropriate here.
for genre_name in genre_list:
    is_genre = subset_year_2['splitted_genre'] == genre_name
    subset_genre = subset_year_2.loc[is_genre]
    normal_cal = stats.normaltest(subset_genre['averageRating']).pvalue
    print(genre_name, normal_cal)


In [None]:
# We can see that Comedy and Crime data is not normal, so plot their distributions.
comedy_df = subset_year_2[subset_year_2['splitted_genre'] == 'Comedy'].drop(columns=['year'])
crime_df = subset_year_2[subset_year_2['splitted_genre'] == 'Crime'].drop(columns=['year'])

fig, axes = plt.subplots(1, 2, figsize=(10, 3), sharey=True, dpi=100)
# sns.distplot has been deprecated since seaborn 0.11. Its documented replacement
# for a density histogram with a KDE overlay is histplot(kde=True, stat='density');
# kde_kws=dict(cut=3) keeps the KDE curve extending past the data as distplot did.
sns.histplot(comedy_df['averageRating'], kde=True, stat='density',
             kde_kws=dict(cut=3), color="dodgerblue", ax=axes[0])
axes[0].set_xlabel('Comedy')  # histplot has no axlabel argument
sns.histplot(crime_df['averageRating'], kde=True, stat='density',
             kde_kws=dict(cut=3), color="deeppink", ax=axes[1])
axes[1].set_xlabel('Crime')


In [None]:
genre_list = [
    'Action', 'Comedy', 'Drama', 'Documentary', 'Crime',
    'Romance', 'Thriller', 'Adventure', 'Animation',
]


def _ratings_without_year(genre_name):
    """Return the rows of subset_year_2 for one genre, with the 'year' column dropped."""
    rows = subset_year_2.loc[subset_year_2['splitted_genre'] == genre_name]
    return rows.drop(columns=['year'])


# One frame per genre used in the variance / ANOVA comparison below.
action_df = _ratings_without_year('Action')
drama_df = _ratings_without_year('Drama')
documentary_df = _ratings_without_year('Documentary')
romance_df = _ratings_without_year('Romance')
thriller_df = _ratings_without_year('Thriller')
adventure_df = _ratings_without_year('Adventure')
animation_df = _ratings_without_year('Animation')


In [None]:
# Levene's test for equal variances across the seven genres' ratings.
levene_samples = [
    frame['averageRating']
    for frame in (action_df, drama_df, documentary_df, romance_df,
                  thriller_df, adventure_df, animation_df)
]
initial_levene_p = stats.levene(*levene_samples).pvalue
print(initial_levene_p)

In [None]:
# They have equal variance => proceed with a one-way ANOVA, followed by a
# Tukey HSD post-hoc comparison and a plot saved to 'posthoc.png'.
rating_groups = {
    'action': action_df['averageRating'],
    'drama': drama_df['averageRating'],
    'documentary': documentary_df['averageRating'],
    'romance': romance_df['averageRating'],
    'thriller': thriller_df['averageRating'],
    'adventure': adventure_df['averageRating'],
    'animation': animation_df['averageRating'],
}

anova = stats.f_oneway(*rating_groups.values())
print("Anova p-value: ", anova.pvalue)

# Wide table, one column per genre. Each column is wrapped in a Series with a
# fresh 0..n-1 index so that genres with a different number of rows are padded
# with NaN; building the frame from raw lists of unequal length raises ValueError.
data = pd.DataFrame({
    name: pd.Series(ratings.tolist())
    for name, ratings in rating_groups.items()
})

# Long format (variable = genre, value = rating); drop the NaN padding so it
# never reaches the post-hoc test. With equal-length groups this is a no-op.
melted = pd.melt(data).dropna(subset=['value'])
posthoc = pairwise_tukeyhsd(
    melted['value'], melted['variable'],
    alpha=0.05)
print(posthoc)

# Explicit figure/axes instead of relying on whichever figure happens to be current.
fig, ax = plt.subplots()
ax.yaxis.label.set_size(16)
ax.xaxis.label.set_size(18)
# The on/off flag is passed positionally: its keyword was renamed from `b` to
# `visible` in Matplotlib 3.5 and `b=` is rejected by current releases.
ax.grid(True, color='blue',
        linestyle='-.', linewidth=0.5,
        alpha=0.4)
fig = posthoc.plot_simultaneous(ax=ax, figsize=(20, 14))
# NOTE(review): the upstream filter is `year > 2008`, which excludes 2008 itself --
# confirm the year range stated in this title.
fig.suptitle('Average score of selected genre comparison from 2008 to 2021', fontsize=25)
fig.savefig("posthoc.png")