In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
pd.set_option('display.max_rows',500)

### General functions

In [4]:
def df_frag(column, value):
    return data[data[column] == value]

In [5]:
def get_nulls(df):
    return df[df['imdb_rating'].isnull()].shape

In [150]:
def drop_column(df,column):
    return df.drop(column, axis=1, inplcae = True)

### General Analysis

In [6]:
data = pd.read_csv('data\\notebook_df\\final_df.csv')

### Division between Series and Movies

In [11]:
movies = df_frag('type', 'Movie')

In [12]:
movies.shape

(14997, 14)

In [13]:
tv_shows = df_frag('type', 'TV Show')

In [126]:
tv_shows.shape

(4928, 14)

### Division bewteen streaming platforms

#### Netflix

In [8]:
netflix = df_frag('platform', 'netflix')

In [128]:
netflix.shape

(8807, 14)

In [132]:
netflix_len = len(netflix)

In [129]:
get_nulls(netflix)

(1849, 14)

#### Amazon

In [10]:
amazon = df_frag('platform', 'amazon')

In [11]:
amazon.shape

(9668, 14)

In [12]:
amazon_nulls = pd.read_csv('data\\notebook_df\\amazon_null.csv')

In [13]:
amazon_f = pd.concat([amazon, amazon_nulls])

In [14]:
amazon = amazon_f.drop_duplicates(subset='title', keep='last')

In [15]:
amazon.shape

(9668, 14)

In [16]:
get_nulls(amazon)

(1983, 14)

#### Disney

In [17]:
disney = df_frag('platform', 'disney')

In [18]:
disney.shape

(1450, 14)

In [20]:
get_nulls(disney)

(339, 14)

#### Final Data 

In [21]:
final_data = pd.concat([netflix, amazon, disney])

In [22]:
get_nulls(final_data)

(4171, 14)

### Analysis

In [24]:
final_data.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,platform,imdb_rating
0,s276,TV Show,The Kingdom,,"Chino Darín, Nancy Dupláa, Joaquín Furriel, Pe...",Argentina,"August 13, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, Spanis...","After his running mate's murder, a controversi...",netflix,
1,s277,TV Show,Valeria,Inma Torrente,"Diana Gómez, Silma López, Paula Malia, Teresa ...",Spain,"August 13, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, Spa...",A writer in creative and marital crises finds ...,netflix,
2,s278,TV Show,AlRawabi School for Girls,,"Andria Tayeh, Rakeen Sa'ad, Noor Taher, Yara M...",Jordan,"August 12, 2021",2021,TV-14,1 Season,"International TV Shows, TV Dramas, Teen TV Shows",The bullied outcasts at prestigious Al Rawabi ...,netflix,74.0
3,s279,Movie,Lokillo: Nothing's the Same,Julián Gaviria,Lokillo Florez,Colombia,"August 12, 2021",2021,TV-MA,64 min,Stand-Up Comedy,"Through songs and puns, comedian Lokillo Flore...",netflix,46.0
4,s280,Movie,Monster Hunter: Legends of the Guild,Steven Yamamoto,"Ben Rausch, Erica Lindbeck, Dante Basco, Brand...","United States, Japan","August 12, 2021",2021,TV-PG,59 min,"Action & Adventure, Anime Features, Children &...",In a world where humans and fearsome monsters ...,netflix,52.0


In [25]:
final_data.dtypes

show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
rating          object
duration        object
listed_in       object
description     object
platform        object
imdb_rating     object
dtype: object

#### Data count

In [28]:
final_data.type.value_counts()

Movie      14997
TV Show     4928
Name: type, dtype: int64

In [27]:
countries_list = list(final_data.country.unique())
# Apart from individual countries there are title that have a combination of countries 

In [29]:
year_list = list(final_data.release_year.unique())

We need to change the format of imdb_rating to float

In [32]:
# The rating extrcated from IMdb came wiht a ",", so before changing its type to float we need to replace it by "."
final_data.imdb_rating = final_data.imdb_rating.replace(',','.',regex = True)

In [33]:
final_data = final_data.astype({'imdb_rating':'float64'})

In [34]:
# Lets round the rating to 1 decimal number 
final_data = final_data.round({'imdb_rating':1})

In [43]:
final_data.to_csv(r'C:\Users\juanp\proyectos_varios\streaming_platforms\data\notebook_df\streaming_platforms_data.csv' , index = False)

### Platform quality 

In [35]:
final_data.groupby('platform').agg({'imdb_rating':
                                    'mean'}).sort_values('imdb_rating', ascending = False)

Unnamed: 0_level_0,imdb_rating
platform,Unnamed: 1_level_1
disney,6.641494
netflix,6.465536
amazon,6.180247


In [36]:
final_data[final_data.type=='TV Show'].groupby('platform').agg({'imdb_rating':
                                                                'mean'}).sort_values('imdb_rating', ascending = False)

Unnamed: 0_level_0,imdb_rating
platform,Unnamed: 1_level_1
disney,7.073504
amazon,7.058696
netflix,7.057549


In [37]:
final_data[final_data.type=='Movie'].groupby('platform').agg({'imdb_rating':
                                                              'mean'}).sort_values('imdb_rating', ascending = False)

Unnamed: 0_level_0,imdb_rating
platform,Unnamed: 1_level_1
disney,6.526226
netflix,6.270837
amazon,5.987978


### What platform has the best title

In [38]:
top_100_movies = final_data[final_data.type == 'Movie'].sort_values('imdb_rating', ascending = False).head(500)

In [39]:
top_100_shows = final_data[final_data.type == 'TV Show'].sort_values('imdb_rating', ascending = False).head(500)

In [40]:
top_100_movies.groupby('platform').agg({'title':'count'})

Unnamed: 0_level_0,title
platform,Unnamed: 1_level_1
amazon,259
disney,35
netflix,206


In [41]:
top_100_shows.groupby('platform').agg({'title':'count'})

Unnamed: 0_level_0,title
platform,Unnamed: 1_level_1
amazon,225
disney,36
netflix,239


### Differentiation for analysis

In [242]:
amazon_t = final_data[(final_data.platform == 'amazon') & (~final_data.imdb_rating.isnull())]

In [252]:
amazon_movie_len = len(amazon_t[amazon_t.type == 'Movie'])

In [253]:
amazon_show_len = len(amazon_t[amazon_t.type == 'TV Show'])

In [245]:
netflix_t = final_data[(final_data.platform == 'netflix') & (~final_data.imdb_rating.isnull())]

In [256]:
netflix_movie_len = len(netflix_t[netflix_t.type == 'Movie'])

In [255]:
netflix_show_len = len(netflix_t[netflix_t.type == 'TV Show'])

In [244]:
disney_t = final_data[(final_data.platform == 'disney') & (~final_data.imdb_rating.isnull())]

In [258]:
disney_movie_len = len(disney_t[disney_t.type == 'Movie'])

In [257]:
disney_show_len = len(disney_t[disney_t.type == 'TV Show'])

### Functions for creating the final table

In [221]:
def quantile_results(df, title_type, len_df):
    movies_1q = df[(df.type == title_type) & (df.imdb_rating > 7.5)].groupby('platform').agg({
    'title':lambda series: (series).count()/len_df * 100})
    movies_2q = df[(df.type == title_type) & (df.imdb_rating <= 7.5) 
                       & (df.imdb_rating > 5.0)].groupby('platform').agg({'title':lambda series: (series).count()/len_df * 100})
    movies_3q = df[(df.type == title_type) & (df.imdb_rating <= 5.0) 
                       & (df.imdb_rating > 2.5)].groupby('platform').agg({'title':lambda series: (series).count()/len_df * 100})
    movies_4q = df[(df.type == title_type) & (df.imdb_rating <= 2.5)].groupby('platform').agg({
    'title':lambda series: (series).count()/len_df * 100})
    
    movies_q_complete = movies_1q.join(movies_2q, lsuffix='_1q', rsuffix='_2q').join(
    movies_3q, lsuffix= '_2q',rsuffix='_3q').join(movies_4q, lsuffix='_3q', rsuffix='_4q')
    
    return movies_q_complete
    

In [259]:
amazon_movie_q  = quantile_results(amazon_t, 'Movie', amazon_movie_len)
netflix_movie_q = quantile_results(netflix_t, 'Movie', netflix_movie_len)
disney_movie_q = quantile_results(disney_t, 'Movie', disney_movie_len)

In [260]:
amazon_show_q  = quantile_results(amazon_t, 'TV Show', amazon_show_len)
netflix_show_q = quantile_results(netflix_t, 'TV Show', netflix_show_len)
disney_show_q = quantile_results(disney_t, 'TV Show', disney_show_len)

In [261]:
amazon_movie_q, netflix_movie_q, disney_movie_q

(          title_1q   title_2q  title_3q  title_4q
 platform                                         
 amazon    8.459176  54.632711  16.56002  1.036601,
            title_1q   title_2q  title_3q  title_4q
 platform                                          
 netflix   11.497326  74.675325  13.42628   0.40107,
           title_1q   title_2q  title_3q  title_4q
 platform                                         
 disney    12.99886  79.475485  7.411631  0.114025)

In [262]:
amazon_show_q, netflix_show_q, disney_show_q

(           title_1q   title_2q  title_3q  title_4q
 platform                                          
 amazon    27.400216  42.448759   4.20712  0.377562,
            title_1q   title_2q  title_3q  title_4q
 platform                                          
 netflix   33.217189  62.195122  4.297329   0.29036,
            title_1q   title_2q  title_3q  title_4q
 platform                                          
 disney    38.461538  56.410256  5.128205       NaN)