Notebook created by Anisa Maharani

## **Imports**

In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.cluster.hierarchy as shc
from sklearn.cluster import KMeans, AgglomerativeClustering

## **Dataset**

In [25]:
# Load the pre-processed dataset from the working directory and preview it.
df = pd.read_csv("processed.csv")
df.head()

Unnamed: 0,show_id,type,title,director,cast,release_year,rating,duration,listed_in,description,duration_int
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,110.0
1,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",2016,13+,87 min,"Horror, Science Fiction",When a strange virus quickly spreads through a...,87.0
2,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,2008,G,1 Season,"Kids, Special Interest",A heart warming and inspiring series that welc...,1.0
3,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,2020,18+,1 Season,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,1.0
4,s20,Movie,Zoe,Drake Doremus,"Ewan McGregor, Léa Seydoux, Theo James, Mirand...",2018,R,104 min,Science Fiction,ZOE tells a tale of forbidden love between an ...,104.0


In [26]:
# Number of rows in the loaded dataset.
row_count = len(df)
row_count

8746

## **Functions**

In [27]:
def concat_df(dataset, cols, prefix, count, verbose=True):
  """Stack the columns `<prefix>0` .. `<prefix><count>` into one long column.

  For every numbered column the rows holding an empty string or a missing
  value are discarded, the column is renamed to `<prefix>0`, and all pieces
  are stacked on top of each other.

  Args:
    dataset: DataFrame holding `cols` and the numbered `<prefix><i>` columns.
    cols: identifier column names repeated on every output row.
    prefix: common prefix of the numbered columns, e.g. 'genre_'.
    count: index of the last numbered column to include (inclusive).
    verbose: when True (default), print the name of each column after the
      first one as it is processed.

  Returns:
    DataFrame with columns `cols + [<prefix>0]` and a fresh 0..n-1 index.
  """
  first_col = prefix + '0'
  pieces = []

  for i in range(count + 1):
    current_col = prefix + f'{i}'
    if verbose and i > 0:
      print(current_col)
    # Drop both "" and NaN for every column, not only the first one, so that
    # missing values in the later columns cannot leak into the result.
    piece = dataset.loc[dataset[current_col] != "", [*cols, current_col]]
    piece = piece.dropna(subset=[current_col]).rename(columns={current_col: first_col})
    pieces.append(piece)

  # Concatenate once at the end instead of re-copying the result in every iteration.
  return pd.concat(pieces, ignore_index=True)

In [28]:
def print_bounds(dataset, col, init_count):
  """Print the 1.5*IQR bounds of `dataset[col]` and its outlier counts.

  Args:
    dataset: DataFrame containing the column to inspect.
    col: name of the numeric column.
    init_count: row count used as the denominator of the outlier percentages.

  Returns:
    None; the report is written to stdout.
  """
  values = dataset[col]
  q1, q3 = (values.quantile(q=level) for level in (0.25, 0.75))
  iqr_15 = (q3 - q1) * 1.5
  lub, rub = q1 - iqr_15, q3 + iqr_15

  # Rows strictly outside the bounds count as outliers.
  total_lower = len(values[values < lub])
  total_upper = len(values[values > rub])
  low_outlier, upper_outlier = (
    (total / init_count) * 100 for total in (total_lower, total_upper)
  )

  print(f"""
Q1: {q1}
Q3: {q3}
LUB: {lub}
RUB: {rub}
Total Lower Outliers: {total_lower} ({round(low_outlier, 2)}%)
Total Upper Outliers: {total_upper} ({round(upper_outlier, 2)}%)
Total Outliers: {total_lower + total_upper}
""")

## **Split Genre**

Reference: https://practicaldatascience.co.uk/data-science/how-to-split-a-pandas-column-string-or-list-into-separate-columns

1. Transform the `listed_in` attribute into a list of genres
2. Split list into columns

### **Procedure**

In [29]:
# Split the comma-separated `listed_in` string into a list of genre names.
# NOTE(review): unlike `cast`, missing `listed_in` values are not filled first;
# presumably the column has no NaN — confirm, since the len() count further down
# would fail on a NaN entry.
df['genre_list'] = df['listed_in'].str.split(', ')
df.head()

Unnamed: 0,show_id,type,title,director,cast,release_year,rating,duration,listed_in,description,duration_int,genre_list
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,110.0,"[Drama, International]"
1,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",2016,13+,87 min,"Horror, Science Fiction",When a strange virus quickly spreads through a...,87.0,"[Horror, Science Fiction]"
2,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,2008,G,1 Season,"Kids, Special Interest",A heart warming and inspiring series that welc...,1.0,"[Kids, Special Interest]"
3,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,2020,18+,1 Season,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,1.0,"[Comedy, Talk Show and Variety]"
4,s20,Movie,Zoe,Drake Doremus,"Ewan McGregor, Léa Seydoux, Theo James, Mirand...",2018,R,104 min,Science Fiction,ZOE tells a tale of forbidden love between an ...,104.0,[Science Fiction]


In [30]:
# Expand each genre list into its own row of columns (genre_0, genre_1, ...);
# titles with fewer genres get empty strings in the unused columns.
df_genre = (
  pd.DataFrame(df['genre_list'].tolist())
    .fillna('')
    .add_prefix('genre_')
)
df_genre.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4
0,Drama,International,,,
1,Horror,Science Fiction,,,
2,Kids,Special Interest,,,
3,Comedy,Talk Show and Variety,,,
4,Science Fiction,,,,


In [31]:
# Number of genres listed for each title, in row order.
row_list = [len(genres) for genres in df['genre_list'].tolist()]

count_genre = pd.Series(row_list)
count_genre

0       2
1       2
2       2
3       2
4       1
       ..
8741    1
8742    1
8743    1
8744    2
8745    3
Length: 8746, dtype: int64

In [32]:
# Store the per-title genre count next to the genre_<i> columns.
df_genre['count_genre'] = count_genre
df_genre.head()

Unnamed: 0,genre_0,genre_1,genre_2,genre_3,genre_4,count_genre
0,Drama,International,,,,2
1,Horror,Science Fiction,,,,2
2,Kids,Special Interest,,,,2
3,Comedy,Talk Show and Variety,,,,2
4,Science Fiction,,,,,1


In [33]:
# Append the genre_<i> and count_genre columns to the main frame.
# Note: re-running this cell concatenates the same columns a second time, so it
# should be executed once per kernel session.
df = pd.concat([df, df_genre], axis=1)
df.head()

Unnamed: 0,show_id,type,title,director,cast,release_year,rating,duration,listed_in,description,duration_int,genre_list,genre_0,genre_1,genre_2,genre_3,genre_4,count_genre
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,110.0,"[Drama, International]",Drama,International,,,,2
1,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",2016,13+,87 min,"Horror, Science Fiction",When a strange virus quickly spreads through a...,87.0,"[Horror, Science Fiction]",Horror,Science Fiction,,,,2
2,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,2008,G,1 Season,"Kids, Special Interest",A heart warming and inspiring series that welc...,1.0,"[Kids, Special Interest]",Kids,Special Interest,,,,2
3,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,2020,18+,1 Season,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,1.0,"[Comedy, Talk Show and Variety]",Comedy,Talk Show and Variety,,,,2
4,s20,Movie,Zoe,Drake Doremus,"Ewan McGregor, Léa Seydoux, Theo James, Mirand...",2018,R,104 min,Science Fiction,ZOE tells a tale of forbidden love between an ...,104.0,[Science Fiction],Science Fiction,,,,,1


## **Split Cast**

Reference: https://practicaldatascience.co.uk/data-science/how-to-split-a-pandas-column-string-or-list-into-separate-columns

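1. Transform the `cast` attribute into a list of cast members
2. Split the list into columns in listed order (`cast_0`, `cast_1`, …) and cut off the columns beyond the IQR upper bound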

### **Procedure**

In [34]:
# Titles without cast information get a '-' placeholder so that the string
# split that follows always produces a list instead of NaN.
df['cast'] = df['cast'].fillna('-')

In [35]:
# Split the comma-separated `cast` string into a list of names.
df['cast_list'] = df['cast'].str.split(', ')
df.head()

Unnamed: 0,show_id,type,title,director,cast,release_year,rating,duration,listed_in,description,duration_int,genre_list,genre_0,genre_1,genre_2,genre_3,genre_4,count_genre,cast_list
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,110.0,"[Drama, International]",Drama,International,,,,2,"[Mahesh Manjrekar, Abhay Mahajan, Sachin Khede..."
1,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",2016,13+,87 min,"Horror, Science Fiction",When a strange virus quickly spreads through a...,87.0,"[Horror, Science Fiction]",Horror,Science Fiction,,,,2,"[Marcus Anderson, Kaiwi Lyman, Andrew Asper]"
2,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,2008,G,1 Season,"Kids, Special Interest",A heart warming and inspiring series that welc...,1.0,"[Kids, Special Interest]",Kids,Special Interest,,,,2,[Narrator - Gillian Barlett]
3,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,2020,18+,1 Season,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,1.0,"[Comedy, Talk Show and Variety]",Comedy,Talk Show and Variety,,,,2,[Zoë Coombs Marr]
4,s20,Movie,Zoe,Drake Doremus,"Ewan McGregor, Léa Seydoux, Theo James, Mirand...",2018,R,104 min,Science Fiction,ZOE tells a tale of forbidden love between an ...,104.0,[Science Fiction],Science Fiction,,,,,1,"[Ewan McGregor, Léa Seydoux, Theo James, Miran..."


In [36]:
# Expand each cast list into its own row of columns (cast_0, cast_1, ...);
# shorter lists are padded with empty strings.
df_cast = (
  pd.DataFrame(df['cast_list'].tolist())
    .fillna('')
    .add_prefix('cast_')
)
df_cast.head()

Unnamed: 0,cast_0,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7,cast_8,cast_9,...,cast_66,cast_67,cast_68,cast_69,cast_70,cast_71,cast_72,cast_73,cast_74,cast_75
0,Mahesh Manjrekar,Abhay Mahajan,Sachin Khedekar,,,,,,,,...,,,,,,,,,,
1,Marcus Anderson,Kaiwi Lyman,Andrew Asper,,,,,,,,...,,,,,,,,,,
2,Narrator - Gillian Barlett,,,,,,,,,,...,,,,,,,,,,
3,Zoë Coombs Marr,,,,,,,,,,...,,,,,,,,,,
4,Ewan McGregor,Léa Seydoux,Theo James,Miranda Otto,Rashida Jones,Christina Aguilera,Matthew Gray Gubler,Anthony Shim,,,...,,,,,,,,,,


In [37]:
# Number of cast entries for each title, in row order.
row_list = [len(members) for members in df['cast_list'].tolist()]

count_cast = pd.Series(row_list)
count_cast

0        3
1        3
2        1
3        1
4        8
        ..
8741     2
8742     5
8743     5
8744    12
8745     4
Length: 8746, dtype: int64

### **Column cut-off**

In [38]:
# 1.5*IQR bounds on the number of cast entries per title; the upper bound (RUB)
# serves as the cut-off for how many cast_<i> columns are kept.
q1 = count_cast.quantile(q=0.25)
q3 = count_cast.quantile(q=0.75)
iqr_15 = (q3-q1)*1.5
lub = q1-iqr_15
rub = q3+iqr_15
# Named max_cast rather than `max` so the builtin max() is not shadowed for the
# rest of the notebook session.
max_cast = count_cast.max()

info_cast = f"""
Q1: {q1}
Q3: {q3}
LUB: {lub}
RUB: {rub}
Max: {max_cast}
"""

print(info_cast)


Q1: 2.0
Q3: 6.0
LUB: -4.0
RUB: 12.0
Max: 76



Use the RUB (12) as the cut-off: keep `cast_0`–`cast_11` and drop the remaining cast columns.

In [39]:
# Number of titles with a non-empty cast_12, i.e. with more than 12 cast entries.
df_cast[df_cast['cast_12'] != ''].shape[0]

440

In [40]:
# Keep only the first 12 cast columns (cast_0 .. cast_11); 12 is the RUB of the
# cast-count distribution. Slicing by position removes every surplus column in
# one step, does not hard-code the widest cast list (76), and makes re-running
# the cell a no-op instead of a KeyError.
df_cast = df_cast.iloc[:, :12]

df_cast.head()

Unnamed: 0,cast_0,cast_1,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7,cast_8,cast_9,cast_10,cast_11
0,Mahesh Manjrekar,Abhay Mahajan,Sachin Khedekar,,,,,,,,,
1,Marcus Anderson,Kaiwi Lyman,Andrew Asper,,,,,,,,,
2,Narrator - Gillian Barlett,,,,,,,,,,,
3,Zoë Coombs Marr,,,,,,,,,,,
4,Ewan McGregor,Léa Seydoux,Theo James,Miranda Otto,Rashida Jones,Christina Aguilera,Matthew Gray Gubler,Anthony Shim,,,,


In [41]:
# Append the retained cast_<i> columns to the main frame.
# Note: re-running this cell concatenates the same columns a second time.
df = pd.concat([df, df_cast], axis=1)

In [42]:
# Preview the frame after the cast columns have been appended.
df.head()

Unnamed: 0,show_id,type,title,director,cast,release_year,rating,duration,listed_in,description,...,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7,cast_8,cast_9,cast_10,cast_11
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,...,Sachin Khedekar,,,,,,,,,
1,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",2016,13+,87 min,"Horror, Science Fiction",When a strange virus quickly spreads through a...,...,Andrew Asper,,,,,,,,,
2,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,2008,G,1 Season,"Kids, Special Interest",A heart warming and inspiring series that welc...,...,,,,,,,,,,
3,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,2020,18+,1 Season,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,...,,,,,,,,,,
4,s20,Movie,Zoe,Drake Doremus,"Ewan McGregor, Léa Seydoux, Theo James, Mirand...",2018,R,104 min,Science Fiction,ZOE tells a tale of forbidden love between an ...,...,Theo James,Miranda Otto,Rashida Jones,Christina Aguilera,Matthew Gray Gubler,Anthony Shim,,,,


## **Clean Up Residue**

### **Drop Temporary Columns**

In [43]:
# The list-valued helper columns are no longer needed once they have been
# expanded into the genre_<i> / cast_<i> columns.
df = df.drop(columns=['genre_list', 'cast_list'])
df.head()

Unnamed: 0,show_id,type,title,director,cast,release_year,rating,duration,listed_in,description,...,cast_2,cast_3,cast_4,cast_5,cast_6,cast_7,cast_8,cast_9,cast_10,cast_11
0,s2,Movie,Take Care Good Night,Girish Joshi,"Mahesh Manjrekar, Abhay Mahajan, Sachin Khedekar",2018,13+,110 min,"Drama, International",A Metro Family decides to fight a Cyber Crimin...,...,Sachin Khedekar,,,,,,,,,
1,s17,Movie,Zoombies,Glenn Miller,"Marcus Anderson, Kaiwi Lyman, Andrew Asper",2016,13+,87 min,"Horror, Science Fiction",When a strange virus quickly spreads through a...,...,Andrew Asper,,,,,,,,,
2,s18,TV Show,Zoo Babies,,Narrator - Gillian Barlett,2008,G,1 Season,"Kids, Special Interest",A heart warming and inspiring series that welc...,...,,,,,,,,,,
3,s19,TV Show,Zoë Coombs Marr: Bossy Bottom,,Zoë Coombs Marr,2020,18+,1 Season,"Comedy, Talk Show and Variety",Zoë Coombs Marr has been on hiatus. Sort of. F...,...,,,,,,,,,,
4,s20,Movie,Zoe,Drake Doremus,"Ewan McGregor, Léa Seydoux, Theo James, Mirand...",2018,R,104 min,Science Fiction,ZOE tells a tale of forbidden love between an ...,...,Theo James,Miranda Otto,Rashida Jones,Christina Aguilera,Matthew Gray Gubler,Anthony Shim,,,,


In [44]:
# List the remaining columns to confirm the helper columns are gone.
df.columns

Index(['show_id', 'type', 'title', 'director', 'cast', 'release_year',
       'rating', 'duration', 'listed_in', 'description', 'duration_int',
       'genre_0', 'genre_1', 'genre_2', 'genre_3', 'genre_4', 'count_genre',
       'cast_0', 'cast_1', 'cast_2', 'cast_3', 'cast_4', 'cast_5', 'cast_6',
       'cast_7', 'cast_8', 'cast_9', 'cast_10', 'cast_11'],
      dtype='object')

### **Replace "-" with an Empty String**

In [45]:
# Turn the '-' placeholder (inserted earlier for missing cast) into an empty
# string. replace() matches whole cell values here, so names that merely contain
# a hyphen are left untouched.
df[['cast', 'cast_0']] = df[['cast', 'cast_0']].replace('-', "")

## **Dataset: df_genre.csv**

### **Create**

In [46]:
# Identifier column(s) carried along with every genre row.
columns = ['show_id']
columns

['show_id']

In [47]:
# Stack genre_0 .. genre_4 into a single column: one row per (show_id, genre) pair.
df_genres = concat_df(df, columns, 'genre_', 4)

df_genres.head()

genre_1
genre_2
genre_3
genre_4


Unnamed: 0,show_id,genre_0
0,s2,Drama
1,s17,Horror
2,s18,Kids
3,s19,Comedy
4,s20,Science Fiction


In [48]:
# (rows, columns) of the stacked genre table.
df_genres.shape

(16729, 2)

In [49]:
# Give the stacked column its final name. df_genres only holds `show_id` and
# `genre_0` at this point, so no other key is needed in the mapping.
df_genres = df_genres.rename(columns={"genre_0": "genre"})
df_genres.head()

Unnamed: 0,show_id,genre
0,s2,Drama
1,s17,Horror
2,s18,Kids
3,s19,Comedy
4,s20,Science Fiction


### **Export**

In [50]:
from google.colab import files

# index=False: the 0..n-1 row index carries no information and would otherwise
# be written as an extra unnamed first column in the CSV.
df_genres.to_csv('df_genre.csv', index=False)
files.download('df_genre.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## **Dataset: df_cast.csv**

### **Create**

In [51]:
# Identifier column(s) carried along with every cast row.
columns = ['show_id']
columns

['show_id']

In [52]:
# Stack cast_0 .. cast_11 into a single column: one row per (show_id, cast entry) pair.
df_casts = concat_df(df, columns, 'cast_', 11)

df_casts.head()

cast_1
cast_2
cast_3
cast_4
cast_5
cast_6
cast_7
cast_8
cast_9
cast_10
cast_11


Unnamed: 0,show_id,cast_0
0,s2,Mahesh Manjrekar
1,s17,Marcus Anderson
2,s18,Narrator - Gillian Barlett
3,s19,Zoë Coombs Marr
4,s20,Ewan McGregor


In [53]:
# (rows, columns) of the stacked cast table.
df_casts.shape

(38450, 2)

In [54]:
# Give the stacked column its final name. df_casts only holds `show_id` and
# `cast_0` at this point, so no other key is needed in the mapping.
df_casts = df_casts.rename(columns={"cast_0": "cast"})
df_casts.head()

Unnamed: 0,show_id,cast
0,s2,Mahesh Manjrekar
1,s17,Marcus Anderson
2,s18,Narrator - Gillian Barlett
3,s19,Zoë Coombs Marr
4,s20,Ewan McGregor


### **Export**

In [55]:
from google.colab import files

# index=False: the 0..n-1 row index carries no information and would otherwise
# be written as an extra unnamed first column in the CSV.
df_casts.to_csv('df_cast.csv', index=False)
files.download('df_cast.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>