## Reading the data

In [6]:
import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")

data = pd.read_csv("../data/All_Streaming_Shows.csv")

In [7]:
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Genre,Description,No of Seasons,Streaming Platform
0,Breaking Bad,2008,18+,9.5,100,"Crime,Drama","When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix
1,Game of Thrones,2011,18+,9.3,99,"Action & Adventure,Drama",Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO"
2,Rick and Morty,2013,18+,9.2,97,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu"
3,Stranger Things,2016,16+,8.8,96,"Drama,Fantasy","When a young boy vanishes, a small town uncove...",3Seasons,Netflix
4,The Boys,2019,18+,8.7,95,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...,2Seasons,Prime Video
...,...,...,...,...,...,...,...,...,...
12348,A Fishing Story with Ronnie Green,2017,,,-1,"2017,Prime Video",A Fishing Story with Ronnie Green has one or m...,2Seasons,"Prime Video,fuboTV"
12349,CMT Most Shocking,2003,,,-1,-1,-1,-1,-1
12350,NHL Road to the Outdoor Classics,2016,,,-1,"2016,Prime Video",Road to the NHL Outdoor Classics takes us deep...,1Season,"Prime Video,Epix"
12351,Addy Media,2018,,,-1,"2018,Prime Video",Addy Media has one or more episodes streaming ...,1Season,Prime Video


Initially we have have over 12k series, with information about them that includes:
- Title
- Year released
- Content rating
- IMBD rating
- R rating
- Genre
- Description
- Number of seasons
- Straeming platforms

We can see that the data will need some thorough cleaning and preparation for further analysis

## Initial data cleaning

In [8]:
# Looking at the types of variables
data.dtypes

Series Title           object
Year Released           int64
Content Rating         object
IMDB Rating           float64
R Rating                int64
Genre                  object
Description            object
No of Seasons          object
Streaming Platform     object
dtype: object

IMDB Rating & R Rating are already numeric variables, so there is no need to change their type

In [9]:
data.describe(include = object)

Unnamed: 0,Series Title,Content Rating,Genre,Description,No of Seasons,Streaming Platform
count,12353,7232,12353,12353,12353,10370
unique,12109,5,858,11875,94,530
top,Kingdom,16+,-1,-1,1Season,Netflix
freq,4,2581,479,479,5204,1427


We see that there are missing values in Content Rating and IMBD Rating (`NaN`). Additionally we have `-1` values in R Rating and Genre columns that probably are also missing values.

In [10]:
# Identifying missing values
empty_data = data.isnull().any(axis = 1)
null_rows = data[empty_data]
print(null_rows)

                            Series Title  Year Released Content Rating  \
186                               Castle           2009            16+   
250                             Gomorrah           2014            18+   
337                       Masters of Sex           2013            18+   
345                                Louie           2010            18+   
350                            Continuum           2012            16+   
...                                  ...            ...            ...   
12348  A Fishing Story with Ronnie Green           2017            NaN   
12349                  CMT Most Shocking           2003            NaN   
12350   NHL Road to the Outdoor Classics           2016            NaN   
12351                         Addy Media           2018            NaN   
12352             My Dream Derelict Home           2014            NaN   

       IMDB Rating  R Rating                     Genre  \
186            8.1        83              Comedy,Crim

In [11]:
data['R Rating'].unique()

array([100,  99,  97,  96,  95,  94,  93,  92,  91,  90,  89,  88,  87,
        86,  85,  84,  83,  82,  81,  80,  79,  78,  77,  76,  75,  74,
        73,  72,  71,  70,  69,  68,  67,  66,  65,  64,  63,  62,  61,
        60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,  48,
        47,  46,  45,  44,  43,  42,  41,  40,  39,  38,  37,  36,  35,
        34,  33,  32,  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,
        21,  20,  19,  18,  17,  16,  15,  13,  10,  -1])

In [12]:
# Identifying numer of rows with normal values
len(data[(data["Genre"] != "-1") & (data["R Rating"] != -1) & (data["Description"] != "-1") & (data["Streaming Platform"])])


9371

In [13]:
# We filter out "-1" values from the dataset
data = data[(data["Genre"] != "-1") & (data["R Rating"] != -1) & (data["Description"] != "-1") & (data["Streaming Platform"])]

In [14]:
# We replace empty values in "IMDB Rating" with their mode values based on the R Rating score
data["IMDB Rating"] = data.groupby("R Rating")["IMDB Rating"].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

In [15]:
# We drop NaN values from "IMDB Rating"
data.dropna(subset = ["IMDB Rating"], inplace = True)

In [16]:
data["Genre"].unique()

array(['Crime,Drama', 'Action & Adventure,Drama', 'Animation,Comedy',
       'Drama,Fantasy', 'Action & Adventure,Comedy', 'Drama,History',
       'Action & Adventure,Crime', 'Action & Adventure,Animation',
       'Comedy,Romance', 'Comedy,2005', 'Comedy,Crime',
       'Action & Adventure,Anime', 'Drama,Science-Fiction',
       'Drama,Mystery', 'Fantasy,Crime', 'Thriller,Action & Adventure',
       'Drama,Thriller', 'Action & Adventure,Science-Fiction',
       'Drama,Horror', 'Comedy,LGBTQ', 'Comedy,2009', 'Drama,2007',
       'Comedy,Drama', 'Drama,Comedy', 'Documentary,Biography',
       'Comedy,Stand-up & Talk', 'Horror,Drama',
       'Drama,Action & Adventure', 'Drama,Romance', 'Drama,Sport',
       'Thriller,Biography', 'Science-Fiction,Animation', 'Comedy,2003',
       'Action & Adventure,Thriller', 'Thriller,LGBTQ', 'Comedy,2014',
       'Drama,2018', 'Drama,LGBTQ', 'Drama,Crime',
       'Action & Adventure,Fantasy', 'Comedy,2000', 'Biography,Drama',
       'Action & Adventure,H

In [17]:
# We make a set of unique streaming platforms
all_platforms = []
for i in data["Streaming Platform"]:
    a = i.split(",")
    all_platforms.extend(a)

all_platforms = set(all_platforms)
print(all_platforms)


{'TVLand', 'TruTV', 'DIY', 'TLC', 'Disney', 'Nick', 'IFC', 'NBC', 'FYI', 'IndieFlix', 'AMC', 'Netflix', 'NatGeo', 'TNT', 'Hoopla', 'Shudder', 'Peacock Premium', 'Cartoon Network', 'Food Network', 'Crunchyroll', 'BET', 'Cinemax', 'VH1', 'CNBC', 'Starz', 'Funimation', 'Viceland', 'Comedy Central', 'BBC America', 'BritBox', 'AMC Premiere', 'Syfy', 'Hallmark', 'CBS All Access', 'HGTV', 'Hallmark Movies Now', 'HBO', 'TBS', 'Showtime', 'Epix', 'A&E', 'Travel Channel', 'USA', 'YouTube Premium', 'fuboTV', 'Apple TV+', 'Disney+', 'History', 'DC Universe', 'Adult Swim', 'Science', 'ABC', 'HBO MAX', 'Free Services', 'FX', 'FOX', 'Sundance', 'BET+', 'MTV', 'Prime Video', 'Lifetime', 'Hulu', 'Bravo', 'AcornTV'}


In [18]:
# Cleaning "Genre" column & creating dummy variables to replace different genres
for i in all_platforms:
    data["Genre"] = data["Genre"].str.replace(i,"")

data["Genre"] = data["Genre"].str.replace(r'\b\d{4}\b', '', regex=True)
data["Genre"] = data["Genre"].str.replace(r'\+', '', regex=True)
data["Genre"] = data["Genre"].apply(lambda x:"Unknown" if x == "-1" else x)
data["Genre"] = data["Genre"].str.replace(r'^,|(?<=,),$|,$', '', regex=True)
data["Genre"] = data["Genre"].str.strip()
data["Genre"] = data["Genre"].apply(lambda x:"Unknown" if x.strip() == "" else x)

data = data.join(
    data["Genre"].str.get_dummies(sep=',')
)

data = data.drop(columns = ["Genre"])
data = data.rename(columns={'-Fiction': 'Fiction'})
data.head()

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,MAX,Musical,Mystery,Reality,Romance,Sport,Stand-up & Talk,Thriller,Travel,Unknown
0,Breaking Bad,2008,18+,9.5,100,"When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Game of Thrones,2011,18+,9.3,99,Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO",0,1,...,0,0,0,0,0,0,0,0,0,0
2,Rick and Morty,2013,18+,9.2,97,Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu",0,0,...,0,0,0,0,0,0,0,0,0,0
3,Stranger Things,2016,16+,8.8,96,"When a young boy vanishes, a small town uncove...",3Seasons,Netflix,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Boys,2019,18+,8.7,95,A group of vigilantes known informally as “The...,2Seasons,Prime Video,0,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
data.columns

Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Description', 'No of Seasons', 'Streaming Platform',
       'Fiction', 'Action & Adventure', 'Animation', 'Anime', 'Biography',
       'Children', 'Comedy', 'Crime', 'Cult', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Food', 'Game Show', 'Home & Garden', 'Horror', 'LGBTQ',
       'MAX', 'Musical', 'Mystery', 'Reality', 'Romance', 'Sport',
       'Stand-up & Talk', 'Thriller', 'Travel', 'Unknown'],
      dtype='object')

In [20]:
for col in data.columns[8:]:
    print(f"Distribution in {col}:")
    print(data[col].value_counts())
    print("\n")

Distribution in Fiction:
Fiction
0    8587
1     208
Name: count, dtype: int64


Distribution in Action & Adventure:
Action & Adventure
0    7341
1    1454
Name: count, dtype: int64


Distribution in Animation:
Animation
0    7069
1    1726
Name: count, dtype: int64


Distribution in Anime:
Anime
0    8145
1     650
Name: count, dtype: int64


Distribution in Biography:
Biography
0    8687
1     108
Name: count, dtype: int64


Distribution in Children:
Children
0    8402
1     393
Name: count, dtype: int64


Distribution in Comedy:
Comedy
0    6794
1    2001
Name: count, dtype: int64


Distribution in Crime:
Crime
0    7877
1     918
Name: count, dtype: int64


Distribution in Cult:
Cult
0    8785
1      10
Name: count, dtype: int64


Distribution in Documentary:
Documentary
0    7262
1    1533
Name: count, dtype: int64


Distribution in Drama:
Drama
0    6440
1    2355
Name: count, dtype: int64


Distribution in Family:
Family
0    8370
1     425
Name: count, dtype: int64


Distributi

In [21]:
# Content Rating variable
data["Content Rating"].unique()

array(['18+', '16+', '7+', 'all', nan, '13+'], dtype=object)

In [22]:
data["Content Rating"] = data["Content Rating"].str.replace("+", "")
data["Content Rating"] = data["Content Rating"]=data["Content Rating"].transform(lambda x: x.fillna("0"))
data["Content Rating"] = data["Content Rating"].str.replace("all", "0")
data["Content Rating"] = data["Content Rating"].apply(lambda x:"R Rated" if x == "18" else "Not R Rated")

In [23]:
dummies = pd.get_dummies(data["Content Rating"])
data = pd.concat([data, pd.get_dummies(data["Content Rating"], prefix = "Content_Rating")], axis = 1)
data.columns

Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Description', 'No of Seasons', 'Streaming Platform',
       'Fiction', 'Action & Adventure', 'Animation', 'Anime', 'Biography',
       'Children', 'Comedy', 'Crime', 'Cult', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Food', 'Game Show', 'Home & Garden', 'Horror', 'LGBTQ',
       'MAX', 'Musical', 'Mystery', 'Reality', 'Romance', 'Sport',
       'Stand-up & Talk', 'Thriller', 'Travel', 'Unknown',
       'Content_Rating_Not R Rated', 'Content_Rating_R Rated'],
      dtype='object')

In [24]:
# Seasons variable
data["No of Seasons"].unique()

array(['5Seasons', '8Seasons', '4Seasons', '3Seasons', '2Seasons',
       '1Season', '10Seasons', '9Seasons', '15 Seasons', '1 Season',
       '5 Seasons', '31 Seasons', '6Seasons', '6 Seasons', '7 Seasons',
       '23 Seasons', '7Seasons', '10 Seasons', '12Seasons', '11 Seasons',
       '14Seasons', '3 Seasons', '4 Seasons', '16 Seasons', '11Seasons',
       '2 Seasons', '9 Seasons', '8 Seasons', '24 Seasons', '19 Seasons',
       '18 Seasons', '17 Seasons', '12 Seasons', '21 Seasons',
       '16Seasons', '28 Seasons', '13Seasons', '29 Seasons', '45 Seasons',
       '21Seasons', '27 Seasons', '25 Seasons', '40 Seasons', '20Seasons',
       '14 Seasons', '51 Seasons', '13 Seasons', '15Seasons',
       '20 Seasons', '24Seasons', '62 Seasons', '35 Seasons',
       '44 Seasons', '18Seasons', '17Seasons', '75 Seasons', '54 Seasons',
       '27Seasons', '22 Seasons', '32Seasons', '187 Seasons', '31Seasons',
       '36 Seasons', '26 Seasons', '32 Seasons', '52 Seasons',
       '34 Seasons', 

In [25]:
data["No of Seasons"] = data["No of Seasons"].str.replace(r"\D", "", regex=True)
data["No of Seasons"] = pd.to_numeric(data["No of Seasons"])

In [26]:
data["No of Seasons"].unique()

array([  5,   8,   4,   3,   2,   1,  10,   9,  15,  31,   6,   7,  23,
        12,  11,  14,  16,  24,  19,  18,  17,  21,  28,  13,  29,  45,
        27,  25,  40,  20,  51,  62,  35,  44,  75,  54,  22,  32, 187,
        36,  26,  52,  34,  37,  77,  41,  42,  60,  33, 160,  38,  84,
        39,  43,  48,  71,  82,  57])

In [27]:
# Streaming Platform
data["Streaming Platform"].unique()
data["Platform Count"] = data["Streaming Platform"].apply(lambda x: len(x.split(", ")))


In [28]:
from scipy import stats

# Spearman Correlation
# The correlation is small, we can keep both variables
correlation = stats.spearmanr(data["Reality"], data["Documentary"])
print(correlation)

SignificanceResult(statistic=np.float64(0.09808792944352131), pvalue=np.float64(2.9770847435335363e-20))


In [29]:
# Deleting duplicates
data = data.drop_duplicates()

In [30]:
# No empty values
data.isnull().values.any()

np.False_

In [31]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Combine built-in and custom stop words
custom_stop_words = set([
    "movie", "about", "with", "will", "that", "leave", "episodes",
    "imdb", "season", "seasons", "hulu", "prime"
])
stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS).union(custom_stop_words)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/monikakot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [32]:
# Clean and tokenize function
def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    return [word for word in tokens if word not in stop_words]

# N-gram extraction function
def extract_ngrams(df, n):
    all_ngrams = []

    for description in df['Description'].dropna():
        tokens = clean_and_tokenize(description)
        n_gram_list = ngrams(tokens, n)
        filtered = [
            ' '.join(gram) for gram in n_gram_list
            if all(word not in stop_words for word in gram)
        ]
        all_ngrams.extend(filtered)

    # Count frequency
    ngram_counts = Counter(all_ngrams)

    # Convert to DataFrame
    ngram_df = pd.DataFrame(ngram_counts.items(), columns=['ngram', 'count'])
    ngram_df = ngram_df.sort_values(by='count', ascending=False).reset_index(drop=True)

    return ngram_df

In [33]:
import regex as re
from nltk.util import ngrams
from collections import Counter

# Extract and count bigrams
bigrams = extract_ngrams(data, 2)
print("Most common bigrams:")
print(bigrams.head(10))

Most common bigrams:
                    ngram  count
0         audience rating   8291
1            rating votes   8257
2  streaming subscription   8220
3        average audience   5189
4      available purchase   5088
5                 air new   4703
6               plans air   4701
7           running plans   4701
8          longer running   4701
9            date episode   3940


In [34]:
# Extract and count trigrams
trigrams = extract_ngrams(data, 3)
print("\nMost common trigrams:")
print(trigrams.head(10))


Most common trigrams:
                       ngram  count
0      audience rating votes   8257
1    average audience rating   5189
2              plans air new   4701
3          running plans air   4701
4       longer running plans   4701
5     announced date episode   3940
6      airing announced date   3940
7       high audience rating   2867
8    better average audience   2836
9  available purchase itunes   2375


In [35]:
# Extract and count 4-grams
fourgrams = extract_ngrams(data, 4)
print("\nMost common 4-grams:")
print(fourgrams.head(10))


Most common 4-grams:
                            ngram  count
0   average audience rating votes   5167
1           running plans air new   4701
2        longer running plans air   4701
3   airing announced date episode   3940
4      high audience rating votes   2856
5  better average audience rating   2836
6        new high audience rating   1705
7              plans air new high   1705
8           air new high audience   1705
9          air new better average   1695


In [36]:
nltk.download('stopwords')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Tokenize and filter
def tokenize_and_filter(text):
    words = word_tokenize(text.lower())
    filtered_words = [
        word for word in words
        if word.isalpha()
        and word not in stop_words
        and word not in custom_stop_words
    ]
    return filtered_words

# Apply tokenization
data['keywords'] = data['Description'].apply(tokenize_and_filter)

# If you want one word per row (like unnest_tokens)
df_keywords = data.explode('keywords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/monikakot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/monikakot/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [37]:
word_frequencies = df_keywords['keywords'].value_counts().reset_index()
word_frequencies.columns = ['word', 'count']
print(word_frequencies.head(10))

           word  count
0     streaming  10774
1      audience   8351
2        rating   8305
3         votes   8260
4  subscription   8226
5           new   6356
6     featuring   6166
7       average   5241
8     available   5152
9      purchase   5104


In [38]:
# List of common phrases (0-indexed in Python)
common_phrases = [
    "true crime", "food network", "world war", "award winning",
    "serial killer", "emmy award", "featuring", "family", "love"
]

# Ensure Description is ASCII-only (similar to iconv)
data['Description'] = data['Description'].str.encode('ascii', errors='ignore').str.decode('ascii')

# Add new columns based on phrase matches (case-insensitive)
data['true_crime'] = data['Description'].str.contains(common_phrases[0], case=False, na=False).astype(int)
data['world_war'] = data['Description'].str.contains(common_phrases[2], case=False, na=False).astype(int)
data['award_winning'] = data['Description'].str.contains(common_phrases[3], case=False, na=False).astype(int)
data['serial_killer'] = data['Description'].str.contains(common_phrases[4], case=False, na=False).astype(int)
data['emmy_award'] = data['Description'].str.contains(common_phrases[5], case=False, na=False).astype(int)
data['featuring'] = data['Description'].str.contains(common_phrases[6], case=False, na=False).astype(int)
data['love'] = data['Description'].str.contains(common_phrases[8], case=False, na=False).astype(int)

print(data.columns)

# Ensure your columns are correctly formatted and renamed
data['R_Rating'] = pd.to_numeric(data['R Rating'], errors='coerce')
data['Genre_GameShow'] = data['Game Show']
data['Genre_Animation'] = data['Animation']
data['Genre_Children'] = data['Children']
data['Genre_Crime'] = data['Crime']
data['Genre_Drama'] = data['Drama']
data['Genre_Anime'] = data['Anime']
data['Genre_Comedy'] = data['Comedy']
data['Genre_Documentary'] = data['Documentary']
data['Genre_Reality'] = data['Reality']
data['Genre_Fiction'] = data['Fiction']
data['Genre_ActionAdventure'] = data['Action & Adventure']
data['Genre_HomeGarden'] = data['Home & Garden']
data['Genre_StandupTalk'] = data['Stand-up & Talk']
data['R_Rated'] = 'R Rated'

# Create ordinal IMDB rating by rounding and converting to categorical
data['ordinal_IMDBRating'] = pd.Categorical(data['IMDB Rating'].round().astype(int), ordered=True)


Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Description', 'No of Seasons', 'Streaming Platform',
       'Fiction', 'Action & Adventure', 'Animation', 'Anime', 'Biography',
       'Children', 'Comedy', 'Crime', 'Cult', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Food', 'Game Show', 'Home & Garden', 'Horror', 'LGBTQ',
       'MAX', 'Musical', 'Mystery', 'Reality', 'Romance', 'Sport',
       'Stand-up & Talk', 'Thriller', 'Travel', 'Unknown',
       'Content_Rating_Not R Rated', 'Content_Rating_R Rated',
       'Platform Count', 'keywords', 'true_crime', 'world_war',
       'award_winning', 'serial_killer', 'emmy_award', 'featuring', 'love'],
      dtype='object')


## Further data preparation

In [39]:
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_Anime,Genre_Comedy,Genre_Documentary,Genre_Reality,Genre_Fiction,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,0,0,0,0,0,0,0,0,R Rated,10
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,0,0,0,0,0,1,0,0,R Rated,9
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,0,1,0,0,0,0,0,0,R Rated,9
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,0,0,0,0,0,0,0,0,R Rated,9
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,0,1,0,0,0,1,0,0,R Rated,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,0,0,0,0,1,0,0,0,R Rated,3
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,0,0,0,0,0,0,0,0,R Rated,3
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,0,0,0,0,0,0,0,0,R Rated,3
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,0,1,0,0,0,0,0,1,R Rated,3


In [40]:
data.columns

Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Description', 'No of Seasons', 'Streaming Platform',
       'Fiction', 'Action & Adventure', 'Animation', 'Anime', 'Biography',
       'Children', 'Comedy', 'Crime', 'Cult', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Food', 'Game Show', 'Home & Garden', 'Horror', 'LGBTQ',
       'MAX', 'Musical', 'Mystery', 'Reality', 'Romance', 'Sport',
       'Stand-up & Talk', 'Thriller', 'Travel', 'Unknown',
       'Content_Rating_Not R Rated', 'Content_Rating_R Rated',
       'Platform Count', 'keywords', 'true_crime', 'world_war',
       'award_winning', 'serial_killer', 'emmy_award', 'featuring', 'love',
       'R_Rating', 'Genre_GameShow', 'Genre_Animation', 'Genre_Children',
       'Genre_Crime', 'Genre_Drama', 'Genre_Anime', 'Genre_Comedy',
       'Genre_Documentary', 'Genre_Reality', 'Genre_Fiction',
       'Genre_ActionAdventure', 'Genre_HomeGarden', 'Genre_StandupTalk',
       'R_Rated', 

### IMBD Rating

In [41]:
# Let's look closer at the modelled variable
data['IMDB Rating'].sort_values().unique()

array([1. , 1.2, 1.3, 1.6, 1.8, 1.9, 2. , 2.1, 2.2, 2.3, 2.4, 2.5, 2.6,
       2.7, 2.8, 2.9, 3. , 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9,
       4. , 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9, 5. , 5.1, 5.2,
       5.3, 5.4, 5.5, 5.6, 5.7, 5.8, 5.9, 6. , 6.1, 6.2, 6.3, 6.4, 6.5,
       6.6, 6.7, 6.8, 6.9, 7. , 7.1, 7.2, 7.3, 7.4, 7.5, 7.6, 7.7, 7.8,
       7.9, 8. , 8.1, 8.2, 8.3, 8.4, 8.5, 8.6, 8.7, 8.8, 8.9, 9. , 9.1,
       9.2, 9.3, 9.4, 9.5, 9.7])

### Year of production

In [42]:
# What are the years of production?
data['Year Released'].describe()

count    8795.000000
mean     2010.356907
std        11.658913
min      1901.000000
25%      2008.000000
50%      2014.000000
75%      2017.000000
max      2020.000000
Name: Year Released, dtype: float64

In [43]:
data['Year Released'].sort_values() 

10423    1901
10731    1901
10569    1904
3608     1922
9572     1931
         ... 
1805     2020
3862     2020
3843     2020
3837     2020
6375     2020
Name: Year Released, Length: 8795, dtype: int64

Year 1901 looks suspicious..

In [44]:
data[data['Year Released'] <= 1931].sort_values(by = 'Year Released')

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_Anime,Genre_Comedy,Genre_Documentary,Genre_Reality,Genre_Fiction,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating
10423,Gods & Monsters with Tony Robinson,1901,Not R Rated,5.5,23,Gods & Monsters with Tony Robinson has one or ...,1,"Free Services,Prime Video",0,0,...,0,0,0,0,0,0,0,0,R Rated,6
10731,Space: The New Frontier,1901,Not R Rated,1.2,18,Space: The New Frontier has one or more episod...,4,"Free Services,Prime Video",0,0,...,0,0,0,0,0,0,0,0,R Rated,1
10569,History of Westinghouse,1904,Not R Rated,5.9,21,History of Westinghouse has one or more episod...,1,Prime Video,0,0,...,0,0,0,0,0,0,0,0,R Rated,6
3608,Our Gang,1922,Not R Rated,8.0,56,Our Gang is a series of American comedy short ...,22,Free Services,0,0,...,0,1,0,0,0,0,0,0,R Rated,8
9572,The Little Rascals Classics,1931,Not R Rated,6.4,31,The Little Rascals were the first in a long li...,1,Hulu,0,0,...,0,0,0,0,0,0,0,0,R Rated,6


After investigating, The series from 1901 and 1904 are not properly flagged - it's possible that at the time of collection they year was not known yet. We are dropping those.

In [45]:
data = data[~(data['Year Released'] <= 1904)]

In [46]:
# Creating a variable that would indicate the age of production at the time of data collection

# Maximum year of production
max_year = data['Year Released'].max()

# Creating the 'Age of Series' column
data['age_of_series'] = max_year - data['Year Released']
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_Comedy,Genre_Documentary,Genre_Reality,Genre_Fiction,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating,age_of_series
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,0,0,0,0,0,0,0,R Rated,10,12
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,0,0,0,0,1,0,0,R Rated,9,9
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,1,0,0,0,0,0,0,R Rated,9,7
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,0,0,0,0,0,0,0,R Rated,9,4
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,1,0,0,0,1,0,0,R Rated,9,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,0,0,0,1,0,0,0,R Rated,3,4
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,0,0,0,0,0,0,0,R Rated,3,2
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,0,0,0,0,0,0,0,R Rated,3,1
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,1,0,0,0,0,0,1,R Rated,3,17


### Content Rating

In [47]:
data['Content Rating'].unique()

array(['R Rated', 'Not R Rated'], dtype=object)

In [48]:
data['r_rated'] = data['Content Rating'].map({'R Rated': 1, 'Not R Rated': 0})
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_Documentary,Genre_Reality,Genre_Fiction,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating,age_of_series,r_rated
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,0,0,0,0,0,0,R Rated,10,12,1
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,0,0,0,1,0,0,R Rated,9,9,1
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,0,0,0,0,0,0,R Rated,9,7,1
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,0,0,0,0,0,0,R Rated,9,4,0
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,0,0,0,1,0,0,R Rated,9,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,0,0,1,0,0,0,R Rated,3,4,0
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,0,0,0,0,0,0,R Rated,3,2,0
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,0,0,0,0,0,0,R Rated,3,1,0
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,0,0,0,0,0,1,R Rated,3,17,0


### R rating

Using this variable might be considered data leakage as it is directly correlated with the IMBD rating. Not using it for now.

### Description

We extracted the genres from the description, we can additionaly extract the sentiment score to get some understanding of the overall atmosphere of the series.

In [49]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

data['description_sentiment'] = data['Description'].apply(lambda x: analyzer.polarity_scores(x))
data[['description_sentiment', 'Description']]

Unnamed: 0,description_sentiment,Description
0,"{'neg': 0.156, 'neu': 0.768, 'pos': 0.076, 'co...","When Walter White, a New Mexico chemistry teac..."
1,"{'neg': 0.179, 'neu': 0.773, 'pos': 0.048, 'co...",Seven noble families fight for control of the ...
2,"{'neg': 0.112, 'neu': 0.829, 'pos': 0.058, 'co...",Rick is a mentally-unbalanced but scientifical...
3,"{'neg': 0.105, 'neu': 0.846, 'pos': 0.05, 'com...","When a young boy vanishes, a small town uncove..."
4,"{'neg': 0.066, 'neu': 0.845, 'pos': 0.088, 'co...",A group of vigilantes known informally as The ...
...,...,...
10796,"{'neg': 0.09, 'neu': 0.877, 'pos': 0.033, 'com...",Time Travel Shoujo: Mari Waka to 8-nin no Kaga...
10797,"{'neg': 0.133, 'neu': 0.867, 'pos': 0.0, 'comp...",Mrs. Fitzgerald Is Missing has one or more epi...
10798,"{'neg': 0.045, 'neu': 0.955, 'pos': 0.0, 'comp...",Reports on all the hottest shows at the Edinbu...
10800,"{'neg': 0.053, 'neu': 0.81, 'pos': 0.137, 'com...",Amateur comedians strut their stand-up stuff b...


In [50]:
data['Description'][0]

"When Walter White, a New Mexico chemistry teacher, is diagnosed with Stage III cancer and given a prognosis of only two years left to live. He becomes filled with a sense of fearlessness and an unrelenting desire to secure his family's financial future at any cost as he enters the dangerous world of drugs and crime.Breaking Bad featuring Bryan Cranston and Aaron Paul has one or more episodes streaming with subscription on Netflix, available for purchase on iTunes, available for purchase on Google Play, and 3 others. It's a crime and drama show with 62 episodes over 5 seasons. Breaking Bad is no longer running and has no plans to air new episodes or seasons. It has a very high IMDb audience rating of 9.5 (1,391,409 votes) and was very well received by critics."

In [51]:
data['description_sentiment'][0]

{'neg': 0.156, 'neu': 0.768, 'pos': 0.076, 'compound': -0.9275}

In [52]:
# Extracting the compound (overall sentiment)
data['sentiment'] = data['description_sentiment'].apply(lambda x: x['compound'])
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_Fiction,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating,age_of_series,r_rated,description_sentiment,sentiment
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,0,0,0,0,R Rated,10,12,1,"{'neg': 0.156, 'neu': 0.768, 'pos': 0.076, 'co...",-0.9275
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,0,1,0,0,R Rated,9,9,1,"{'neg': 0.179, 'neu': 0.773, 'pos': 0.048, 'co...",-0.9686
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,0,0,0,0,R Rated,9,7,1,"{'neg': 0.112, 'neu': 0.829, 'pos': 0.058, 'co...",-0.8577
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,0,0,0,0,R Rated,9,4,0,"{'neg': 0.105, 'neu': 0.846, 'pos': 0.05, 'com...",-0.6378
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,0,1,0,0,R Rated,9,1,1,"{'neg': 0.066, 'neu': 0.845, 'pos': 0.088, 'co...",0.3167
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,1,0,0,0,R Rated,3,4,0,"{'neg': 0.09, 'neu': 0.877, 'pos': 0.033, 'com...",-0.4939
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,0,0,0,0,R Rated,3,2,0,"{'neg': 0.133, 'neu': 0.867, 'pos': 0.0, 'comp...",-0.6808
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,0,0,0,0,R Rated,3,1,0,"{'neg': 0.045, 'neu': 0.955, 'pos': 0.0, 'comp...",-0.2960
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,0,0,0,1,R Rated,3,17,0,"{'neg': 0.053, 'neu': 0.81, 'pos': 0.137, 'com...",0.7845


In [53]:
data['sentiment'].describe()

count    8792.000000
mean        0.199942
std         0.670078
min        -0.996100
25%        -0.421500
50%         0.401900
75%         0.827100
max         0.997800
Name: sentiment, dtype: float64

### Streaming platform

In here, we can try to to get infomation about the number of platforms given sereis is streamed on and the binary variable for most popular ones

In [54]:
data['number_of_platforms'] = data['Streaming Platform'].apply(lambda x: len(x.split(',')))
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating,age_of_series,r_rated,description_sentiment,sentiment,number_of_platforms
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,0,0,0,R Rated,10,12,1,"{'neg': 0.156, 'neu': 0.768, 'pos': 0.076, 'co...",-0.9275,1
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,1,0,0,R Rated,9,9,1,"{'neg': 0.179, 'neu': 0.773, 'pos': 0.048, 'co...",-0.9686,2
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,0,0,0,R Rated,9,7,1,"{'neg': 0.112, 'neu': 0.829, 'pos': 0.058, 'co...",-0.8577,3
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,0,0,0,R Rated,9,4,0,"{'neg': 0.105, 'neu': 0.846, 'pos': 0.05, 'com...",-0.6378,1
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,1,0,0,R Rated,9,1,1,"{'neg': 0.066, 'neu': 0.845, 'pos': 0.088, 'co...",0.3167,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,0,0,0,R Rated,3,4,0,"{'neg': 0.09, 'neu': 0.877, 'pos': 0.033, 'com...",-0.4939,1
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,0,0,0,R Rated,3,2,0,"{'neg': 0.133, 'neu': 0.867, 'pos': 0.0, 'comp...",-0.6808,1
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,0,0,0,R Rated,3,1,0,"{'neg': 0.045, 'neu': 0.955, 'pos': 0.0, 'comp...",-0.2960,1
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,0,0,1,R Rated,3,17,0,"{'neg': 0.053, 'neu': 0.81, 'pos': 0.137, 'com...",0.7845,2


In [55]:
data['number_of_platforms'].describe()

count    8792.000000
mean        1.519563
std         0.727965
min         1.000000
25%         1.000000
50%         1.000000
75%         2.000000
max         3.000000
Name: number_of_platforms, dtype: float64

In [56]:
data['multiple_platforms'] = data['number_of_platforms'].apply(lambda x: 1 if x > 1 else 0)
data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,Genre_HomeGarden,Genre_StandupTalk,R_Rated,ordinal_IMDBRating,age_of_series,r_rated,description_sentiment,sentiment,number_of_platforms,multiple_platforms
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,0,0,R Rated,10,12,1,"{'neg': 0.156, 'neu': 0.768, 'pos': 0.076, 'co...",-0.9275,1,0
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,0,0,R Rated,9,9,1,"{'neg': 0.179, 'neu': 0.773, 'pos': 0.048, 'co...",-0.9686,2,1
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,0,0,R Rated,9,7,1,"{'neg': 0.112, 'neu': 0.829, 'pos': 0.058, 'co...",-0.8577,3,1
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,0,0,R Rated,9,4,0,"{'neg': 0.105, 'neu': 0.846, 'pos': 0.05, 'com...",-0.6378,1,0
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,0,0,R Rated,9,1,1,"{'neg': 0.066, 'neu': 0.845, 'pos': 0.088, 'co...",0.3167,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,0,0,R Rated,3,4,0,"{'neg': 0.09, 'neu': 0.877, 'pos': 0.033, 'com...",-0.4939,1,0
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,0,0,R Rated,3,2,0,"{'neg': 0.133, 'neu': 0.867, 'pos': 0.0, 'comp...",-0.6808,1,0
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,0,0,R Rated,3,1,0,"{'neg': 0.045, 'neu': 0.955, 'pos': 0.0, 'comp...",-0.2960,1,0
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,0,1,R Rated,3,17,0,"{'neg': 0.053, 'neu': 0.81, 'pos': 0.137, 'com...",0.7845,2,1


In [57]:
all_platforms = ','.join(data['Streaming Platform']).lower().split(',')
platform_counts = Counter(all_platforms)
platform_counts

Counter({'free services': 3346,
         'netflix': 1759,
         'hulu': 1562,
         'prime video': 1424,
         'fubotv': 908,
         'hoopla': 622,
         'funimation': 495,
         'hbo max': 413,
         'britbox': 234,
         'hbo': 215,
         'disney+': 212,
         'acorntv': 201,
         'cbs all access': 198,
         'starz': 149,
         'tlc': 129,
         'crunchyroll': 102,
         'showtime': 80,
         'food network': 67,
         'comedy central': 66,
         'bet+': 64,
         'history': 61,
         'hgtv': 61,
         'a&e': 57,
         'mtv': 52,
         'fox': 48,
         'science': 41,
         'bravo': 41,
         'dc universe': 39,
         'adult swim': 39,
         'viceland': 36,
         'natgeo': 35,
         'nbc': 32,
         'vh1': 32,
         'lifetime': 32,
         'diy': 31,
         'trutv': 27,
         'youtube premium': 26,
         'apple tv+': 26,
         'usa': 25,
         'peacock premium': 25,
         '

In [58]:
platforms_to_extract = ['netflix', 'hulu', 'prime video']
for platform in platforms_to_extract:
    data[platform] = data['Streaming Platform'].apply(lambda x: 1 if platform in x.lower() else 0)

data

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,Fiction,Action & Adventure,...,ordinal_IMDBRating,age_of_series,r_rated,description_sentiment,sentiment,number_of_platforms,multiple_platforms,netflix,hulu,prime video
0,Breaking Bad,2008,R Rated,9.5,100,"When Walter White, a New Mexico chemistry teac...",5,Netflix,0,0,...,10,12,1,"{'neg': 0.156, 'neu': 0.768, 'pos': 0.076, 'co...",-0.9275,1,0,1,0,0
1,Game of Thrones,2011,R Rated,9.3,99,Seven noble families fight for control of the ...,8,"HBO MAX,HBO",0,1,...,9,9,1,"{'neg': 0.179, 'neu': 0.773, 'pos': 0.048, 'co...",-0.9686,2,1,0,0,0
2,Rick and Morty,2013,R Rated,9.2,97,Rick is a mentally-unbalanced but scientifical...,4,"Free Services,HBO MAX,Hulu",0,0,...,9,7,1,"{'neg': 0.112, 'neu': 0.829, 'pos': 0.058, 'co...",-0.8577,3,1,0,1,0
3,Stranger Things,2016,Not R Rated,8.8,96,"When a young boy vanishes, a small town uncove...",3,Netflix,0,0,...,9,4,0,"{'neg': 0.105, 'neu': 0.846, 'pos': 0.05, 'com...",-0.6378,1,0,1,0,0
4,The Boys,2019,R Rated,8.7,95,A group of vigilantes known informally as The ...,2,Prime Video,0,1,...,9,1,1,"{'neg': 0.066, 'neu': 0.845, 'pos': 0.088, 'co...",0.3167,1,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,2016,Not R Rated,3.3,17,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,1,Funimation,1,0,...,3,4,0,"{'neg': 0.09, 'neu': 0.877, 'pos': 0.033, 'com...",-0.4939,1,0,0,0,0
10797,Mrs. Fitzgerald Is Missing,2018,Not R Rated,3.3,17,Mrs. Fitzgerald Is Missing has one or more epi...,2,Prime Video,0,0,...,3,2,0,"{'neg': 0.133, 'neu': 0.867, 'pos': 0.0, 'comp...",-0.6808,1,0,0,0,1
10798,The Edinburgh Show,2019,Not R Rated,3.3,17,Reports on all the hottest shows at the Edinbu...,1,BritBox,0,0,...,3,1,0,"{'neg': 0.045, 'neu': 0.955, 'pos': 0.0, 'comp...",-0.2960,1,0,0,0,0
10800,Coming to the Stage,2003,Not R Rated,3.3,17,Amateur comedians strut their stand-up stuff b...,6,"Free Services,Hulu",0,0,...,3,17,0,"{'neg': 0.053, 'neu': 0.81, 'pos': 0.137, 'com...",0.7845,2,1,0,1,0


In [59]:
data_prepared = data[['Series Title', 'IMDB Rating','age_of_series', 'r_rated', 'No of Seasons', 'sentiment', 'multiple_platforms', 'netflix', 'hulu', 'prime video', 
                      'true_crime', 'world_war', 'award_winning', 'serial_killer', 'emmy_award', 'love',
                      'Genre_GameShow', 'Genre_Animation', 'Genre_Children',
                      'Genre_Crime', 'Genre_Drama', 'Genre_Anime', 'Genre_Comedy',
                      'Genre_Documentary', 'Genre_Reality', 'Genre_Fiction',
                      'Genre_ActionAdventure', 'Genre_HomeGarden', 'Genre_StandupTalk']]
data_prepared

Unnamed: 0,Series Title,IMDB Rating,age_of_series,r_rated,No of Seasons,sentiment,multiple_platforms,netflix,hulu,prime video,...,Genre_Crime,Genre_Drama,Genre_Anime,Genre_Comedy,Genre_Documentary,Genre_Reality,Genre_Fiction,Genre_ActionAdventure,Genre_HomeGarden,Genre_StandupTalk
0,Breaking Bad,9.5,12,1,5,-0.9275,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
1,Game of Thrones,9.3,9,1,8,-0.9686,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,Rick and Morty,9.2,7,1,4,-0.8577,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,Stranger Things,8.8,4,0,3,-0.6378,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,The Boys,8.7,1,1,2,0.3167,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10796,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,3.3,4,0,1,-0.4939,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
10797,Mrs. Fitzgerald Is Missing,3.3,2,0,2,-0.6808,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
10798,The Edinburgh Show,3.3,1,0,1,-0.2960,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10800,Coming to the Stage,3.3,17,0,6,0.7845,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1


In [60]:
data_prepared.columns = data_prepared.columns.str.replace(' ', '_').str.replace('&', 'and').str.replace('-', '_').str.lower()
data_prepared.columns

Index(['series_title', 'imdb_rating', 'age_of_series', 'r_rated',
       'no_of_seasons', 'sentiment', 'multiple_platforms', 'netflix', 'hulu',
       'prime_video', 'true_crime', 'world_war', 'award_winning',
       'serial_killer', 'emmy_award', 'love', 'genre_gameshow',
       'genre_animation', 'genre_children', 'genre_crime', 'genre_drama',
       'genre_anime', 'genre_comedy', 'genre_documentary', 'genre_reality',
       'genre_fiction', 'genre_actionadventure', 'genre_homegarden',
       'genre_standuptalk'],
      dtype='object')

In [61]:
# Resetting the indexes

data_prepared_reset = data_prepared.reset_index(drop = True)
data_prepared_reset

Unnamed: 0,series_title,imdb_rating,age_of_series,r_rated,no_of_seasons,sentiment,multiple_platforms,netflix,hulu,prime_video,...,genre_crime,genre_drama,genre_anime,genre_comedy,genre_documentary,genre_reality,genre_fiction,genre_actionadventure,genre_homegarden,genre_standuptalk
0,Breaking Bad,9.5,12,1,5,-0.9275,0,1,0,0,...,1,1,0,0,0,0,0,0,0,0
1,Game of Thrones,9.3,9,1,8,-0.9686,1,0,0,0,...,0,1,0,0,0,0,0,1,0,0
2,Rick and Morty,9.2,7,1,4,-0.8577,1,0,1,0,...,0,0,0,1,0,0,0,0,0,0
3,Stranger Things,8.8,4,0,3,-0.6378,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
4,The Boys,8.7,1,1,2,0.3167,0,0,0,1,...,0,0,0,1,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8787,Time Travel Shoujo: Mari Waka to 8-nin no Kaga...,3.3,4,0,1,-0.4939,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
8788,Mrs. Fitzgerald Is Missing,3.3,2,0,2,-0.6808,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
8789,The Edinburgh Show,3.3,1,0,1,-0.2960,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8790,Coming to the Stage,3.3,17,0,6,0.7845,1,0,1,0,...,0,0,0,1,0,0,0,0,0,1


In [62]:
data_prepared_reset.to_csv("../data/series_data_prepared.csv", index = False)