In [44]:
import pandas as pd
import numpy as np
import statistics
import regex as re
from scipy import stats

import nltk
from nltk.util import ngrams
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from collections import Counter

# Download NLTK stopwords if not already
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from statsmodels.miscmodels.ordinal_model import OrderedModel
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

from scipy import stats

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/karolina/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<p style="font-size:30px">
Reading the Data
</p>

<div style="font-size:15px">
The dataset contains information about TV series, including:

- 📅 Year of release  
- 🔞 Content rating  
- ⭐ IMDB and R ratings  
- 🎭 Genre  
- 📝 Description of the show  
- 📈 Number of seasons  
- 📺 Available streaming platforms  

The dataset includes **12,353 observations**.
</div>

In [45]:
data = pd.read_csv("data/All_Streaming_Shows.csv")

In [46]:
data.head()

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Genre,Description,No of Seasons,Streaming Platform
0,Breaking Bad,2008,18+,9.5,100,"Crime,Drama","When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix
1,Game of Thrones,2011,18+,9.3,99,"Action & Adventure,Drama",Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO"
2,Rick and Morty,2013,18+,9.2,97,"Animation,Comedy",Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu"
3,Stranger Things,2016,16+,8.8,96,"Drama,Fantasy","When a young boy vanishes, a small town uncove...",3Seasons,Netflix
4,The Boys,2019,18+,8.7,95,"Action & Adventure,Comedy",A group of vigilantes known informally as “The...,2Seasons,Prime Video


In [47]:
data.shape

(12353, 9)

<p style="font-size:30px">
Data Preparation
</p>


There are multiple missing values in the following columns:

- **Content Rating**
- **Streaming Platform**
- **IMDB Rating**

#### 🔧 IMDB Rating
Missing values in the **IMDB Rating** column were replaced using the **mode**, grouped by the corresponding **R Rating** values.  
Observations where mode imputation was not possible were dropped from the dataset.

#### 🏷️ Streaming Platform
Missing values in the **Streaming Platform** column were replaced with `"Unknown"` to allow for modeling.

#### 🆔 Content Rating
Missing values in the **Content Rating** column were handled by incorporating them into a new dummy variable: `"Not R Rated"`.

### ❌ Error Value Handling

The **Genre**, **Description**, and **R Rating** columns contained multiple invalid observations marked as `"-1"`.  
After removing all rows with these error values, **9,371 observations** remain.


In [48]:
data.describe(include=object)

Unnamed: 0,Series Title,Content Rating,Genre,Description,No of Seasons,Streaming Platform
count,12353,7232,12353,12353,12353,10370
unique,12109,5,858,11875,94,530
top,Kingdom,16+,-1,-1,1Season,Netflix
freq,4,2581,479,479,5204,1427


In [49]:
data.dtypes

Series Title           object
Year Released           int64
Content Rating         object
IMDB Rating           float64
R Rating                int64
Genre                  object
Description            object
No of Seasons          object
Streaming Platform     object
dtype: object

In [50]:
empty_data=data.isnull().any(axis=1)
null_rows=data[empty_data]
print(null_rows)

                            Series Title  Year Released Content Rating  \
186                               Castle           2009            16+   
250                             Gomorrah           2014            18+   
337                       Masters of Sex           2013            18+   
345                                Louie           2010            18+   
350                            Continuum           2012            16+   
...                                  ...            ...            ...   
12348  A Fishing Story with Ronnie Green           2017            NaN   
12349                  CMT Most Shocking           2003            NaN   
12350   NHL Road to the Outdoor Classics           2016            NaN   
12351                         Addy Media           2018            NaN   
12352             My Dream Derelict Home           2014            NaN   

       IMDB Rating  R Rating                     Genre  \
186            8.1        83              Comedy,Crim

In [51]:
data['R Rating'].unique()

array([100,  99,  97,  96,  95,  94,  93,  92,  91,  90,  89,  88,  87,
        86,  85,  84,  83,  82,  81,  80,  79,  78,  77,  76,  75,  74,
        73,  72,  71,  70,  69,  68,  67,  66,  65,  64,  63,  62,  61,
        60,  59,  58,  57,  56,  55,  54,  53,  52,  51,  50,  49,  48,
        47,  46,  45,  44,  43,  42,  41,  40,  39,  38,  37,  36,  35,
        34,  33,  32,  31,  30,  29,  28,  27,  26,  25,  24,  23,  22,
        21,  20,  19,  18,  17,  16,  15,  13,  10,  -1])

In [52]:
# We have 9371 rows with normal values
len(data[(data["Genre"]!="-1") & (data["R Rating"]!=-1) & (data["Description"]!="-1") & (data["Streaming Platform"])])


9371

In [53]:
# We filter out "-1" values from the dataset
data=data[(data["Genre"]!="-1") & (data["R Rating"]!=-1) & (data["Description"]!="-1") & (data["Streaming Platform"])]

In [54]:
# We replace empty values in "IMDB Rating" with their mode values based on the R Rating score
data["IMDB Rating"]=data.groupby("R Rating")["IMDB Rating"].transform(lambda x: x.fillna(x.mode()[0] if not x.mode().empty else np.nan))

In [55]:
# We drop NaN values from "IMDB Rating"
data.dropna(subset=["IMDB Rating"], inplace=True)


### Cleaning and Encoding Categorical Columns



Several columns with `object` data type — namely **Genre**, **Streaming Platform**, and **Content Rating** — were cleaned and preprocessed before modeling in the following way:

- Removed 4-digit year values using regex
- Removed plus signs (`+`)
- Replaced `"-1"` values with `"Unknown"` 
- Cleaned leading/trailing commas with regex
- Stripped extra whitespace
- Replaced empty strings with `"Unknown"`

**Content Rating** was split into two categories: "R Rated" and "Not R Rated".

**Genre** and other categorical columns were one-hot encoded using dummy variables.

**Number of Seasons** was converted to a numeric type.


In [56]:
data["Genre"].unique()

array(['Crime,Drama', 'Action & Adventure,Drama', 'Animation,Comedy',
       'Drama,Fantasy', 'Action & Adventure,Comedy', 'Drama,History',
       'Action & Adventure,Crime', 'Action & Adventure,Animation',
       'Comedy,Romance', 'Comedy,2005', 'Comedy,Crime',
       'Action & Adventure,Anime', 'Drama,Science-Fiction',
       'Drama,Mystery', 'Fantasy,Crime', 'Thriller,Action & Adventure',
       'Drama,Thriller', 'Action & Adventure,Science-Fiction',
       'Drama,Horror', 'Comedy,LGBTQ', 'Comedy,2009', 'Drama,2007',
       'Comedy,Drama', 'Drama,Comedy', 'Documentary,Biography',
       'Comedy,Stand-up & Talk', 'Horror,Drama',
       'Drama,Action & Adventure', 'Drama,Romance', 'Drama,Sport',
       'Thriller,Biography', 'Science-Fiction,Animation', 'Comedy,2003',
       'Action & Adventure,Thriller', 'Thriller,LGBTQ', 'Comedy,2014',
       'Drama,2018', 'Drama,LGBTQ', 'Drama,Crime',
       'Action & Adventure,Fantasy', 'Comedy,2000', 'Biography,Drama',
       'Action & Adventure,H

In [57]:
# We make a set of unique streaming platforms
all_platforms=[]
for i in data["Streaming Platform"]:
    a=i.split(",")
    all_platforms.extend(a)

all_platforms=set(all_platforms)
print(all_platforms)


{'History', 'YouTube Premium', 'AcornTV', 'DC Universe', 'NBC', 'Science', 'BET+', 'HBO', 'TLC', 'BBC America', 'FOX', 'Nick', 'Starz', 'Sundance', 'BET', 'TBS', 'IFC', 'HBO MAX', 'Peacock Premium', 'Cinemax', 'Cartoon Network', 'CBS All Access', 'Hoopla', 'Disney+', 'Syfy', 'TruTV', 'Hallmark Movies Now', 'CNBC', 'IndieFlix', 'ABC', 'Apple TV+', 'DIY', 'Disney', 'TVLand', 'Crunchyroll', 'Prime Video', 'Adult Swim', 'Epix', 'HGTV', 'Shudder', 'A&E', 'Bravo', 'Hallmark', 'NatGeo', 'Hulu', 'Netflix', 'Showtime', 'fuboTV', 'USA', 'FX', 'VH1', 'Funimation', 'Comedy Central', 'AMC', 'Food Network', 'TNT', 'Travel Channel', 'FYI', 'Viceland', 'Free Services', 'MTV', 'AMC Premiere', 'BritBox', 'Lifetime'}


In [58]:
# Cleaning "Genre" column & creating dummy variables to replace different genres
for i in all_platforms:
    data["Genre"]=data["Genre"].str.replace(i,"")

data["Genre"]=data["Genre"].str.replace(r'\b\d{4}\b', '', regex=True)
data["Genre"]=data["Genre"].str.replace(r'\+', '', regex=True)
data["Genre"]=data["Genre"].apply(lambda x:"Unknown" if x=="-1" else x)
data["Genre"]=data["Genre"].str.replace(r'^,|(?<=,),$|,$', '', regex=True)
data["Genre"]=data["Genre"].str.strip()
data["Genre"]=data["Genre"].apply(lambda x:"Unknown" if x.strip()=="" else x)

data = data.join(
    data["Genre"].str.get_dummies(sep=',')
)

data=data.drop(columns=["Genre"])
data.head()

Unnamed: 0,Series Title,Year Released,Content Rating,IMDB Rating,R Rating,Description,No of Seasons,Streaming Platform,-Fiction,Action & Adventure,...,MAX,Musical,Mystery,Reality,Romance,Sport,Stand-up & Talk,Thriller,Travel,Unknown
0,Breaking Bad,2008,18+,9.5,100,"When Walter White, a New Mexico chemistry teac...",5Seasons,Netflix,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Game of Thrones,2011,18+,9.3,99,Seven noble families fight for control of the ...,8Seasons,"HBO MAX,HBO",0,1,...,0,0,0,0,0,0,0,0,0,0
2,Rick and Morty,2013,18+,9.2,97,Rick is a mentally-unbalanced but scientifical...,4Seasons,"Free Services,HBO MAX,Hulu",0,0,...,0,0,0,0,0,0,0,0,0,0
3,Stranger Things,2016,16+,8.8,96,"When a young boy vanishes, a small town uncove...",3Seasons,Netflix,0,0,...,0,0,0,0,0,0,0,0,0,0
4,The Boys,2019,18+,8.7,95,A group of vigilantes known informally as “The...,2Seasons,Prime Video,0,1,...,0,0,0,0,0,0,0,0,0,0


In [59]:
# Content Rating variable
data["Content Rating"].unique()

array(['18+', '16+', '7+', 'all', nan, '13+'], dtype=object)

In [60]:
data["Content Rating"]=data["Content Rating"].str.replace("+", "")
data["Content Rating"]=data["Content Rating"]=data["Content Rating"].transform(lambda x: x.fillna("0"))
data["Content Rating"]=data["Content Rating"].str.replace("all", "0")
data["Content Rating"]=data["Content Rating"].apply(lambda x:"R Rated" if x=="18" else "Not R Rated")

In [61]:
dummies=pd.get_dummies(data["Content Rating"])
data=pd.concat([data, pd.get_dummies(data["Content Rating"], prefix="Content_Rating")], axis=1)
list(data.columns.values)

['Series Title',
 'Year Released',
 'Content Rating',
 'IMDB Rating',
 'R Rating',
 'Description',
 'No of Seasons',
 'Streaming Platform',
 '-Fiction',
 'Action & Adventure',
 'Animation',
 'Anime',
 'Biography',
 'Children',
 'Comedy',
 'Crime',
 'Cult',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Food',
 'Game Show',
 'Home & Garden',
 'Horror',
 'LGBTQ',
 'MAX',
 'Musical',
 'Mystery',
 'Reality',
 'Romance',
 'Sport',
 'Stand-up & Talk',
 'Thriller',
 'Travel',
 'Unknown',
 'Content_Rating_Not R Rated',
 'Content_Rating_R Rated']

In [62]:
# Seasons variable
data["No of Seasons"].unique()

array(['5Seasons', '8Seasons', '4Seasons', '3Seasons', '2Seasons',
       '1Season', '10Seasons', '9Seasons', '15 Seasons', '1 Season',
       '5 Seasons', '31 Seasons', '6Seasons', '6 Seasons', '7 Seasons',
       '23 Seasons', '7Seasons', '10 Seasons', '12Seasons', '11 Seasons',
       '14Seasons', '3 Seasons', '4 Seasons', '16 Seasons', '11Seasons',
       '2 Seasons', '9 Seasons', '8 Seasons', '24 Seasons', '19 Seasons',
       '18 Seasons', '17 Seasons', '12 Seasons', '21 Seasons',
       '16Seasons', '28 Seasons', '13Seasons', '29 Seasons', '45 Seasons',
       '21Seasons', '27 Seasons', '25 Seasons', '40 Seasons', '20Seasons',
       '14 Seasons', '51 Seasons', '13 Seasons', '15Seasons',
       '20 Seasons', '24Seasons', '62 Seasons', '35 Seasons',
       '44 Seasons', '18Seasons', '17Seasons', '75 Seasons', '54 Seasons',
       '27Seasons', '22 Seasons', '32Seasons', '187 Seasons', '31Seasons',
       '36 Seasons', '26 Seasons', '32 Seasons', '52 Seasons',
       '34 Seasons', 

In [63]:
data["No of Seasons"]=data["No of Seasons"].str.replace(r"\D", "", regex=True)
data["No of Seasons"]=pd.to_numeric(data["No of Seasons"])

In [64]:
data["No of Seasons"].unique()

array([  5,   8,   4,   3,   2,   1,  10,   9,  15,  31,   6,   7,  23,
        12,  11,  14,  16,  24,  19,  18,  17,  21,  28,  13,  29,  45,
        27,  25,  40,  20,  51,  62,  35,  44,  75,  54,  22,  32, 187,
        36,  26,  52,  34,  37,  77,  41,  42,  60,  33, 160,  38,  84,
        39,  43,  48,  71,  82,  57])

In [65]:
# Streaming Platform
data["Streaming Platform"].unique()
data["Platform Count"]=data["Streaming Platform"].apply(lambda x: len(x.split(", ")))


In [66]:
# Spearman Correlation
# The correlation is small, we can keep both variables
correlation=stats.spearmanr(data["Reality"], data["Documentary"])
print(correlation)

SignificanceResult(statistic=np.float64(0.09808792944352131), pvalue=np.float64(2.9770847435335363e-20))


The **dupliactes** are removed. After performing all of the data cleaning operations there are no null values.

In [67]:
# Deleting duplicates
data=data.drop_duplicates()

In [68]:
# No empty values
data.isnull().values.any()

np.False_

### 📝 Preparing the **Description** Column

The **Description** column is preprocessed for modeling using basic text mining techniques:

- Common **stop words** are removed.
- The text is **tokenized** into individual words.
- From the tokenized text,**unigrams**, **bigrams**, **trigrams**, and **four-grams** are extracted.
- The most **frequent and meaningful n-grams** are selected and converted into **binary features** 

This process captures useful textual patterns that may help improve predictive performance.

For example, words such as "featuring" can communicate that the show includes a famous actor and that can in turn lead to higher ratings.

In [69]:
# Combine built-in and custom stop words
custom_stop_words = set([
    "movie", "about", "with", "will", "that", "leave", "episodes",
    "imdb", "season", "seasons", "hulu", "prime"
])
stop_words = set(stopwords.words('english')).union(ENGLISH_STOP_WORDS).union(custom_stop_words)

In [70]:
# Clean and tokenize function
def clean_and_tokenize(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = text.split()
    return [word for word in tokens if word not in stop_words]

# N-gram extraction function
def extract_ngrams(df, n):
    all_ngrams = []

    for description in df['Description'].dropna():
        tokens = clean_and_tokenize(description)
        n_gram_list = ngrams(tokens, n)
        filtered = [
            ' '.join(gram) for gram in n_gram_list
            if all(word not in stop_words for word in gram)
        ]
        all_ngrams.extend(filtered)

    # Count frequency
    ngram_counts = Counter(all_ngrams)

    # Convert to DataFrame
    ngram_df = pd.DataFrame(ngram_counts.items(), columns=['ngram', 'count'])
    ngram_df = ngram_df.sort_values(by='count', ascending=False).reset_index(drop=True)

    return ngram_df

In [71]:
# Extract and count bigrams
bigrams = extract_ngrams(data, 2)
print("Most common bigrams:")
print(bigrams.head(10))

Most common bigrams:
                    ngram  count
0         audience rating   8291
1            rating votes   8257
2  streaming subscription   8220
3        average audience   5189
4      available purchase   5088
5                 air new   4703
6          longer running   4701
7           running plans   4701
8               plans air   4701
9        airing announced   3940


In [72]:
# Extract and count trigrams
trigrams = extract_ngrams(data, 3)
print("\nMost common trigrams:")
print(trigrams.head(10))


Most common trigrams:
                       ngram  count
0      audience rating votes   8257
1    average audience rating   5189
2       longer running plans   4701
3          running plans air   4701
4              plans air new   4701
5     announced date episode   3940
6      airing announced date   3940
7       high audience rating   2867
8    better average audience   2836
9  available purchase itunes   2375


In [73]:
# Extract and count 4-grams
fourgrams = extract_ngrams(data, 4)
print("\nMost common 4-grams:")
print(fourgrams.head(10))


Most common 4-grams:
                            ngram  count
0   average audience rating votes   5167
1           running plans air new   4701
2        longer running plans air   4701
3   airing announced date episode   3940
4      high audience rating votes   2856
5  better average audience rating   2836
6           air new high audience   1705
7        new high audience rating   1705
8              plans air new high   1705
9     new better average audience   1695


In [75]:
# Tokenization
nltk.download('punkt_tab')

def tokenize_and_filter(text):
    words = word_tokenize(text.lower())
    filtered_words = [
        word for word in words
        if word.isalpha()
        and word not in stop_words
        and word not in custom_stop_words
    ]
    return filtered_words

data['keywords'] = data['Description'].apply(tokenize_and_filter)

df_keywords = data.explode('keywords')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/karolina/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [76]:
word_frequencies = df_keywords['keywords'].value_counts().reset_index()
word_frequencies.columns = ['word', 'count']
print(word_frequencies.head(10))

           word  count
0     streaming  10774
1      audience   8351
2        rating   8305
3         votes   8260
4  subscription   8226
5           new   6356
6     featuring   6166
7       average   5241
8     available   5152
9      purchase   5104


In [77]:
# List of common phrases
common_phrases = [
    "true crime", "food network", "world war", "award winning",
    "serial killer", "emmy award", "featuring", "family", "love"
]

data['Description'] = data['Description'].str.encode('ascii', errors='ignore').str.decode('ascii')

data['true_crime'] = data['Description'].str.contains(common_phrases[0], case=False, na=False).astype(int)
data['world_war'] = data['Description'].str.contains(common_phrases[2], case=False, na=False).astype(int)
data['award_winning'] = data['Description'].str.contains(common_phrases[3], case=False, na=False).astype(int)
data['serial_killer'] = data['Description'].str.contains(common_phrases[4], case=False, na=False).astype(int)
data['emmy_award'] = data['Description'].str.contains(common_phrases[5], case=False, na=False).astype(int)
data['featuring'] = data['Description'].str.contains(common_phrases[6], case=False, na=False).astype(int)
data['love'] = data['Description'].str.contains(common_phrases[8], case=False, na=False).astype(int)

print(data.columns)

data['R_Rating'] = pd.to_numeric(data['R Rating'], errors='coerce')
data['Genre_GameShow'] = data['Game Show']
data['Genre_Animation'] = data['Animation']
data['Genre_Children'] = data['Children']
data['Genre_Crime'] = data['Crime']
data['Genre_Drama'] = data['Drama']
data['Genre_Anime'] = data['Anime']
data['Genre_Comedy'] = data['Comedy']
data['Genre_Documentary'] = data['Documentary']
data['Genre_Reality'] = data['Reality']
data['Genre_Fiction'] = data['-Fiction']
data['Genre_ActionAdventure'] = data['Action & Adventure']
data['Genre_HomeGarden'] = data['Home & Garden']
data['Genre_StandupTalk'] = data['Stand-up & Talk']
data['R_Rated'] = 'R Rated'

data['ordinal_IMDBRating'] = pd.Categorical(data['IMDB Rating'].round().astype(int), ordered=True)


Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Description', 'No of Seasons', 'Streaming Platform',
       '-Fiction', 'Action & Adventure', 'Animation', 'Anime', 'Biography',
       'Children', 'Comedy', 'Crime', 'Cult', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Food', 'Game Show', 'Home & Garden', 'Horror', 'LGBTQ',
       'MAX', 'Musical', 'Mystery', 'Reality', 'Romance', 'Sport',
       'Stand-up & Talk', 'Thriller', 'Travel', 'Unknown',
       'Content_Rating_Not R Rated', 'Content_Rating_R Rated',
       'Platform Count', 'keywords', 'true_crime', 'world_war',
       'award_winning', 'serial_killer', 'emmy_award', 'featuring', 'love'],
      dtype='object')


In [78]:
data.columns


Index(['Series Title', 'Year Released', 'Content Rating', 'IMDB Rating',
       'R Rating', 'Description', 'No of Seasons', 'Streaming Platform',
       '-Fiction', 'Action & Adventure', 'Animation', 'Anime', 'Biography',
       'Children', 'Comedy', 'Crime', 'Cult', 'Documentary', 'Drama', 'Family',
       'Fantasy', 'Food', 'Game Show', 'Home & Garden', 'Horror', 'LGBTQ',
       'MAX', 'Musical', 'Mystery', 'Reality', 'Romance', 'Sport',
       'Stand-up & Talk', 'Thriller', 'Travel', 'Unknown',
       'Content_Rating_Not R Rated', 'Content_Rating_R Rated',
       'Platform Count', 'keywords', 'true_crime', 'world_war',
       'award_winning', 'serial_killer', 'emmy_award', 'featuring', 'love',
       'R_Rating', 'Genre_GameShow', 'Genre_Animation', 'Genre_Children',
       'Genre_Crime', 'Genre_Drama', 'Genre_Anime', 'Genre_Comedy',
       'Genre_Documentary', 'Genre_Reality', 'Genre_Fiction',
       'Genre_ActionAdventure', 'Genre_HomeGarden', 'Genre_StandupTalk',
       'R_Rated',