Necessary libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pandas_profiling import ProfileReport
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load both CSV exports and stack them into a single frame.
file_paths = ['resources/1948 - 1986.csv', 'resources/1987 - 2022.csv']

# ignore_index=True gives the combined frame a fresh 0..n-1 index; without it
# each file keeps its own 0-based index and row labels repeat across files.
data = pd.concat(map(pd.read_csv, file_paths), ignore_index=True)

# Work on a copy so the raw concatenated frame (`data`) stays untouched.
df = data.copy()
df

Exploratory data analysis before cleaning

In [None]:
# Build the profiling report for the frame as loaded (before any cleaning)
# and write it to disk as an HTML file.
report_title = "Pandas Profiling Report"
report_path = "reports/before_report.html"

profile = ProfileReport(df, title=report_title)
profile.to_file(report_path)

Data cleaning

In [None]:
# Year substituted for shows whose 'Final Year' is the literal string 'Present'
PRESENT_YEAR = 2023

# Drop duplicates (the first row for each 'Title' is kept)
df = df.drop_duplicates('Title')

# Rows without a defined technique are eliminated
df = df[df['Technique'].notna()].copy()

# Null values: missing counts default to 1, missing channel gets a placeholder
df['Episodes'] = df['Episodes'].fillna(1)
df['Seasons'] = df['Seasons'].fillna(1)
df['Original Channel'] = df['Original Channel'].fillna('Unidentified')

# Non-numeric values in numeric columns are coerced to NaN ...
df['Seasons'] = pd.to_numeric(df['Seasons'], errors='coerce')
df['Episodes'] = pd.to_numeric(df['Episodes'], errors='coerce')
# ... and then removed. Note: this drops rows with a NaN in ANY column,
# not only in 'Seasons' / 'Episodes'.
df = df.dropna().copy()

# If the final year is 'Present' it is replaced by PRESENT_YEAR.
# Only the matching rows are changed; every other row keeps its own value.
# (The previous chained assignment `df[col] = df.loc[...] = 2023` overwrote
# the whole column with 2023.)
final_year = df['Final Year'].mask(df['Final Year'] == 'Present', PRESENT_YEAR)

# Float / string to int. to_numeric raises on any remaining non-numeric
# 'Final Year' value instead of silently hiding it.
df['Episodes'] = df['Episodes'].astype(int)
df['Seasons'] = df['Seasons'].astype(int)
df['Final Year'] = pd.to_numeric(final_year).astype(int)

df

Exploratory data analysis after cleaning

In [None]:
# Build the profiling report for the cleaned frame and write it to disk
# as an HTML file.
report_title = "Pandas Profiling Report"
report_path = "reports/after_report.html"

profile = ProfileReport(df, title=report_title)
profile.to_file(report_path)

**Clustering: K-Means (relationship between the premiere year and the technique used)**

Convert the technique used into a numeric label

In [None]:
# Fit the encoder on the 'Technique' column, then map every row to its
# integer code. `le` is kept so the codes can be translated back later.
le = LabelEncoder()
le.fit(df['Technique'])
technique_label = le.transform(df['Technique'])

Define array to process

In [None]:
# Two features per show: encoded technique (column 0) and premiere year
# (column 1), stacked side by side into an (n_rows, 2) array.
x = technique_label
y = df['Premiere Year'].values
X = np.column_stack((x, y))

Mapping between each technique and its encoded numeric value

In [None]:
# Pair every encoded value with its technique name. Building the array from
# mixed (int, str) tuples makes NumPy store both columns as strings.
code_name_pairs = [(code, name) for code, name in zip(x, df['Technique'])]
info = np.array(code_name_pairs)

# Order the pairs by technique name (second column)
order = info[:, 1].argsort()
info_sorted = info[order]

# Unique rows: one (code, technique) pair per distinct technique
unique_rows = np.unique(info_sorted, axis=0)
unique_rows

Clusters and labels

In [None]:
# Fit K-Means with 3 clusters on the (technique code, premiere year) matrix.
# random_state pins the centroid initialisation so cluster ids (and therefore
# plot colours) are the same on every Restart & Run All.
# n_init is set explicitly because its default differs between scikit-learn
# versions.
# NOTE(review): the two features are not scaled; if 'Premiere Year' spans a
# much wider range than the technique codes it will dominate the distance -
# confirm this is intended.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
kmeans = kmeans.fit(X)
labels = kmeans.predict(X)

Centroids and colors

In [None]:
# Cluster centres found by the fitted model, one row per cluster.
centroids = kmeans.cluster_centers_

# Matplotlib format strings (colour letter + '.' point marker), indexed by
# cluster label.
colors = [f"{letter}." for letter in "mrcyb"]

Label-based coordinate mapping

In [None]:
# Draw every cluster with a single plot call (one artist per cluster instead
# of one per data point), using the colour assigned to its label.
fig, ax = plt.subplots()

for cluster_id in np.unique(labels):
    members = X[labels == cluster_id]
    ax.plot(members[:, 0], members[:, 1], colors[cluster_id],
            markersize=10, label=f"Cluster {cluster_id}")

# Centroids are drawn on top of the points (zorder=10)
ax.scatter(centroids[:, 0], centroids[:, 1], marker='*', s=150, linewidths=5,
           zorder=10, label="Centroids")

# Column 0 of X is the encoded technique, column 1 the premiere year
ax.set_xlabel("Technique (encoded label)")
ax.set_ylabel("Premiere Year")
ax.set_title("K-Means clusters: technique vs. premiere year")
ax.legend()
plt.show()