In [85]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import cdist

In [87]:
# Load the K-drama dataset (path is relative to the working directory)
# and preview its first five rows.
kdramaData = pd.read_csv(filepath_or_buffer="kdramas_dataset.csv")
kdramaData.head(n=5)

Unnamed: 0,Rank,Title,Year of release,Number of Episodes,Rating,Description,Genre,Tags,Actors
0,#1,Move to Heaven,2021,10,9.2,Geu Roo is a young autistic man. He works for ...,"Life, Drama, Family","Autism, Uncle-Nephew Relationship, Death, Sava...","Lee Je Hoon, Tang Jun Sang, Hong Seung Hee, Ju..."
1,#2,Twinkling Watermelon,2023,16,9.2,"In 2023, high school student Eun Gyeol, a CODA...","Romance, Youth, Drama, Fantasy","Time Travel, Child From The Future, Sign Langu...","Ha Eun-Gyeol, Ha Yi-Chan, Choi Se-Kyung, Yoon ..."
2,#3,Moving,2023,20,9.1,"Kim Bong Seok, Jang Hui Su, and Lee Gang Hun, ...","Action, Thriller, Mystery, Supernatural","Graphic Violence, Supernatural Power, Multiple...","Ryu Seung Ryong, Han Hyo Joo, Zo In Sung"
3,#4,The Trauma Code: Heroes on Call,2025,8,9.1,"Baek Gang Hyeok, a genius trauma surgeon with ...","Action, Comedy, Drama, Medical","Surgeon Male Lead, Hospital Setting, Mentor-Me...","Ju Ji Hoon, Choo Young Woo, Ha Young, Yoon Gyu..."
4,#5,Flower of Evil,2020,16,9.1,Although Baek Hee Sung is hiding a dark secret...,"Thriller, Romance, Crime, Melodrama","Married Couple, Deception, Suspense, Family Se...","Lee Joon Gi, Moon Chae Won, Jang Hee Jin, Seo ..."


In [89]:
# display data info: index range, per-column non-null counts, dtypes and memory usage
kdramaData.info()
# seems to have no null values: in the recorded output every column reports
# 350 non-null values out of 350 entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Rank                350 non-null    object 
 1   Title               350 non-null    object 
 2   Year of release     350 non-null    int64  
 3   Number of Episodes  350 non-null    int64  
 4   Rating              350 non-null    float64
 5   Description         350 non-null    object 
 6   Genre               350 non-null    object 
 7   Tags                350 non-null    object 
 8   Actors              350 non-null    object 
dtypes: float64(1), int64(2), object(6)
memory usage: 24.7+ KB


In [91]:
# Summary statistics (mean, std, min, quartiles, max) of the numeric columns.
# The 'count' row is dropped and every value is rounded to one decimal place.
(
    kdramaData
    .describe()
    .drop(labels='count', axis='index')
    .round(decimals=1)
)

Unnamed: 0,Year of release,Number of Episodes,Rating
mean,2019.8,18.3,8.4
std,3.7,14.6,0.5
min,2003.0,1.0,6.4
25%,2018.0,12.0,8.3
50%,2020.0,16.0,8.4
75%,2023.0,16.0,8.6
max,2025.0,133.0,9.2


In [93]:
# cleaning and rebuilding dataframe for genres and ratings features
def _parse_genres(value):
    """Turn one raw 'Genre' cell into a list of stripped genre names.

    A list is passed through unchanged so that this cell can safely be re-run
    after 'Genre' has already been parsed (previously a second run replaced
    every row with an empty list). Any other non-string value yields [].
    """
    if isinstance(value, list):
        return value
    if isinstance(value, str):
        # skip empty fragments left behind by stray or trailing commas
        return [part.strip() for part in value.split(',') if part.strip()]
    return []


kdramaData['Genre'] = kdramaData['Genre'].apply(_parse_genres)

all_genres = {genre for genres in kdramaData['Genre'] for genre in genres}

# Sorted so the feature columns come out in the same order on every run;
# the iteration order of a set of strings can differ between kernel sessions.
genre_columns = sorted(all_genres)

# One 0/1 indicator column per genre, added to kdramaData under the genre's name.
for genre in genre_columns:
    kdramaData[genre] = kdramaData['Genre'].apply(
        lambda genres, wanted=genre: int(wanted in genres)
    )

# Scale ratings by the highest rating in the data, so the top-rated drama gets 1.0.
# NOTE(review): the scaled rating varies far less than a 0/1 genre flag, so
# genre overlap dominates any distance computed on these features - confirm
# that this weighting is intended.
max_rating = kdramaData['Rating'].max()
kdramaData['Normalized_Rating'] = kdramaData['Rating'] / max_rating

df_features = kdramaData[genre_columns + ['Normalized_Rating']]

In [95]:
# Title of the drama to find recommendations for.
query_drama_title = 'Move to Heaven'

In [97]:
# Locate the feature row of the query drama. Titles are compared with an exact
# equality test; if several rows share the title, the first one is used.
title_mask = kdramaData['Title'] == query_drama_title
if not title_mask.any():
    # Fail with a clear message instead of the opaque
    # "single positional indexer is out-of-bounds" IndexError from .iloc[0].
    raise ValueError(
        f"No drama titled {query_drama_title!r} found in kdramaData['Title']"
    )
target_drama = df_features.loc[title_mask].iloc[0]

# euclidean distance from every drama's feature row to the query drama's row;
# cdist returns one column (one per target row), flattened here to a 1-D array
# with one entry per row of df_features
distances = cdist(df_features, [target_drama], metric="euclidean").flatten()

In [99]:
# Pair every title with its distance to the query drama (same row order as kdramaData).
query_distances = list(zip(kdramaData['Title'], distances))

# Row position of the query drama itself: the first row whose title matches.
query_pos = int((kdramaData['Title'] == query_drama_title).to_numpy().argmax())

# Ten closest dramas, excluding the query drama by its row position.
# Slicing the sorted list with [1:11] is not safe: when an earlier row has
# identical features (distance 0) the stable sort puts that row first, so the
# true match would be dropped and the query drama itself would be recommended.
top_10_similar = sorted(
    (pair for pos, pair in enumerate(query_distances) if pos != query_pos),
    key=lambda pair: pair[1],
)[:10]

In [101]:
# Report each recommended title together with its distance to the query drama.
for entry in top_10_similar:
    similar_drama, distance = entry
    print(f"Title: {similar_drama}, Distance: {distance}")

Title: My Mister, Distance: 1.0000590719792581
Title: Navillera, Distance: 1.0002362669849862
Title: The Good Bad Mother, Distance: 1.0005315222581126
Title: Dear My Friends, Distance: 1.0014757541749966
Title: My Unfamiliar Family, Distance: 1.003773598314365
Title: Work Later, Drink Now, Distance: 1.4162588834506002
Title: Lost, Distance: 1.4168844118956803
Title: One Day Off , Distance: 1.4175930104544199
Title: Agency, Distance: 1.4183845546259704
Title: Prison Playbook, Distance: 1.7320849134635448
