<a href="https://www.kaggle.com/code/izzulroslan/eda-korean-dramas-dataset?scriptVersionId=231821708" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub and report
# the location it returns.
dataset_handle = "saikalbatyrbekova/korean-dramas-dataset-eda"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)


# Import Libraries and Load Dataset

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


import os

# Location of the CSV when the notebook runs on Kaggle.
file_path = "/kaggle/input/korean-dramas-dataset-eda/kdrama_DATASET.csv"

# Outside Kaggle that mount does not exist. In that case fall back to the
# directory returned by kagglehub.dataset_download() (kept in `path` by the
# download cell). NOTE(review): assumes the CSV sits at the top level of that
# directory under the same file name -- confirm against the downloaded files.
if not os.path.exists(file_path) and "path" in globals():
    file_path = os.path.join(path, os.path.basename(file_path))

# Load dataset
df = pd.read_csv(file_path)

# Display first few rows
df.head()


In [None]:
# Show pandas' descriptive summary statistics for the loaded DataFrame
df.describe()

In [None]:
# Print the DataFrame overview: columns, non-null counts and dtypes
df.info()

In [None]:
# Count the missing values in every column
df.isna().sum()

In [None]:
# Number of rows whose title repeats the title of an earlier row
df.duplicated(subset="Title").sum()


# Data Visualizations

In [None]:
# Order all dramas from highest to lowest rating and keep the first ten
ranked_by_rating = df.sort_values(by="Rating", ascending=False)
top_dramas = ranked_by_rating.head(10)

columns_to_show = ["Title", "Rating", "Year of release"]
print(top_dramas[columns_to_show])


In [None]:
# Mean rating for each release year
rating_by_year = df.groupby("Year of release")["Rating"].mean()

# Line chart with one marker per year; labels go on the Axes pandas returns
year_axes = rating_by_year.plot(kind="line", marker="o", figsize=(10, 5))
year_axes.set_title("Average K-Drama Ratings Over the Years")
year_axes.set_xlabel("Year of Release")
year_axes.set_ylabel("Average Rating")
plt.show()


In [None]:
from collections import Counter

# Split each comma-separated genre string into a list (rows without a genre are skipped)
genre_list = df["Genre"].dropna().str.split(", ")

# Flatten the per-drama lists into one list of genre names
flat_list = []
for drama_genres in genre_list:
    flat_list.extend(drama_genres)

# Tally how often each genre occurs
genre_counts = Counter(flat_list)

# Tabulate the tallies, most frequent genre first, and plot them
genre_df = pd.DataFrame(genre_counts.items(), columns=["Genre", "Count"])
genre_df = genre_df.sort_values(by="Count", ascending=False)

genre_df.plot(kind="bar", x="Genre", y="Count", figsize=(10, 5), legend=False)
plt.title("Most Common K-Drama Genres")
plt.show()


In [None]:
# Ten most frequent genres, unzipped into parallel tuples of names and counts
genres, counts = zip(*genre_counts.most_common(10))

# Vertical bar chart drawn through the Axes-level API
fig, ax = plt.subplots(figsize=(10, 5))
ax.bar(genres, counts, color="skyblue")
ax.set_xlabel("Genres")
ax.set_ylabel("Count")
ax.set_title("Top 10 Common K-Drama Genres")
plt.xticks(rotation=45)  # Tilt the genre names so they do not overlap
plt.show()


In [None]:
# Coerce Rating to a numeric dtype; values that cannot be parsed become NaN
df["Rating"] = pd.to_numeric(df["Rating"], errors="coerce")

# One row per (drama, genre) pair so multi-genre dramas count toward every genre
genre_lists = df["Genre"].str.split(", ")
df_exploded = df.assign(Genre=genre_lists).explode("Genre")

# Mean rating of each genre, best-rated genre first
avg_genre_rating = (
    df_exploded.groupby("Genre")["Rating"]
    .mean()
    .sort_values(ascending=False)
)

print("Top 10 Average Rating by Genre\n", avg_genre_rating.head(10))  # Only the ten best genres


In [None]:
# Explode actors column to handle multiple actors per drama
df_exploded_actors = df.assign(Actors=df["Actors"].str.split(", ")).explode("Actors")

# Count occurrences of each actor.
# Dramas with no actor information survive the explode as NaN; drop them
# (as the genre analysis does) so a missing value is never tallied as an
# "actor" and cannot end up among the bar labels.
actor_counts = Counter(df_exploded_actors["Actors"].dropna())

# Get top 10 most frequent actors
top_actors = actor_counts.most_common(10)

# Convert to two lists for plotting
actors, counts = zip(*top_actors)

# Plot bar chart
plt.figure(figsize=(10, 5))
plt.barh(actors, counts, color="skyblue")
plt.xlabel("Number of Dramas")
plt.ylabel("Actors")
plt.title("Top 10 Most Frequent K-Drama Actors")
plt.gca().invert_yaxis()  # Invert y-axis to show highest count on top
plt.show()


In [None]:
# One row per (drama, actor) pair
actor_lists = df["Actors"].str.split(", ")
df_exploded_actors = df.assign(Actors=actor_lists).explode("Actors")

# Mean rating over each actor's dramas, returned as a two-column frame
actor_avg_ratings = (
    df_exploded_actors.groupby("Actors")["Rating"].mean().reset_index()
)

# Ten actors with the highest mean rating
rated_desc = actor_avg_ratings.sort_values(by="Rating", ascending=False)
top_actors = rated_desc.head(10)

# Horizontal bar chart, one bar per actor
plt.figure(figsize=(12, 6))
sns.barplot(data=top_actors, x="Rating", y="Actors", palette="Blues_r")
plt.xlabel("Average Rating")
plt.ylabel("Actors")
plt.title("Top 10 Actors with Highest Average K-Drama Ratings")
plt.xlim(8, 10)  # Restrict the rating axis to the 8-10 band
plt.show()


In [None]:
# Tally releases per year, ordered by year
drama_per_year = df["Year of release"].value_counts().sort_index()

# Line chart of the yearly tallies, drawn through the Axes-level API
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(
    drama_per_year.index,
    drama_per_year.values,
    marker="o",
    linestyle="-",
    color="blue",
)
ax.set_xlabel("Year")
ax.set_ylabel("Number of Dramas")
ax.set_title("K-Drama Production Over the Years")
ax.grid()
plt.show()


In [None]:
# Ten-bin histogram of the ratings with a KDE curve overlaid
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(df["Rating"], bins=10, kde=True, color="blue", ax=ax)
ax.set_xlabel("Rating")
ax.set_ylabel("Number of Dramas")
ax.set_title("Distribution of K-Drama Ratings")
plt.show()


In [None]:
# Scatter of episode count against rating; points are semi-transparent
fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(data=df, x="Number of Episodes", y="Rating", alpha=0.7, ax=ax)
ax.set_xlabel("Number of Episodes")
ax.set_ylabel("Rating")
ax.set_title("Number of Episodes vs. Rating")
ax.grid()
plt.show()


In [None]:
# Explode tags column
df_exploded_tags = df.assign(Tags=df["Tags"].str.split(", ")).explode("Tags")

# Count occurrences.
# Dramas without tags survive the explode as NaN; drop them (as the genre
# analysis does) so a missing value is never tallied as a "tag" and cannot
# take one of the top-10 slots.
tag_counts = Counter(df_exploded_tags["Tags"].dropna())

# Convert to DataFrame
df_tags = pd.DataFrame(tag_counts.most_common(10), columns=["Tag", "Count"])

# Plot bar chart
plt.figure(figsize=(10,5))
sns.barplot(y=df_tags["Tag"], x=df_tags["Count"], palette="coolwarm")
plt.xlabel("Count")
plt.ylabel("Tags")
plt.title("Top 10 Most Common K-Drama Tags")
plt.show()


# Summary of Key Insights

In [None]:
def _most_frequent_item(column):
    """Return the most frequent entry of a comma-separated text column of ``df``.

    Each cell is split on ", ", the pieces are exploded into one long Series,
    and the first value reported by ``mode()`` is returned.
    """
    return df[column].str.split(", ").explode().mode()[0]


print("Key Insights:\n")

# Headline numbers: row count, distinct titles, mean rating
total_dramas = len(df)
unique_titles = df["Title"].nunique()
average_rating = df["Rating"].mean()
print(f"Total Dramas: {total_dramas}")
print(f"Unique Titles: {unique_titles}")
print(f"Average Rating: {average_rating:.2f}")

# Genre that occurs most often
most_common_genre = _most_frequent_item("Genre")
print(f"Most Popular Genre: {most_common_genre}")

# Row holding the maximum rating
best_rating_index = df["Rating"].idxmax()
top_drama = df.loc[best_rating_index]
print(f"Highest Rated Drama: {top_drama['Title']} ({top_drama['Rating']})")

# Actor that occurs most often
most_common_actor = _most_frequent_item("Actors")
print(f"Most Frequently Appearing Actor: {most_common_actor}")

# Mean rating per actor: one row per (drama, actor) pair, then group and rank
summary_actor_lists = df["Actors"].str.split(", ")
df_exploded = df.assign(Actors=summary_actor_lists).explode("Actors")
actor_ratings = (
    df_exploded.groupby("Actors")["Rating"]
    .mean()
    .sort_values(ascending=False)
)

print("\nTop 5 Actors with Highest Average Ratings:")
print(actor_ratings.head(5))

# Episode counts: mean, plus the rows with the largest and smallest values
episode_counts = df["Number of Episodes"]
avg_episodes = episode_counts.mean()
longest_drama = df.loc[episode_counts.idxmax()]
shortest_drama = df.loc[episode_counts.idxmin()]
print(f"\nAverage Number of Episodes per Drama: {avg_episodes:.2f}")
print(f"Longest Drama: {longest_drama['Title']} ({longest_drama['Number of Episodes']} episodes)")
print(f"Shortest Drama: {shortest_drama['Title']} ({shortest_drama['Number of Episodes']} episodes)")

# Tag that occurs most often
most_common_tag = _most_frequent_item("Tags")
print(f"Most Common Tag: {most_common_tag}")

# Release year with the most rows, and how many rows it has
release_years = df["Year of release"]
most_active_year = release_years.mode()[0]
dramas_in_active_year = df[release_years == most_active_year].shape[0]
print(f"Year with Most Dramas: {most_active_year} ({dramas_in_active_year} dramas released)")

# Off-diagonal entry of the 2x2 correlation matrix of Rating and Number of Episodes
rating_episode_corr = df[["Rating", "Number of Episodes"]].corr()
correlation = rating_episode_corr.iloc[0, 1]
print(f"Correlation Between Rating and Number of Episodes: {correlation:.2f}")
