# 📊 Exploratory Data Analysis (EDA)
This notebook performs an initial exploratory analysis of the MovieLens ratings data. It includes summary statistics, distribution plots, and a filtering step whose thresholds are meant to guide the recommender system design.

## 1. 📥 Load Data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the ratings CSV into a DataFrame and preview its first rows.
ratings_path = '../data/ratings.csv'
df = pd.read_csv(ratings_path)
df.head()

## 2. 📋 Basic Info and Summary

In [None]:
# Print the frame's structure: columns, dtypes and non-null counts.
df.info()

# Descriptive statistics (pandas' default column selection), shown as the
# cell's output because it is the last expression.
summary_stats = df.describe()
summary_stats

## 3. ⭐ Rating Distribution

In [None]:
# Histogram of the `rating` column in 10 bins, without a KDE overlay.
# Drawn on an explicit Axes rather than through the implicit pyplot state.
fig, ax = plt.subplots()
sns.histplot(df['rating'], bins=10, kde=False, ax=ax)
ax.set_title('Rating Distribution')
ax.set_xlabel('Rating')
ax.set_ylabel('Count')
plt.show()

## 4. 👤 Unique Users and Movies

In [None]:
# Report the number of distinct values in the userId and movieId columns.
for label, column in (("Unique users:", 'userId'), ("Unique movies:", 'movieId')):
    print(label, df[column].nunique())

## 5. 📈 Ratings per User Distribution

In [None]:
# Number of rows per userId (presumably one row per rating — confirm against
# the dataset). `user_counts` is reused by the filtering cell further down.
user_counts = df['userId'].value_counts()

# Distribution of those per-user counts as a 50-bin histogram.
fig, ax = plt.subplots()
sns.histplot(user_counts, bins=50, ax=ax)
ax.set(
    title='Number of Ratings per User',
    xlabel='Ratings Count',
    ylabel='Users',
)
plt.show()

## 6. 🎬 Ratings per Movie Distribution

In [None]:
# Number of rows per movieId (presumably one row per rating — confirm against
# the dataset). `movie_counts` is reused by the filtering cell further down.
movie_counts = df['movieId'].value_counts()

# Distribution of those per-movie counts as a 50-bin histogram.
fig, ax = plt.subplots()
sns.histplot(movie_counts, bins=50, ax=ax)
ax.set(
    title='Number of Ratings per Movie',
    xlabel='Ratings Count',
    ylabel='Movies',
)
plt.show()

## 7. 🧹 Filtering Threshold Justification
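Based on the distributions above, we filter out users and movies with fewer than 10 ratings. The counts are computed on the full, unfiltered dataset and applied in a single pass, so a user or movie that is kept may end up with fewer than 10 ratings in the filtered data.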

In [None]:
# Minimum number of ratings a user / a movie needs in order to be kept.
# Named here so the thresholds can be tuned in one place.
MIN_RATINGS_PER_USER = 10
MIN_RATINGS_PER_MOVIE = 10

# Ids that meet each threshold, using the counts taken from the unfiltered `df`.
kept_user_ids = user_counts[user_counts >= MIN_RATINGS_PER_USER].index
kept_movie_ids = movie_counts[movie_counts >= MIN_RATINGS_PER_MOVIE].index

# A row survives only when both its user and its movie pass their threshold.
# NOTE(review): this is a single pass — after dropping rows, a kept user or
# movie can have fewer ratings left than its threshold. Re-apply the filter
# iteratively if a strict guarantee is needed.
keep_row = df['userId'].isin(kept_user_ids) & df['movieId'].isin(kept_movie_ids)
filtered_df = df[keep_row]

# (rows, columns) of the filtered frame, shown as the cell's output.
filtered_df.shape