# 📊 Exploratory Data Analysis (EDA) - TikTok Claim Classification

This notebook explores the dataset used to classify TikTok video content as either a **claim** or an **opinion**. The goal is to understand the feature distributions and the relationships between features before building a machine learning model.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Read the sample CSV (path is relative to this notebook) into a DataFrame,
# then preview the first rows as the cell's output.
DATA_PATH = '../data/sample_data.csv'
df = pd.read_csv(DATA_PATH)
df.head()

## 🔍 Data Overview

In [None]:
# Print a concise summary of the frame: index range, each column's dtype and
# non-null count, and memory usage. Useful for spotting missing values early.
df.info()

## 📈 Basic Statistics

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max). With default
# arguments pandas summarises only the numeric columns when any are present.
df.describe()

## 🎯 Label Distribution

In [None]:
# Bar chart with one bar per distinct value of the 'label' column, showing how
# many rows carry that value. The title is set on the Axes seaborn returns.
sns.countplot(x='label', data=df).set_title('Distribution of Labels')
plt.show()

## 📊 Feature Distributions

In [None]:
# Numeric feature columns; this list is reused by the correlation cell below.
num_cols = ['likes', 'comments', 'shares', 'duration_sec']

# One histogram panel per listed column, drawn on a single 12x8 figure.
df.hist(column=num_cols, bins=20, figsize=(12, 8))
plt.suptitle('Numerical Feature Distributions')
plt.show()

## 🔗 Correlation Heatmap

In [None]:
# Columns to correlate: the numeric features plus the two keyword-flag columns.
corr_cols = num_cols + ['contains_hashtag_claim', 'contains_fact_check_words']

# Pairwise correlation matrix using pandas' default correlation method.
corr_matrix = df[corr_cols].corr()

# Annotated heatmap; the title goes on the Axes that seaborn drew into.
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_ax.set_title('Correlation Heatmap')
plt.show()