# LinkedGen LinkedIn Post Dataset - Exploratory Data Analysis (EDA)

This notebook provides an exploratory data analysis (EDA) of the processed LinkedIn post dataset. We analyze the train and validation splits, visualize distributions, and explore key patterns in the data.

## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set plotting style: apply seaborn's 'whitegrid' style for the plots drawn below
sns.set(style='whitegrid')
# IPython magic: render matplotlib figures inline in the notebook output
%matplotlib inline

## 2. Load Train and Validation Data

In [None]:
# Locations of the two splits, relative to this notebook's working directory
train_path = os.path.join('..', 'data', 'train.csv')
val_path = os.path.join('..', 'data', 'val.csv')

# Both files are read with header=None, i.e. their first line is treated as
# data, so the column names are supplied explicitly here.
column_names = ["text", "category", "tone"]
train_df = pd.read_csv(train_path, header=None, names=column_names)
val_df = pd.read_csv(val_path, header=None, names=column_names)

# Preview the first rows of each split
for heading, frame in (('Train set:', train_df), ('Validation set:', val_df)):
    print(heading)
    display(frame.head())

## 3. Overview of Dataset Structure

In [None]:
# Report the same three facts for each split: dimensions, column names, and
# the number of missing values per column.
for split_name, split_df in (('Train', train_df), ('Validation', val_df)):
    print(f'{split_name} set shape:', split_df.shape)
    print(f'{split_name} columns:', list(split_df.columns))
    print(f'{split_name} missing values:')
    display(split_df.isna().sum())

## 4. Distribution of Categories

In [None]:
# One bar chart per split, with bars sorted from most to least frequent
# category within that split.
for split_name, split_df in (('Train', train_df), ('Validation', val_df)):
    category_order = split_df['category'].value_counts().index
    plt.figure(figsize=(8, 4))
    sns.countplot(x='category', data=split_df, order=category_order)
    plt.title(f'Category Distribution ({split_name})')
    plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
    plt.tight_layout()
    plt.show()

## 5. Distribution of Tone

In [None]:
# One bar chart per split, with bars sorted from most to least frequent tone
# within that split.
for split_name, split_df in (('Train', train_df), ('Validation', val_df)):
    tone_order = split_df['tone'].value_counts().index
    plt.figure(figsize=(8, 4))
    sns.countplot(x='tone', data=split_df, order=tone_order)
    plt.title(f'Tone Distribution ({split_name})')
    plt.xticks(rotation=45)  # tilt the tick labels by 45 degrees
    plt.tight_layout()
    plt.show()

## 6. Text Length Analysis

In [None]:
# Compute text lengths (number of words and characters) for the train split.
# Missing posts are replaced by an empty string first: converting NaN with
# str() would yield the literal 'nan' and count it as 1 word / 3 characters,
# which would distort the lower end of both distributions.
train_text = train_df['text'].fillna('').astype(str)
train_df['text_word_count'] = train_text.str.split().str.len()  # whitespace-separated tokens
train_df['text_char_count'] = train_text.str.len()

# Plot the word-count and character-count distributions with identical settings
for count_col, unit in (('text_word_count', 'Word'), ('text_char_count', 'Character')):
    plt.figure(figsize=(8, 4))
    sns.histplot(train_df[count_col], bins=30, kde=True)
    plt.title(f'Text {unit} Count Distribution (Train)')
    plt.xlabel(f'Number of {unit}s')
    plt.tight_layout()
    plt.show()

## 7. Category vs. Tone Crosstab

In [None]:
# Count how often each (category, tone) pair occurs in the train split.
# The result is kept in `crosstab` because the figure-saving cell reuses it.
crosstab = pd.crosstab(index=train_df['category'], columns=train_df['tone'])

# Draw the counts as an annotated heatmap (fmt='d' prints them as integers)
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(crosstab, cmap='Blues', annot=True, fmt='d', ax=ax)
ax.set_title('Category vs. Tone Crosstab (Train)')
ax.set_xlabel('Tone')
ax.set_ylabel('Category')
fig.tight_layout()
plt.show()

## 8. Save the Heatmap Figure to Disk

In [None]:
# Example: re-draw the category/tone heatmap, write it to a PNG file, and
# display it. The figure is rebuilt here rather than reused from the cell
# that first drew it.
output_dir = '../eda/figures'
fig_path = os.path.join(output_dir, 'category_vs_tone_heatmap.png')
os.makedirs(output_dir, exist_ok=True)  # create the target directory if needed

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(crosstab, cmap='Blues', annot=True, fmt='d', ax=ax)
ax.set_title('Category vs. Tone Crosstab (Train)')
ax.set_xlabel('Tone')
ax.set_ylabel('Category')
fig.tight_layout()
fig.savefig(fig_path)  # save before show()
plt.show()
print(f'Figure saved to {fig_path}')

## 9. Conclusion

Based on the results of this EDA, the dataset appears to be **well structured and organized**, so we are ready to move on to model training.