# EDA for Paired Dataset (paired_affirmations.csv)

# Imports

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud
from transformers import GPT2TokenizerFast
import re

## Load and Preview Data

In [None]:
# Load the paired dataset from the CSV in the current working directory.
df = pd.read_csv("paired_affirmations.csv")

# Only the last expression of a notebook cell is rendered automatically, so
# the preview is printed explicitly to keep it from being silently dropped.
print(df.head())

# info() writes its summary (columns, dtypes, non-null counts) to stdout.
df.info()

## Check for Missing or Duplicated Values

In [None]:
# Count missing values per column. `isnull` is a method and must be called;
# `df.isnull.sum()` raises AttributeError.
print(df.isnull().sum())

# Count fully duplicated rows. Printed explicitly so both results are shown,
# not just the last expression of the cell.
print(df.duplicated().sum())

## Basic Text Stats

In [None]:
# Character length of each text field; values are cast to str first so that
# every entry has a length.
input_text = df['Input'].astype(str)
output_text = df['Output'].astype(str)
df['input_len'] = input_text.map(len)
df['output_len'] = output_text.map(len)

# Summary statistics for both length columns (rendered as the cell output).
length_columns = ['input_len', 'output_len']
df[length_columns].describe()

## Visualization

In [None]:
# One histogram (with KDE overlay) per length column; each figure is saved to
# the EDA results folder and then shown.
histogram_specs = (
    ('input_len', "Input Length Distribution",
     "../results/eda/input_length_distribution.png"),
    ('output_len', "Output Length Distribution",
     "../results/eda/output_length_distribution.png"),
)

for column, title, out_path in histogram_specs:
    ax = sns.histplot(df[column], kde=True, bins=30)
    ax.set(title=title)
    plt.savefig(out_path, bbox_inches='tight')
    plt.show()

## Tag Distributions

In [None]:
# Bar chart of how often each emotion label occurs.
df['Emotion_Label'].value_counts().plot(kind='bar', title="Emotion Label Distribution")
plt.savefig("../results/eda/emotion_label_distribution.png", bbox_inches='tight')
plt.show()

# Bar chart of how often each affirmation tag occurs.
# (`title=` previously lacked the `=`, which was a SyntaxError.)
df['Affirmation_Tag'].value_counts().plot(kind='bar', title="Affirmation Tag Distribution")
plt.savefig("../results/eda/affirmation_tag_distribution.png", bbox_inches='tight')
plt.show()

## Most Common Words

In [None]:
def _render_wordcloud(cloud, title, out_path):
    """Draw a generated word cloud without axes, save it, then show it."""
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.title(title)
    plt.savefig(out_path, bbox_inches='tight')
    plt.show()


# Word cloud built from every Input text joined into one string.
all_inputs = " ".join(df['Input'].astype(str))
input_wc = WordCloud(background_color='white', width=800, height=400).generate(all_inputs)
_render_wordcloud(
    input_wc,
    "WordCloud - Input Texts",
    "../results/eda/wordCloud_input_texts.png",
)

# Same for the Output texts. The variable names are reused from above, so
# after this cell they hold the Output corpus and its cloud.
all_inputs = " ".join(df['Output'].astype(str))
input_wc = WordCloud(background_color='white', width=800, height=400).generate(all_inputs)
_render_wordcloud(
    input_wc,
    "WordCloud - Output Texts",
    "../results/eda/wordCloud_output_texts.png",
)

## Tag vs. Input/Output Length

In [None]:
# Box plot of input character length, grouped by affirmation tag.
ax = sns.boxplot(x='Affirmation_Tag', y='input_len', data=df)
ax.set_title("Input Length by Affirmation Tag")
# Rotate the tag names on the x-axis by 45 degrees.
plt.xticks(rotation=45)
plt.savefig(
    "../results/eda/input_length_by_affirmation_tag.png",
    bbox_inches='tight',
)
plt.show()

## Distribution of Combined Lengths

In [None]:
# Combined character length of each Input/Output pair.
df['total_len'] = df['input_len'].add(df['output_len'])

# Histogram (with KDE overlay) of the combined lengths.
ax = sns.histplot(df['total_len'], kde=True, bins=30)
ax.set_title("Total Input + Output Length Distribution")
plt.savefig(
    "../results/eda/total_input_output_length_distribution.png",
    bbox_inches='tight',
)
plt.show()

## Heatmap of Emotions vs. Affirmation_Tag

In [None]:
# Contingency table: rows are emotion labels, columns are affirmation tags,
# cells are co-occurrence counts.
heatmap_data = pd.crosstab(df['Emotion_Label'], df['Affirmation_Tag'])

# Annotate each cell with its integer count. The colormap name is "YlGnBu"
# (lowercase L, not the digit 1); the frame variable was previously misspelled.
sns.heatmap(heatmap_data, annot=True, fmt='d', cmap="YlGnBu")
plt.title("Emotion vs. Affirmation Tag Frequency")
plt.savefig("../results/eda/emotion_vs_tag_heatmap.png", bbox_inches='tight')
plt.show()

## Length Threshold Coverage
How many pairs exceed 256 or 512 characters in combined length? (This is a character-based check; exact GPT-2 token counts are computed in the next section.)

In [None]:
# Combined character length per pair, stored in the frame as 'total_len'.
combined_len = df['input_len'].add(df['output_len'])
df['total_len'] = combined_len

# Number of pairs whose combined character length exceeds each threshold.
over_256 = (combined_len > 256).sum()
over_512 = (combined_len > 512).sum()
print("Pairs >256 characters:", over_256)
print("Pairs >512 characters:", over_512)

## Token Length Analysis (GPT-2 Limit Awareness)

In [None]:
# Load the pretrained GPT-2 tokenizer (the method is `from_pretrained`).
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

# Token count for each text field. Values are cast to str first, matching the
# character-length cell, so non-string entries do not break `encode`.
df['num_tokens_input'] = df['Input'].astype(str).apply(lambda x: len(tokenizer.encode(x)))
df['num_tokens_output'] = df['Output'].astype(str).apply(lambda x: len(tokenizer.encode(x)))
df['total_tokens'] = df['num_tokens_input'] + df['num_tokens_output']

# Histogram of total tokens
sns.histplot(df['total_tokens'], bins=30, kde=True)
plt.title("Total Token Count per Input+Output Pair")
plt.savefig("../results/eda/total_token_distribution.png", bbox_inches='tight')
plt.show()

# Count pairs above each token threshold (the `.sum()` call previously lacked
# its dot, which was a SyntaxError).
print("Pairs over 512 tokens: ", (df['total_tokens'] > 512).sum())
print("Pairs over 1024 tokens (GPT-2 limit): ", (df['total_tokens'] > 1024).sum())