# Data Analyzer and Visualizer
## Books.toscrape.com ETL Project - TTTC3213

**Visualizations:**
1. Before/After data cleaning comparison
2. Price distribution by category
3. Rating distribution
4. Best value books analysis (project goal)
5. Category analysis

## 1. Import Dependencies

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import numpy as np

# Global look for every figure in this notebook: the 'seaborn-v0_8-whitegrid'
# matplotlib style sheet and seaborn's "husl" colour palette.
plt.style.use(style='seaborn-v0_8-whitegrid')
sns.set_palette(palette="husl")

## 2. Load Data

In [None]:
# All inputs and outputs live one level above the current working directory.
parent_dir = os.path.join(os.getcwd(), '..')

# Locations of the raw and cleaned CSV exports.
raw_path = os.path.join(parent_dir, 'data', 'raw_books.csv')
cleaned_path = os.path.join(parent_dir, 'data', 'cleaned_books.csv')

# Read both datasets; the paths are normalised to absolute form first.
df_raw = pd.read_csv(os.path.abspath(raw_path))
df_clean = pd.read_csv(os.path.abspath(cleaned_path))

print(f"Loaded raw data: {len(df_raw)} records")
print(f"Loaded cleaned data: {len(df_clean)} records")

# Folder that receives the saved PNG figures; created if it does not exist.
viz_dir = os.path.abspath(os.path.join(parent_dir, 'visualizations'))
os.makedirs(viz_dir, exist_ok=True)

## 3. Visualization 1: Before/After Cleaning Comparison

In [None]:
# Figure 1: 2x2 overview of what the cleaning step changed.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Before vs After Data Cleaning Comparison', fontsize=16, fontweight='bold')

# Top-left: schematic only (not data-driven). Two unit-height bars that
# carry text labels showing the price as a string before and a float after.
ax1 = axes[0, 0]
colors = ['#ff6b6b', '#4ecdc4']
ax1.bar([0], [1], color=colors[0], label='Before', width=0.35)
ax1.bar([0.35], [1], color=colors[1], label='After', width=0.35)
ax1.set_xticks([0.175])
ax1.set_xticklabels(['Price Data Type'])
ax1.set_title('Price Data Transformation')
ax1.text(0, 0.5, 'String\n"£51.77"', ha='center', va='center', fontsize=10, color='white', fontweight='bold')
ax1.text(0.35, 0.5, 'Float\n51.77', ha='center', va='center', fontsize=10, color='white', fontweight='bold')
ax1.legend()

# Top-right: grouped bars of missing-value counts per field.
# NOTE(review): these counts are hard-coded rather than computed from
# df_raw / df_clean, because the raw column names are not visible here.
# Confirm they still match the data, or replace them with isna().sum().
ax2 = axes[0, 1]
categories = ['Prices', 'Descriptions', 'Categories']
x = np.arange(len(categories))
ax2.bar(x - 0.175, [0, 1, 0], 0.35, label='Before Cleaning', color='#ff6b6b')
ax2.bar(x + 0.175, [0, 0, 0], 0.35, label='After Cleaning', color='#4ecdc4')
ax2.set_xticks(x)
ax2.set_xticklabels(categories)
ax2.set_title('Missing Values Before vs After')
ax2.legend()

# Bottom-left: column dtypes of the raw file, counted from the loaded frame
# so the chart always reflects the data actually read from disk.
ax3 = axes[1, 0]
raw_dtype_counts = df_raw.dtypes.astype(str).value_counts()
ax3.pie(raw_dtype_counts.values, labels=raw_dtype_counts.index, autopct='%1.1f%%',
        colors=['#ff6b6b', '#45b7d1', '#96ceb4', '#ffeaa7'])
ax3.set_title('Data Types Before Cleaning')

# Bottom-right: the same dtype breakdown for the cleaned file.
# pie() cycles through the colour list, so any number of dtypes is fine.
ax4 = axes[1, 1]
clean_dtype_counts = df_clean.dtypes.astype(str).value_counts()
ax4.pie(clean_dtype_counts.values, labels=clean_dtype_counts.index,
        autopct='%1.1f%%', colors=['#4ecdc4', '#45b7d1', '#96ceb4', '#ffeaa7'])
ax4.set_title('Data Types After Cleaning')

plt.tight_layout()
plt.savefig(os.path.join(viz_dir, '01_before_after_cleaning.png'), dpi=150, bbox_inches='tight')
plt.show()

## 4. Visualization 2: Price Distribution

In [None]:
# Figure 2: four views of how book prices are distributed.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Price Distribution Analysis', fontsize=16, fontweight='bold')

# Top-left: histogram of all prices with mean and median marker lines.
ax1 = axes[0, 0]
prices = df_clean['price_clean']
mean_price = prices.mean()
median_price = prices.median()
ax1.hist(prices, bins=20, edgecolor='black', color='#4ecdc4', alpha=0.7)
ax1.axvline(mean_price, color='red', linestyle='--', label=f'Mean: £{mean_price:.2f}')
ax1.axvline(median_price, color='orange', linestyle='--', label=f'Median: £{median_price:.2f}')
ax1.set_title('Overall Price Distribution')
ax1.set_xlabel('Price (£)')
ax1.set_ylabel('Number of Books')
ax1.legend()

# Top-right: price box plots for the ten categories with the most books,
# ordered left to right by ascending median price.
ax2 = axes[0, 1]
top_categories = df_clean['category_clean'].value_counts().head(10).index.tolist()
in_top_categories = df_clean['category_clean'].isin(top_categories)
df_top = df_clean[in_top_categories]
median_by_category = df_top.groupby('category_clean')['price_clean'].median()
category_order = median_by_category.sort_values().index
sns.boxplot(
    data=df_top,
    x='category_clean',
    y='price_clean',
    order=category_order,
    ax=ax2,
    palette='viridis',
)
ax2.set_title('Price by Top 10 Categories')
ax2.set_xlabel('Category')
ax2.set_ylabel('Price (£)')
ax2.tick_params(axis='x', rotation=45)

# Bottom-left: share of books in each value of the price_category column.
ax3 = axes[1, 0]
price_cat_counts = df_clean['price_category'].value_counts()
ax3.pie(
    price_cat_counts.values,
    labels=price_cat_counts.index,
    autopct='%1.1f%%',
    colors=['#4ecdc4', '#45b7d1', '#ff6b6b', '#f7dc6f'],
    startangle=90,
)
ax3.set_title('Books by Price Category')

# Bottom-right: the ten categories with the highest mean price.
ax4 = axes[1, 1]
mean_by_category = df_clean.groupby('category_clean')['price_clean'].mean()
avg_prices = mean_by_category.sort_values(ascending=True).tail(10)
avg_prices.plot(kind='barh', ax=ax4, color='#4ecdc4', edgecolor='black')
ax4.set_title('Top 10 Most Expensive Categories')
ax4.set_xlabel('Average Price (£)')

plt.tight_layout()
plt.savefig(os.path.join(viz_dir, '02_price_distribution.png'), dpi=150, bbox_inches='tight')
plt.show()

## 5. Visualization 3: Rating Distribution

In [None]:
# Figure 3: four views of the star ratings.
fig, axes = plt.subplots(2, 2, figsize=(14, 10))
fig.suptitle('Rating Distribution Analysis', fontsize=16, fontweight='bold')

# Top-left: number of books per star rating.
ax1 = axes[0, 0]
rating_counts = df_clean['rating'].value_counts().sort_index()
# Colour is looked up by star value rather than by bar position, so each
# rating keeps its colour even when some star level is absent from the data.
# Ratings outside 1-5 are still drawn, in neutral grey.
star_colors = {1: '#ff6b6b', 2: '#ffa07a', 3: '#ffd93d', 4: '#6bcb77', 5: '#4ecdc4'}
colors = [star_colors.get(star, '#b0b0b0') for star in rating_counts.index]
bars = ax1.bar(rating_counts.index, rating_counts.values, color=colors, edgecolor='black')
ax1.set_xlabel('Rating (Stars)')
ax1.set_ylabel('Number of Books')
ax1.set_title('Distribution of Book Ratings')
ax1.set_xticks([1, 2, 3, 4, 5])

# Top-right: mean rating for the eight categories with the most books,
# sorted ascending so the best-rated category is drawn last.
ax2 = axes[0, 1]
top_categories = df_clean['category_clean'].value_counts().head(8).index.tolist()
df_top = df_clean[df_clean['category_clean'].isin(top_categories)]
avg_rating_by_cat = df_top.groupby('category_clean')['rating'].mean().sort_values(ascending=True)
avg_rating_by_cat.plot(kind='barh', ax=ax2, color=plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(avg_rating_by_cat))), edgecolor='black')
ax2.set_xlabel('Average Rating')
ax2.set_title('Average Rating by Category')
ax2.set_xlim(0, 5)

# Bottom-left: share of books per rating_category label, in a fixed order;
# a label missing from the data contributes a zero-sized slice.
ax3 = axes[1, 0]
rating_cat_counts = df_clean['rating_category'].value_counts()
order = ['High (4-5 stars)', 'Medium (3 stars)', 'Low (1-2 stars)']
ordered_counts = [rating_cat_counts.get(cat, 0) for cat in order]
ax3.pie(ordered_counts, labels=order, autopct='%1.1f%%', colors=['#6bcb77', '#ffd93d', '#ff6b6b'], startangle=90)
ax3.set_title('Books by Rating Category')

# Bottom-right: price against rating, with points coloured by rating.
ax4 = axes[1, 1]
scatter = ax4.scatter(df_clean['price_clean'], df_clean['rating'], c=df_clean['rating'], cmap='RdYlGn', alpha=0.6, edgecolors='black', linewidth=0.5)
ax4.set_xlabel('Price (£)')
ax4.set_ylabel('Rating')
ax4.set_title('Price vs Rating')
plt.colorbar(scatter, ax=ax4, label='Rating')

plt.tight_layout()
plt.savefig(os.path.join(viz_dir, '03_rating_distribution.png'), dpi=150, bbox_inches='tight')
plt.show()

## 6. Visualization 4: Best Value Analysis (PROJECT GOAL)

In [None]:
# Figure 4 (project goal): which books and categories offer the best value.
fig, axes = plt.subplots(2, 2, figsize=(14, 12))
fig.suptitle('PROJECT GOAL: Best Value Books Analysis', fontsize=16, fontweight='bold')

# Top-left: histogram of value_score with its mean marked.
ax1 = axes[0, 0]
mean_value_score = df_clean['value_score'].mean()
ax1.hist(df_clean['value_score'], bins=25, edgecolor='black', color='#4ecdc4', alpha=0.7)
ax1.axvline(mean_value_score, color='red', linestyle='--', label=f'Mean: {mean_value_score:.2f}')
ax1.set_xlabel('Value Score')
ax1.set_ylabel('Number of Books')
ax1.set_title('Value Score Distribution')
ax1.legend()

# Top-right: the 15 books with the highest value_score.
ax2 = axes[0, 1]
top_value = df_clean.nlargest(15, 'value_score')[['title_clean', 'value_score', 'rating', 'price_clean']]
# .assign returns a new frame, which avoids writing a column onto a slice
# of df_clean (pandas may warn about that with SettingWithCopyWarning).
top_value = top_value.assign(
    title_short=top_value['title_clean'].apply(lambda x: x[:30] + '...' if len(x) > 30 else x)
)
colors = plt.cm.Greens(np.linspace(0.4, 0.9, len(top_value)))[::-1]
# Bars are placed at numeric positions and labelled afterwards. Using the
# truncated titles as the y values would stack two books on one bar
# whenever their first 30 characters are the same.
bar_positions = np.arange(len(top_value))
ax2.barh(bar_positions, top_value['value_score'], color=colors, edgecolor='black')
ax2.set_yticks(bar_positions)
ax2.set_yticklabels(top_value['title_short'])
ax2.set_xlabel('Value Score')
ax2.set_title('Top 15 Best Value Books')

# Bottom-left: mean value_score for the ten categories with the most books.
ax3 = axes[1, 0]
top_categories = df_clean['category_clean'].value_counts().head(10).index.tolist()
df_top = df_clean[df_clean['category_clean'].isin(top_categories)]
avg_value_by_cat = df_top.groupby('category_clean')['value_score'].mean().sort_values(ascending=True)
avg_value_by_cat.plot(kind='barh', ax=ax3, color=plt.cm.RdYlGn(np.linspace(0.2, 0.8, len(avg_value_by_cat))), edgecolor='black')
ax3.set_xlabel('Average Value Score')
ax3.set_title('Best Value Categories')

# Bottom-right: price/rating scatter split into quadrants at the two means.
ax4 = axes[1, 1]
avg_price = df_clean['price_clean'].mean()
avg_rating = df_clean['rating'].mean()
rating_at_or_above_avg = (df_clean['rating'] >= avg_rating).to_numpy()
price_at_or_below_avg = (df_clean['price_clean'] <= avg_price).to_numpy()
# Vectorized quadrant colouring. np.select takes the first matching
# condition, so the priority is: good rating and good price, then good
# rating only, then good price only. Everything else (including rows where
# a comparison is False because of NaN) gets the default colour.
colors = np.select(
    [rating_at_or_above_avg & price_at_or_below_avg, rating_at_or_above_avg, price_at_or_below_avg],
    ['#4ecdc4', '#45b7d1', '#ffd93d'],
    default='#ff6b6b',
).tolist()
ax4.scatter(df_clean['price_clean'], df_clean['rating'], c=colors, alpha=0.6, edgecolors='black', linewidth=0.5)
ax4.axhline(avg_rating, color='gray', linestyle='--', alpha=0.7)
ax4.axvline(avg_price, color='gray', linestyle='--', alpha=0.7)
ax4.set_xlabel('Price (£)')
ax4.set_ylabel('Rating')
ax4.set_title('Value Quadrant Analysis')

plt.tight_layout()
plt.savefig(os.path.join(viz_dir, '04_best_value_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

## 7. Best Value Summary

In [None]:
# Text summary of the best-value findings.

# Ten books with the highest value_score, shown as a table.
print("Top 10 Best Value Books:")
summary_columns = ['title_clean', 'category_clean', 'price_clean', 'rating', 'value_score']
top_10_value = df_clean.nlargest(10, 'value_score')[summary_columns]
display(top_10_value)

# Five categories with the highest mean value_score.
print("\nTop 5 Best Value Categories:")
mean_value_by_category = df_clean.groupby('category_clean')['value_score'].mean()
best_cat = mean_value_by_category.sort_values(ascending=False).head(5)
for category_name, mean_score in best_cat.items():
    print(f"  - {category_name}: {mean_score:.2f}")

# Count the books rated at or above the mean rating and priced at or below
# the mean price, and report them as a share of all cleaned records.
avg_price = df_clean['price_clean'].mean()
avg_rating = df_clean['rating'].mean()
in_best_quadrant = (df_clean['rating'] >= avg_rating) & (df_clean['price_clean'] <= avg_price)
best_value_count = len(df_clean[in_best_quadrant])
best_value_share = best_value_count / len(df_clean) * 100
print(f"\nBooks in 'Best Value' quadrant: {best_value_count} ({best_value_share:.1f}%)")

## 8. Visualization 5: Category Analysis

In [None]:
# Figure 5: category sizes next to a small statistics table.
fig, axes = plt.subplots(1, 2, figsize=(14, 6))
fig.suptitle('Category Analysis', fontsize=16, fontweight='bold')

# Left: the 15 categories with the most books.
ax1 = axes[0]
category_counts = df_clean['category_clean'].value_counts().head(15)
count_bar_colors = plt.cm.viridis(np.linspace(0.2, 0.8, len(category_counts)))[::-1]
category_counts.plot(kind='barh', ax=ax1, color=count_bar_colors, edgecolor='black')
ax1.set_title('Top 15 Categories by Book Count')
ax1.set_xlabel('Number of Books')

# Right: mean price, rating and value score for the eight largest
# categories, drawn as a table on an axis with its frame hidden.
ax2 = axes[1]
top_cats = df_clean['category_clean'].value_counts().head(8).index
df_top = df_clean[df_clean['category_clean'].isin(top_cats)]
mean_columns = {'price_clean': 'mean', 'rating': 'mean', 'value_score': 'mean'}
summary = df_top.groupby('category_clean').agg(mean_columns).round(2)
ax2.axis('off')

header_row = ['Category', 'Avg Price (£)', 'Avg Rating', 'Value Score']
# Category names are cut to 20 characters to keep the first column narrow.
body_rows = [
    [cat[:20], f'£{row["price_clean"]:.2f}', f'{row["rating"]:.1f}', f'{row["value_score"]:.2f}']
    for cat, row in summary.iterrows()
]
table_data = [header_row] + body_rows

table = ax2.table(cellText=table_data, loc='center', cellLoc='center')
table.auto_set_font_size(False)
table.set_fontsize(10)
table.scale(1.2, 1.5)
# Highlight the header cells (row 0) with the accent colour.
for column_index in range(len(header_row)):
    table[(0, column_index)].set_facecolor('#4ecdc4')
ax2.set_title('Category Statistics Summary')

plt.tight_layout()
plt.savefig(os.path.join(viz_dir, '05_category_analysis.png'), dpi=150, bbox_inches='tight')
plt.show()

print("\nAll visualizations generated successfully!")