# Machine Learning - Assignment 2
# Section A: Data Exploration & Visualization

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# Set style for better visualizations.
# Matplotlib deprecated the bare 'seaborn' style name in 3.6 (renaming it to
# 'seaborn-v0_8') and dropped it in later releases, where the old name raises
# OSError. Use the new name when this installation provides it and fall back
# to the legacy name on older Matplotlib versions.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')

In [None]:
# Read the raw CSV into the working DataFrame
raw_csv_path = "../data/Spotify_Youtube.csv"
df = pd.read_csv(raw_csv_path)

# Report the (rows, columns) shape, then preview the top of the table
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
print("\nFirst few rows:")
head_preview = df.head()
display(head_preview)

In [None]:
# Drop unused columns.
# The drop happens in place, so re-running this cell would otherwise raise
# KeyError because the columns are already gone; errors='ignore' makes the
# cell safe to execute more than once.
cols_to_drop = ['Unnamed: 0', 'Url_spotify', 'Uri', 'Url_youtube', 'Title', 'Description']
df.drop(columns=cols_to_drop, inplace=True, errors='ignore')

# Combine 'compilation' into 'album' so both share a single category label
df['Album_type'] = df['Album_type'].replace({'compilation': 'album'})

# Basic statistics summary
print("Basic Statistics:")
display(df.describe())

## Distribution Analysis

In [None]:
# Histogram of Danceability, split by album type with a KDE overlay
plt.figure(figsize=(8, 5))
hist_ax = sns.histplot(data=df, x='Danceability', bins=30, kde=True, hue='Album_type')
plt.title("Distribution of Danceability by Album Type")
plt.xlabel("Danceability")
plt.ylabel("Count")
# seaborn already attaches the hue legend to the axes. Calling plt.legend()
# here would rebuild the legend from labelled artists, find none, and replace
# seaborn's legend with an empty one, so retitle the existing legend instead.
hue_legend = hist_ax.get_legend()
if hue_legend is not None:
    hue_legend.set_title('Album Type')
plt.tight_layout()
plt.show()

In [None]:
# Bar Plot: Average Views by Album Type
# Mean of 'Views' per album type, ordered from smallest to largest
mean_views_by_type = df.groupby('Album_type')['Views'].mean()
avg_views = mean_views_by_type.sort_values()

avg_views.plot.bar(title='Average YouTube Views by Album Type')
plt.ylabel("Average Views")
plt.xticks(rotation=0)  # keep the category labels horizontal
plt.tight_layout()
plt.show()

In [None]:
# Box Plot: Energy by Album Type
plt.figure(figsize=(8, 5))
sns.boxplot(data=df, x='Album_type', y='Energy')

# Title and axis labels
plt.title("Distribution of Energy by Album Type")
plt.xlabel("Album Type")
plt.ylabel("Energy")

plt.tight_layout()
plt.show()

In [None]:
# Scatter Plot: Valence vs Danceability
# Points are coloured by album type and drawn semi-transparent
scatter_options = dict(
    x='Valence',
    y='Danceability',
    color='Album_type',
    title='Valence vs Danceability by Album Type',
    opacity=0.6,
)
fig = px.scatter(df, **scatter_options)
fig.show()

In [None]:
# Pie Chart: Distribution of Album Types
# Count the rows for each album type, then draw each count as a labelled slice
album_counts = df['Album_type'].value_counts()
album_counts.plot.pie(
    autopct='%1.1f%%',
    startangle=90,
    title='Album Type Distribution',
)
plt.ylabel('')  # clear the y-axis label on the pie
plt.tight_layout()
plt.show()

In [None]:
# Save processed dataset for later steps.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; otherwise the write fails on a fresh checkout.
from pathlib import Path

processed_csv_path = Path("../output/Processed_Spotify_Youtube.csv")
processed_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_csv_path, index=False)