In [None]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns


import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Data Loading

In [None]:
# The CSV is semicolon-delimited and read as UTF-8.
df = pd.read_csv(
    '/kaggle/input/top-200-spotify-songs-dataset/Spotify_Dataset_V3.csv',
    sep=';',
    encoding='utf-8',
)
# Preview the first ten rows.
df.head(10)


# Data Info

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Print a concise summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the frame's columns.
df.describe()

In [None]:
# Pairwise correlation matrix of the numeric columns.
# The frame also carries text columns (e.g. Artists, Nationality, Continent).
# Since pandas 2.0 `DataFrame.corr()` no longer drops those silently and raises
# instead, so restrict the frame to numeric dtypes explicitly. `select_dtypes`
# is used rather than `numeric_only=True` so the cell also runs on older pandas.
corel = df.select_dtypes(include='number').corr()
corel

# Exploratory Data Analysis

# Correlation of Features

In [None]:
# Styling shared by both views of the correlation matrix.
corr_style = dict(cmap='plasma', annot=True, fmt='.2f')

# Annotated heatmap of the correlation matrix.
sns.heatmap(corel, **corr_style)
plt.show()

# Same matrix drawn as a cluster map, which reorders rows and columns by similarity.
sns.clustermap(corel, **corr_style)
plt.show()

# Selected numerical features

In [None]:
# Columns compared against each other in the pair plot.
pairplot_cols = ['Danceability', 'Energy', 'Loudness', 'Valence', 'Points (Total)']

sns.pairplot(data=df[pairplot_cols])
# y slightly above 1 lifts the figure title clear of the top row of panels.
plt.suptitle('Pairplot of Numerical Features', y=1.02)
plt.show()

# Distribution of numerical features

In [None]:
# Audio features whose distributions are plotted, one histogram panel each.
num_cols = [
    'Danceability', 'Energy', 'Loudness', 'Speechiness',
    'Acousticness', 'Instrumentalness', 'Valence',
]
df.hist(column=num_cols, bins=20, figsize=(12, 10))
# y slightly above 1 keeps the figure title from overlapping the panel titles.
plt.suptitle('Distribution of Numerical Features', y=1.02)
plt.show()

# Distribution of Rank

In [None]:
# Histogram of chart rank (50 bins) with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
rank_ax = sns.histplot(data=df, x='Rank', bins=50, kde=True)
rank_ax.set_title('Distribution of Rank')
rank_ax.set_xlabel('Rank')
rank_ax.set_ylabel('Frequency')
plt.show()

# Scatter plot between Energy and Danceability

In [None]:
# Energy (x) against Danceability (y); semi-transparent markers keep dense regions readable.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='Energy', y='Danceability', alpha=0.5, ax=ax)
ax.set(
    title='Scatter plot: Energy vs Danceability',
    xlabel='Energy',
    ylabel='Danceability',
)
plt.show()


# Boxplot of Points (Total) by Continent

In [None]:
# Spread of total points within each continent.
plt.figure(figsize=(10, 6))
points_ax = sns.boxplot(x='Continent', y='Points (Total)', data=df)
points_ax.set(
    title='Boxplot: Points (Total) by Continent',
    xlabel='Continent',
    ylabel='Points (Total)',
)
plt.xticks(rotation=45)
plt.show()

# Count plot of Nationalities

In [None]:
# Nationalities sorted from most to least frequent, used to order the bars.
nationality_order = df['Nationality'].value_counts().index

# Figure 1: number of rows per nationality.
plt.figure(figsize=(12, 6))
count_ax = sns.countplot(data=df, x='Nationality', order=nationality_order)
count_ax.set_title('Count plot: Nationalities')
count_ax.set_xlabel('Nationality')
count_ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.show()

# Figure 2: distribution of total points within each nationality.
plt.figure(figsize=(12, 6))
box_ax = sns.boxplot(data=df, x='Nationality', y='Points (Total)')
box_ax.set_title('Nationality Distribution and Points')
box_ax.set_xlabel('Nationality')
box_ax.set_ylabel('Total Points')
plt.xticks(rotation=90)  # vertical labels so the many nationalities stay legible
plt.show()

# Loudness with respect to Energy

In [None]:
# Scatter of Loudness (x-axis) against Energy (y-axis).
# The axis labels previously had the two names swapped, which mislabelled the
# plot; they now match the columns actually mapped to each axis.
sns.scatterplot(data=df, x='Loudness', y='Energy')
plt.title('Loudness vs Energy')
plt.xlabel('Loudness')
plt.ylabel('Energy')
plt.show()

# Violin plot of Valence by Nationality

In [None]:
# Only the five most frequent nationalities are drawn, in descending order of frequency.
top5_nationalities = df['Nationality'].value_counts().index[:5]

plt.figure(figsize=(12, 6))
violin_ax = sns.violinplot(data=df, x='Nationality', y='Valence', order=top5_nationalities)
violin_ax.set(
    title='Violin plot: Valence by Nationality',
    xlabel='Nationality',
    ylabel='Valence',
)
plt.xticks(rotation=45)
plt.show()

# Distribution of Speechiness

In [None]:
# Histogram of Speechiness (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='Speechiness', bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Speechiness')
ax.set_xlabel('Speechiness')
ax.set_ylabel('Frequency')
plt.show()

# Distribution of Danceability by Continent

In [None]:
# Spread of Danceability within each continent.
plt.figure(figsize=(10, 6))
dance_ax = sns.boxplot(x='Continent', y='Danceability', data=df)
dance_ax.set_title('Boxplot: Danceability by Continent')
dance_ax.set_xlabel('Continent')
dance_ax.set_ylabel('Danceability')
plt.xticks(rotation=45)
plt.show()

# Count plot of Artists

In [None]:
# Row counts for the 20 most frequent values of the Artists column.
# The title previously read "Top 10" although `index[:20]` selects twenty
# artists; it now states what the plot actually shows.
plt.figure(figsize=(12, 18))
sns.countplot(data=df, x='Artists', order=df['Artists'].value_counts().index[:20])
plt.title('Top 20 Artists by Count')
plt.xlabel('Artists')
plt.ylabel('Count')
plt.xticks(rotation=90)  # vertical labels so long artist names do not overlap
plt.show()

# Song Characteristics Distribution

In [None]:
song_characteristics = df[['Loudness', 'Speechiness', 'Valence']]

# (column, panel title) pairs, drawn in this order into a 2x2 grid.
# Only three of the four slots are used; the last one stays empty.
panels = [
    ('Loudness', 'Distribution of Loudness'),
    ('Speechiness', 'Distribution of Speechiness'),
    ('Valence', 'Distribution of Valence'),
]

plt.figure(figsize=(12, 8))
for slot, (feature, panel_title) in enumerate(panels, start=1):
    plt.subplot(2, 2, slot)
    # 20-bin histogram with a KDE curve overlaid.
    sns.histplot(data=song_characteristics, x=feature, bins=20, kde=True)
    plt.title(panel_title)

plt.tight_layout()
plt.show()

# Top 10 Artists by Count

In [None]:
# value_counts() sorts by frequency (descending), so the first ten entries are the top ten.
top_artists = df['Artists'].value_counts().iloc[:10]

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=top_artists.index, y=top_artists.values, ax=ax)
ax.set(title='Top 10 Artists by Count', xlabel='Artists', ylabel='Count')
plt.xticks(rotation=90)
plt.show()

# Artist Counts by Nationality

In [None]:
# Row counts per nationality, with bars coloured by continent.
plt.figure(figsize=(12, 6))
nat_ax = sns.countplot(x='Nationality', hue='Continent', data=df)
nat_ax.set_title('Artist Counts by Nationality and Continent')
nat_ax.set_xlabel('Nationality')
nat_ax.set_ylabel('Count')
plt.xticks(rotation=90)
plt.legend(title='Continent')
plt.show()


# Artist Counts by Points Range

In [None]:
# Bin edges and the matching labels for the points buckets.
# pd.cut uses right-closed intervals by default, so e.g. 500 lands in '0-500'.
points_edges = [0, 500, 1000, 1500, 2000, 2500]
points_labels = ['0-500', '501-1000', '1001-1500', '1501-2000', '2001-2500']

plt.figure(figsize=(10, 6))
# Adds a categorical 'Points Range' column to df.
df['Points Range'] = pd.cut(df['Points (Total)'], bins=points_edges, labels=points_labels)

range_ax = sns.countplot(data=df, x='Points Range', hue='Continent')
range_ax.set(
    title='Artist Counts by Points Range and Continent',
    xlabel='Points Range',
    ylabel='Count',
)
plt.legend(title='Continent')
plt.show()

# Artist Counts by Valence Range

In [None]:
plt.figure(figsize=(10, 6))
# Bucket Valence into five equal-width bands and store them in a new
# 'Valence Range' column on df.
# pd.cut intervals are right-closed by default, so without include_lowest=True
# a row with Valence exactly 0 falls outside the first bin, becomes NaN and
# silently disappears from the count plot.
df['Valence Range'] = pd.cut(
    df['Valence'],
    bins=[0, 0.2, 0.4, 0.6, 0.8, 1.0],
    labels=['0-0.2', '0.2-0.4', '0.4-0.6', '0.6-0.8', '0.8-1.0'],
    include_lowest=True,
)
sns.countplot(data=df, x='Valence Range', hue='Continent')
plt.title('Artist Counts by Valence Range and Continent')
plt.xlabel('Valence Range')
plt.ylabel('Count')
plt.legend(title='Continent')
plt.show()

# Distribution of Points (Total) by Continent

In [None]:
# Box plot of total points grouped by continent.
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Continent', y='Points (Total)', ax=ax)
ax.set_title('Boxplot: Points (Total) by Continent')
ax.set_xlabel('Continent')
ax.set_ylabel('Points (Total)')
plt.xticks(rotation=45)
plt.show()

# Distribution of Rank

In [None]:
# Histogram of chart rank, here with 30 bins, plus a KDE curve.
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(data=df, x='Rank', bins=30, kde=True, ax=ax)
ax.set(title='Distribution of Rank', xlabel='Rank', ylabel='Frequency')
plt.show()

# Song Characteristics Over Time 

In [None]:
# Convert the Date column to datetimes so it plots on a real time axis.
# NOTE(review): the raw date format is not visible here. If the strings are
# day-first (dd/mm/yyyy), confirm they parse as intended.
df['Date'] = pd.to_datetime(df['Date'])

# Features traced over time; extend this list to plot more.
time_characteristics_cols = ['Energy', 'Valence']

plt.figure(figsize=(12, 6))
# One line per feature on shared axes. Rows sharing a date are aggregated by
# seaborn's default estimator.
for feature in time_characteristics_cols:
    sns.lineplot(data=df, x='Date', y=feature, label=feature)

plt.title('Song Characteristics Over Time')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.xticks(rotation=45)  # slanted date labels to avoid overlap
plt.show()