In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<h1 align="center">Spotify Data Analysis using Python - Project</h1>

<h3 align="right">By - <a>K.Sai Saranya</a></h3>

## INTRODUCTION

Spotify is a Swedish audio streaming and media services provider founded in April 2006. It is the world's largest music streaming service provider and has over 381 million monthly active users, which also includes 172 million paid subscribers.

Here, we'll explore and quantify data about music and draw valuable insights from it.

Perform an exploratory data analysis (EDA) and data visualization project using data from Spotify using Python.

Data analysis exploring the relationship between the audio features of a song and how positive or negative its lyrics are, involving sentiment analysis.

Includes a data collection script that scrapes audio feature data from the Spotify API, as well as lyrical data from the LyricWiki API.

Spotify Data Analysis makes use of secondary data from Spotify. Learners will use data to identify patterns and relationships between different characteristics. The activity will support learners in developing their ability to review and interpret a dataset. The activity starts by encouraging learners to think of questions that the dataset might answer and thus develop their “Problem” skills. Once the dataset has been analysed learners will have the opportunity to reorganise and restructure data to help them answer their questions

## Importing Datasets

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load both CSV files from the Kaggle input directory:
# tracks.csv is bound to `tracks`, SpotifyFeatures.csv is bound to `genre`.
tracks = pd.read_csv(filepath_or_buffer='../input/spotify-datasets/tracks.csv')
genre = pd.read_csv(filepath_or_buffer='../input/ultimate-spotify-tracks-db/SpotifyFeatures.csv')

In [None]:
# Preview the first five rows of the tracks table.
tracks.head(n=5)

In [None]:
# Preview the first five rows of the SpotifyFeatures table.
genre.head(n=5)

## Spotify Tracks Dataset Analysis

In [None]:
# Count the missing values in each column of the tracks table.
tracks.isna().sum()

In [None]:
# Print the column names, non-null counts, dtypes and memory usage of the tracks table.
tracks.info()

### Descriptive statistics of Spotify tracks

In [None]:
# Summary statistics for the numeric columns, transposed so each feature is a row.
summary_stats = tracks.describe()
summary_stats.T

### Top 10 most popular songs on Spotify

In [None]:
# Rank every track by popularity and keep the ten highest.
# Sorting the whole table (instead of pre-filtering on a hard-coded
# `popularity > 90` cut-off) guarantees ten rows are shown even when fewer
# than ten tracks clear that threshold.
most = tracks.sort_values('popularity', ascending=False).head(10)
most

### Top 10 least popular songs on Spotify

In [None]:
# Ten lowest-popularity tracks: sort ascending, then keep the first ten rows.
least = tracks.sort_values(by='popularity')[:10]
least

### Changing index to release date

In [None]:
# Use the release date as a DatetimeIndex.
# Both steps are guarded so the cell can be re-run safely: the original in-place
# `set_index` raised KeyError on a second run because the `release_date` column
# had already been moved into the index.
if 'release_date' in tracks.columns:
    tracks = tracks.set_index('release_date')
if not isinstance(tracks.index, pd.DatetimeIndex):
    # NOTE(review): if release_date mixes 'YYYY' and 'YYYY-MM-DD' strings,
    # pandas >= 2.0 may need format='mixed' here — confirm against the data.
    tracks.index = pd.to_datetime(tracks.index)
tracks.head()

### Check artist at row position 18 (zero-based, i.e. the 19th row) of Spotify Tracks Dataset

In [None]:
# Positional lookup of the `artists` column; iloc is zero-based, so 18 selects the 19th row.
artists_only = tracks.loc[:, ['artists']]
artists_only.iloc[18]

### Converting duration from millisecond to second on Spotify Tracks Dataset

In [None]:
# Convert track length from milliseconds to whole seconds (rounded to nearest).
# Vectorised arithmetic replaces the row-by-row Python `apply`, and the guard
# makes the cell re-runnable: the original raised KeyError on a second run
# because `duration_ms` had already been dropped.
if 'duration_ms' in tracks.columns:
    tracks['duration'] = (tracks['duration_ms'] / 1000).round().astype('int64')
    tracks = tracks.drop(columns='duration_ms')
tracks.duration.head()

### Correlation Heatmap between Variables using the Pearson correlation method

In [None]:
# Pearson correlation matrix of the tracks table with `key`, `mode` and
# `explicit` removed. Only numeric/bool columns are kept before calling corr():
# pandas >= 2.0 raises on text columns instead of silently skipping them.
numeric_features = (
    tracks
    .drop(columns=['key', 'mode', 'explicit'])
    .select_dtypes(include=['number', 'bool'])
)
cm = numeric_features.corr(method='pearson')

fig, ax = plt.subplots(figsize=(14, 6))
# Named `heatmap_ax` rather than `map` so the built-in `map` is not shadowed
# for the rest of the notebook.
heatmap_ax = sns.heatmap(cm, annot=True, fmt='.1g', vmin=-1, vmax=1, center=0,
                         cmap='inferno', linewidths=1, linecolor='Black', ax=ax)
heatmap_ax.set_title('Correlation Heatmap between Variable')
# Trailing semicolon suppresses the list-of-Text repr in the cell output.
heatmap_ax.set_xticklabels(heatmap_ax.get_xticklabels(), rotation=90);

### Considering 0.4% of the total dataset to create Regression plots

In [None]:
# Draw a 0.4% random sample of the tracks for the regression plots below.
# A fixed `random_state` makes the sample, and every plot built from it,
# identical on each Restart & Run All; the original sample changed on every run.
sam = tracks.sample(n=int(0.004 * len(tracks)), random_state=42)
len(sam)

### Regression plot - Correlation between Loudness and Energy

In [None]:
# Energy (x) against loudness (y) on the sampled tracks, with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(x='energy', y='loudness', data=sam, color='c', ax=ax)
ax.set(title='Loudness vs Energy Correlation')

### Regression plot - Correlation between Popularity and Acousticness

In [None]:
# Acousticness (x) against popularity (y) on the sampled tracks, with a fitted regression line.
fig, ax = plt.subplots(figsize=(10, 6))
sns.regplot(x='acousticness', y='popularity', data=sam, color='b', ax=ax)
ax.set(title='Popularity vs Acousticness Correlation')

### Creating new column in tracks dataset (Year, Release Date)

In [None]:
# Copy the `release_date` index into a regular `dates` column (as datetimes)
# and derive the release year of every track from it.
release_index = tracks.index.get_level_values('release_date')
tracks['dates'] = pd.to_datetime(release_index)
years = tracks['dates'].dt.year

In [None]:
# Preview the first five rows of the tracks table in its current state.
tracks.head(n=5)

### Distribution plot - Visualize total number of songs on Spotify since 1992

In [None]:
# Histogram of release years, one bin per year (discrete=True).
year_hist = sns.displot(data=years, kind='hist', discrete=True, height=5, aspect=2)
year_hist.set(title='Number of songs per year')

### Change in Duration of songs with respect to Years

In [None]:
# Bar chart of track duration per release year (barplot aggregates each year
# with its default estimator, the mean).
total_dr = tracks.duration
fig_dims = (18, 7)
fig, ax = plt.subplots(figsize=fig_dims)
# `errwidth=0` hides the error-bar lines; the original passed the bool `False`
# where a numeric line width is expected.
sns.barplot(x=years, y=total_dr, ax=ax, errwidth=0)
# Title is set on the axes so `fig` stays bound to the Figure; the original
# rebound `fig` to the list returned by `.set()`.
ax.set(title='Years vs Duration')
# Trailing semicolon suppresses the (locations, labels) repr in the cell output.
plt.xticks(rotation=90);

## Spotify Features Dataset Analysis

### Duration of songs in different Genres

In [None]:
# Horizontal bar chart of `duration_ms` per genre from the SpotifyFeatures table.
# The original also called sns.color_palette('rocket', as_cmap=True) and threw
# the result away; that call only returns a colormap object and never affected
# the plot, so it is removed.
# An explicit figure/axes pair keeps the title and labels attached to this
# chart instead of whatever figure pyplot currently considers active.
fig, ax = plt.subplots()
sns.barplot(y='genre', x='duration_ms', data=genre, ax=ax)
ax.set_title('Duration of songs in different Genres')
ax.set_xlabel('Duration in milliseconds')
# Trailing semicolon suppresses the Text repr in the cell output.
ax.set_ylabel('Genres');

### Top 5 Genres by Popularity

In [None]:
# Rank genres by their mean popularity and plot the five highest.
# The original took the 10 most popular individual tracks and plotted whichever
# genres those rows happened to carry, which neither ranks genres nor matches
# the "Top 5" title.
sns.set_style(style='darkgrid')
fig, ax = plt.subplots(figsize=(10, 5))
popular = (
    genre
    .groupby('genre', as_index=False)['popularity']
    .mean()
    .sort_values('popularity', ascending=False)
    .head(5)
)
sns.barplot(y='genre', x='popularity', data=popular, ax=ax)
ax.set(title='Top 5 Genres by Popularity', xlabel='Mean popularity', ylabel='Genre')