# **Data extraction**

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import style
from sklearn import linear_model
from statistics import mean


# Relative path of the CSV file that holds the music data set.
csv_path = "../input/music-data-set/Music Data Set (1).csv"
data_frame = pd.read_csv(filepath_or_buffer=csv_path)
# Display the first five rows as a quick look at the loaded table.
data_frame.head(5)

I'll start by analyzing the data, beginning with a check for missing values.

In [2]:
# Count the missing entries in every column, then report only the columns
# that actually contain at least one missing value.
missing_val_count_by_column = data_frame.isna().sum()
has_missing = missing_val_count_by_column > 0
print(missing_val_count_by_column[has_missing])

There are many missing values for several attributes (song.year, song.hotttnesss, etc.), which might affect the best-fit line.

In [3]:
# Song attributes to include in the summary table.
summary_columns = [
    'song.hotttnesss',
    'song.duration',
    'song.end_of_fade_in',
    'song.loudness',
    'song.start_of_fade_out',
    'song.tempo',
]
sig = data_frame[summary_columns]
# Descriptive statistics for each selected column.
sig.describe()

Summary statistics for the selected song attributes.

In [4]:
# Split the attributes of interest into one single-column DataFrame each.
# Indexing with a one-element list keeps every result a DataFrame.
df1, df2, df3, df4, df5, df6 = (
    data_frame[[column]]
    for column in (
        'song.hotttnesss',
        'song.duration',
        'song.end_of_fade_in',
        'song.loudness',
        'song.start_of_fade_out',
        'song.tempo',
    )
)

In [5]:
%matplotlib inline
plt.xlabel('Song Duration') #in seconds
plt.ylabel('Popularity') #scale from 0-1
plt.scatter(df2,df1,color='blue',marker='+')

There is no apparent correlation between popularity and song duration. In addition, these two variables lead to an error because their DataFrames contain many NaN (null) values.

# **Song loudness and Tempo**

In [6]:
# Tempo (in BPM) plotted against loudness (decibels).
plt.ylabel('Tempo')
plt.xlabel('Loudness')
plt.scatter(x=df4, y=df6)

Fitting the data means training the linear regression model on the data points in df4 (loudness) and df6 (tempo).

In [7]:
# Pull the loudness and tempo columns out as plain Python lists.
song_loudness = df4.loc[:, 'song.loudness'].tolist()
song_tempo = df6.loc[:, 'song.tempo'].tolist()

# Convert both lists to float64 arrays: xs is the predictor (loudness),
# ys is the response (tempo).
xs, ys = (
    np.array(values, dtype=np.float64)
    for values in (song_loudness, song_tempo)
)

def best_fit_line(xs, ys):
    """Fit ``y = slope * x + y_intercept`` by ordinary least squares.

    Parameters
    ----------
    xs, ys : array-like of float
        Paired observations of the predictor and the response. Both must
        have the same shape. Pairs in which either value is NaN or
        infinite are ignored so that a few missing rows do not turn the
        whole fit into NaN.

    Returns
    -------
    slope, y_intercept : numpy.float64
        Coefficients of the fitted line. If every usable x value is the
        same, the slope is undefined and both results are NaN.

    Raises
    ------
    ValueError
        If ``xs`` and ``ys`` differ in shape, or if no pair remains after
        discarding non-finite values.
    """
    xs = np.asarray(xs, dtype=np.float64)
    ys = np.asarray(ys, dtype=np.float64)
    if xs.shape != ys.shape:
        raise ValueError("xs and ys must have the same shape")

    # Keep only the pairs where both coordinates are usable numbers.
    usable = np.isfinite(xs) & np.isfinite(ys)
    xs, ys = xs[usable], ys[usable]
    if xs.size == 0:
        raise ValueError("best_fit_line needs at least one finite (x, y) pair")

    x_mean = xs.mean()
    y_mean = ys.mean()
    # Centred form of the least-squares slope: algebraically equal to
    # (mean(x)mean(y) - mean(xy)) / (mean(x)^2 - mean(x^2)) but far less
    # prone to cancellation error, and fully vectorised.
    x_centred = xs - x_mean
    slope = np.sum(x_centred * (ys - y_mean)) / np.sum(x_centred * x_centred)
    y_intercept = y_mean - slope * x_mean
    return slope, y_intercept

# Fit the line to the loudness (xs) and tempo (ys) arrays.
slope, y_intercept = best_fit_line(xs, ys)
# Fitted tempo for every observed loudness value, kept as a list for plotting.
regression_line = list(slope * xs + y_intercept)

## Making Predictions

# Evaluate the fitted line at a loudness of -10 to get the predicted tempo.
average_decibel = -10
average_tempo = y_intercept + slope * average_decibel

In [8]:
# Plot outputs

# Matplotlib 3.6 renamed its bundled seaborn style sheets to 'seaborn-v0_8*'
# and later releases removed the old names, so use whichever name this
# installation provides instead of hard-coding 'seaborn'.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in style.available else 'seaborn'
style.use(seaborn_style)

# Observed songs, the single prediction at average_decibel, and the fitted line.
plt.scatter(xs,ys, label='Data Points', alpha=0.6, color='green',s=75)
plt.scatter(average_decibel, average_tempo, label = 'loudness-tempo prediction', color='red',s=100)
plt.plot(xs,regression_line, label = 'Best Fit Line', color='orange', linewidth=4)
plt.title('Loudness and Tempo linear regression')
plt.xlabel('Loudness (Decibel)')
plt.ylabel('Tempo (BPM)')
plt.legend()
plt.show()

# Predicted tempo at a loudness of average_decibel.
print(average_tempo)

The red point shows that a song with a loudness of -10 dB is predicted to have a tempo of roughly 123.5 BPM on average.