In [1]:
# import packages
import pandas as pd
import numpy as np

In [11]:
# Load the two processed CSVs: Spotify track data first, Billboard chart data second.
spotify, billboard = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/spotify_num_one_track_data.csv",
        "../data/processed/billboard_charts_num_one.csv",
    )
)

## Are songs shorter now than they used to be?

We are going to explore the above question in our demo!

In [12]:
## Attach the Spotify columns to the Billboard rows.
## No `on=` is passed, so pandas joins on whichever of these columns the two
## frames share, and the default join type is inner (unmatched rows are dropped).
spotify_cols = ["title", "performer", "duration_ms", "popularity"]
billboard = pd.merge(billboard, spotify[spotify_cols])

In [16]:
## add a year column (kept as a four-character string taken from the start of chart_week)
## note: technically data doesn't always line up with chart year... going to just go by chart year
## Raw strings keep `\d` from being parsed as an (invalid) string escape, and
## expand=False returns a Series, matching how the decade column is built.
billboard["year"] = billboard["chart_week"].str.extract(r"^(\d{4})", expand=False)

## also add decade: first three digits of the year plus "0s", e.g. "1990s"
billboard["decade"] = billboard["chart_week"].str.extract(r"^(\d{3})", expand=False) + "0s"

In [53]:
## Median duration (ms) per decade. Rows are not de-duplicated here, so a song
## that appears in several rows contributes to the median several times.
by_decade = billboard.groupby("decade", as_index=False)
decade_group = by_decade["duration_ms"].median()

In [54]:
# Convert the median from milliseconds to minutes, plus a minutes/seconds split.
decade_group["median_min"] = decade_group["duration_ms"] / 1000 / 60

# Round to whole seconds *before* splitting. Rounding the leftover fraction of a
# minute instead can produce 60 seconds (e.g. 239.6 s -> 3 min 60 s, not 4 min 0 s).
total_sec = (decade_group["duration_ms"] / 1000).round()
decade_group["median_min_part"] = total_sec // 60
decade_group["median_sec_part"] = total_sec % 60

In [55]:
# Display the per-decade median durations (ms, minutes, and minute/second parts).
decade_group

Unnamed: 0,decade,duration_ms,median_min,median_min_part,median_sec_part
0,1960s,163280.0,2.721333,2.0,43.0
1,1970s,225666.0,3.7611,3.0,46.0
2,1980s,249606.5,4.160108,4.0,10.0
3,1990s,259853.0,4.330883,4.0,20.0
4,2000s,235213.0,3.920217,3.0,55.0
5,2010s,220734.0,3.6789,3.0,41.0
6,2020s,194735.0,3.245583,3.0,15.0


Say we only want each song to count once per decade, no matter how many chart rows it has...

In [56]:
# Keep one row per distinct (decade, duration_ms, title, performer) combination
# first, so repeated chart rows for the same song count once within a decade.
songs_by_decade = billboard[["decade", "duration_ms", "title", "performer"]].drop_duplicates()
decade_group_unique = songs_by_decade.groupby("decade", as_index=False)["duration_ms"].median()

In [70]:
# Convert the median from milliseconds to minutes, plus a minutes/seconds split.
decade_group_unique["median_min"] = decade_group_unique["duration_ms"] / 1000 / 60

# Round to whole seconds *before* splitting. Rounding the leftover fraction of a
# minute instead can produce 60 seconds (e.g. 239.6 s -> 3 min 60 s, not 4 min 0 s).
total_sec = (decade_group_unique["duration_ms"] / 1000).round()
decade_group_unique["median_min_part"] = total_sec // 60
decade_group_unique["median_sec_part"] = total_sec % 60

# Human-readable "m:ss" label; seconds are zero-padded so 4 min 5 s reads "4:05"
# rather than "4:5".
decade_group_unique["display"] = (
    decade_group_unique["median_min_part"].astype(int).astype(str)
    + ":"
    + decade_group_unique["median_sec_part"].astype(int).astype(str).str.zfill(2)
)

In [71]:
# Display the de-duplicated per-decade medians, including the "m:ss" display label.
decade_group_unique

Unnamed: 0,decade,duration_ms,median_min,median_min_part,median_sec_part,display
0,1960s,163280.0,2.721333,2.0,43.0,2:43
1,1970s,221373.0,3.68955,3.0,41.0,3:41
2,1980s,253402.5,4.223375,4.0,13.0,4:13
3,1990s,262826.0,4.380433,4.0,23.0,4:23
4,2000s,233926.0,3.898767,3.0,54.0,3:54
5,2010s,217795.5,3.629925,3.0,38.0,3:38
6,2020s,200645.0,3.344083,3.0,21.0,3:21


Is the trend toward shorter songs even more pronounced in recent years?

In [65]:
# Median duration (ms) per chart year, after keeping one row per distinct
# (year, duration_ms, title, performer) combination.
year_group_unique = (billboard[["year", "duration_ms", "title", "performer"]].drop_duplicates()
                        .groupby("year", as_index=False)
                        ["duration_ms"].median())

year_group_unique["median_min"] = year_group_unique["duration_ms"] / 1000 / 60

# Round to whole seconds *before* splitting. Rounding the leftover fraction of a
# minute instead can produce 60 seconds (e.g. 239.6 s -> 3 min 60 s, not 4 min 0 s).
total_sec = (year_group_unique["duration_ms"] / 1000).round()
year_group_unique["median_min_part"] = total_sec // 60
year_group_unique["median_sec_part"] = total_sec % 60

In [66]:
# Show 2010 onward. The bound is the string "2010", so this is a text comparison.
is_2010_onward = year_group_unique["year"].ge("2010")
year_group_unique.loc[is_2010_onward]

Unnamed: 0,year,duration_ms,median_min,median_min_part,median_sec_part
50,2010,227741.0,3.795683,3.0,48.0
51,2011,228128.5,3.802142,3.0,48.0
52,2012,224653.0,3.744217,3.0,45.0
53,2013,223546.0,3.725767,3.0,44.0
54,2014,228333.0,3.80555,3.0,48.0
55,2015,213520.0,3.558667,3.0,34.0
56,2016,221700.0,3.695,3.0,42.0
57,2017,215424.5,3.590408,3.0,35.0
58,2018,209033.0,3.483883,3.0,29.0
59,2019,192443.0,3.207383,3.0,12.0


In [72]:
# Export the findings. The yearly file is named for 2010 onward, so write only
# those rows rather than every year; the bound is compared as text, the same
# way the preview cell filters on "year".
recent_years = year_group_unique[year_group_unique["year"] >= "2010"]
recent_years.to_csv("../data/findings/year_2010_2025.csv", index=False)
decade_group_unique.to_csv("../data/findings/decade.csv", index=False)