In [19]:
import numpy as np 
import pandas as pd
import plotly.express as px

# This Notebook is an EDA for the Reasearch Question: <br> How did the Covid-19 pandemic affect global listening trends on Spotify?
## We read in our chart data and the COVID-19 data from the WHO, merge them together, and perform basic operations such as extracting the month and day from the date column or 'binning' the covid cases to visualize the data.

In [20]:
# Adding week/month from date
df = pd.read_csv('../data/raw_data/daily_global_charts/daily_combined/daily17-24.csv')
df["date"] = pd.to_datetime(df["date"])
df_grouped = df.groupby("date")["streams"].sum().reset_index()
df_grouped["month"] = df_grouped["date"].dt.month
df_grouped["month_name"] = df_grouped["date"].dt.strftime("%B")  
df_grouped["weekday"] = df_grouped["date"].dt.weekday
df_grouped["weekday_name"] = df_grouped["date"].dt.strftime("%A")  
df_grouped.head()

Unnamed: 0,date,streams,month,month_name,weekday,weekday_name
0,2017-01-01,148613167,1,January,6,Sunday
1,2017-01-02,154810836,1,January,0,Monday
2,2017-01-03,166239930,1,January,1,Tuesday
3,2017-01-04,169252507,1,January,2,Wednesday
4,2017-01-05,169919094,1,January,3,Thursday


In [21]:
# Grouping COVID-19 region data by date
covid = pd.read_csv('../data/WHO-COVID-19-global-daily-data.csv')
covid["Date_reported"] = pd.to_datetime(covid["Date_reported"])
covid_grouped = covid.groupby("Date_reported")["New_cases"].sum().reset_index()

In [22]:
# Merging the COVID-19 and chart data. / Creating bins for COVID-19 cases.
merged_df = pd.merge(covid_grouped, df_grouped, left_on="Date_reported", right_on="date", how="inner")
bin_edges = [-100000000000, 1000, 5000, 10000, 50000, 100000, 500000, float('inf')]
bin_labels = ["0-1000", "1000-5000", "5000-10000", "10000-50000", "50000-100000", "100000-500000", "500000+"]
merged_df['New_cases_binned'] = pd.cut(merged_df['New_cases'], bins=bin_edges, labels=bin_labels)

merged_df

Unnamed: 0,Date_reported,New_cases,date,streams,month,month_name,weekday,weekday_name,New_cases_binned
0,2020-01-04,3.0,2020-01-04,245851954,1,January,5,Saturday,0-1000
1,2020-01-05,-1.0,2020-01-05,226792570,1,January,6,Sunday,0-1000
2,2020-01-06,3.0,2020-01-06,248298082,1,January,0,Monday,0-1000
3,2020-01-07,0.0,2020-01-07,257397919,1,January,1,Tuesday,0-1000
4,2020-01-08,1.0,2020-01-08,259996737,1,January,2,Wednesday,0-1000
...,...,...,...,...,...,...,...,...,...
1819,2024-12-27,32.0,2024-12-27,370674168,12,December,4,Friday,0-1000
1820,2024-12-28,5.0,2024-12-28,363920830,12,December,5,Saturday,0-1000
1821,2024-12-29,971.0,2024-12-29,341091022,12,December,6,Sunday,0-1000
1822,2024-12-30,24253.0,2024-12-30,371292431,12,December,0,Monday,10000-50000


In [23]:
# Median daily streams per weekday / bar-plot
df_weekday = merged_df.groupby(["weekday", "weekday_name"])["streams"].median().reset_index()

df_weekday = df_weekday.sort_values(by="weekday")

fig = px.bar(df_weekday, 
             x="weekday_name", 
             y="streams", 
             title="Median Daily Streams Per Weekday", 
             labels={"weekday_name": "Weekday", "streams": "Median Streams"},
             text_auto=True,
             color="weekday_name",  
             color_discrete_sequence=px.colors.qualitative.Vivid 
)
fig.update_layout(
    template="plotly_dark",
    xaxis_title="Day of the Week",
    yaxis_title="Total Streams")

fig.show()

In [24]:
# Median daily streams per month / bar-plot
df_months = merged_df.groupby(["month", "month_name"])["streams"].median().reset_index()

df_months = df_months.sort_values(by="month")

fig = px.bar(df_months, x="month_name", y="streams", title="Median Daily Streams Per Month", 
             labels={"month_name": "Month", "streams": "Median Streams"}, text_auto=True,              color="month_name", 
             color_discrete_sequence=px.colors.qualitative.Vivid  )
fig.update_layout(template="plotly_dark")

fig.show()

In [25]:
# Streams distribution based on COVID-19 case intervalls / box-plot
fig = px.box(merged_df, 
             x="New_cases_binned", 
             y="streams", 
             title="Streams Distribution Based on COVID-19 Case Intervalls",
             labels={"New_cases_binned": "Intervalls of COVID-19 Cases", "streams": "Streams"},
             color="New_cases_binned",  
             color_discrete_sequence=px.colors.qualitative.Vivid,
             category_orders={"New_cases_binned": bin_labels})

fig.update_layout(
    xaxis_tickangle=-45,  
    template="plotly_dark", 
)

fig.show()


In [26]:
#Streams with weekly streams and COVID cases / scatter-plot
merged_df['Date_reported'] = pd.to_datetime(merged_df['Date_reported'])
merged_df['streams_7day_avg'] = merged_df['streams'].rolling(window=7).mean()

merged_df['Week'] = merged_df['Date_reported'].dt.to_period('W').dt.start_time

merged_df = merged_df[merged_df['New_cases'] >= 0]

merged_df['size_scaled'] = merged_df['New_cases'] + 500000  

fig = px.scatter(merged_df, 
                 x='Week', 
                 y='streams_7day_avg', 
                 size='size_scaled',  
                 color='New_cases', 
                 title="Streams With Weekly Streams and COVID Cases",
                 labels={"streams_7day_avg": "Weekly Average of Streams", "Week": "Week", "New_cases": "COVID Cases"},
                 color_continuous_scale=px.colors.sequential.YlOrRd)


fig.update_layout(
    template="plotly_dark",  
    xaxis_title="Week",
    yaxis_title="Weekly Average of Streams",
    legend_title="COVID cases",
    coloraxis=dict(cmin=0, cmax=500000) 
)

fig.show()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

