In [None]:
import pandas as pd
import json
import matplotlib.pyplot as plt
%matplotlib inline
# Notebook-wide default figure size as [width, height] (matplotlib measures this in inches).
plt.rcParams["figure.figsize"] = [10, 5]

In [None]:
# Load the dataset. Presumably one row per song (the first print below treats
# len(df) as the song count); 'date' is parsed into datetimes on read.
df = pd.read_csv('chordify_with_rn_200_1991-01-05_to_2022-10-01.csv', parse_dates=['date'])
df['year'] = df['date'].dt.year  # Add year
# 'roman_numerals' column is a JSON string. Turn the values into lists.
df['roman_numerals'] = df['roman_numerals'].apply(json.loads)
print('Total songs: ', len(df))
# Keep only songs whose chord list is non-empty and contains no `None`
# (`None` is the value when no RN could be found for the chord).
# The non-empty check matters because `explode` turns an empty list into a
# single NaN row, which would be counted as a chord instance and would make
# sorting the unique chords fail on mixed float/str values. `bool(...)` also
# rejects a bare JSON `null`, which `None not in x` would crash on.
df = df[df['roman_numerals'].apply(lambda rns: bool(rns) and None not in rns)]
print('Songs with all roman numerals resolved (used for analysis): ', len(df))
# Create our final main dataframe with one row per-chord-per-song.
# The original row index is kept, so all chords of one song share an index label.
df = df.explode('roman_numerals')

In [None]:
# Summary statistics over the one-row-per-chord frame.
# A song is identified by its (artist, song) pair.
chords_by_song = df.groupby(['artist', 'song'])['roman_numerals']
distinct_chords = df['roman_numerals'].unique()

print('Total number of songs: {:,}'.format(chords_by_song.ngroups))
print('Total number of chord instances: {:,}'.format(len(df)))
print('Average chord instances per song: {:.3f}'.format(chords_by_song.count().mean()))
print('Average unique chords per song: {:.3f}'.format(chords_by_song.nunique().mean()))
# List every distinct roman numeral chord present in the dataset, alphabetically.
print('{} unique chords:'.format(len(distinct_chords)))
print(', '.join(sorted(distinct_chords)))

In [None]:
# Histogram of how many distinct chords each (artist, song) pair uses.
unique_chords_per_song = df.groupby(['artist', 'song'])['roman_numerals'].nunique()
ax = unique_chords_per_song.hist()
_ = ax.set(
    title='Distribution of unique chord count',
    xlabel='Number of unique chords',
    ylabel='Number of songs',
)

In [None]:
# Bar chart of how often each roman numeral chord occurs across the full dataset
# (value_counts orders the bars from most to least frequent).
chord_counts = df['roman_numerals'].value_counts()
_ = chord_counts.plot.bar(title='Chord counts by type', width=0.9)

In [None]:
# Bar chart of chord occurrence counts, restricted to the 10 most frequent chords.
top_chord_counts = df['roman_numerals'].value_counts().nlargest(10)
_ = top_chord_counts.plot.bar(title='Chord counts by type (Top 10)', width=0.9)

In [None]:
# Count every chord within each year, then keep each year's six most frequent.
# Grouped value_counts sorts each year's counts in descending order by default,
# so head(6) per year yields that year's top six. unstack() pivots chords into
# columns (NaN where a chord is not in a given year's top six).
top_chords_per_year = (
    df.groupby('year')['roman_numerals']
    .value_counts()
    .groupby('year')
    .head(6)
    .unstack()
)
ax = top_chords_per_year.plot.bar(
    stacked=True, title='Six most popular chords for each year', ylabel='Chord count'
)
# Move legend outside plot: anchor the legend's upper-left corner to the axes'
# upper-right corner. Without an explicit `loc`, the default 'best' decides which
# corner touches the anchor based on the data, so the legend could land inside.
_ = ax.legend(loc='upper left', bbox_to_anchor=(1, 1))