In [None]:
import os
import pandas as pd
import spotipy
import billboard
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.cache_handler import CacheFileHandler

# Spotify Web API client. The app credentials are read from the environment
# and passed to SpotifyClientCredentials together with a CacheFileHandler.
spotify = spotipy.Spotify(
    client_credentials_manager=SpotifyClientCredentials(
        client_id=os.environ['SPOTIFY_CLIENT_ID'],
        client_secret=os.environ['SPOTIFY_CLIENT_SECRET'],
        cache_handler=CacheFileHandler(username='keatonconrad'),
    )
)

In [None]:
import lyricsgenius
# Genius API client; the access token is read from the environment.
# Console output is disabled and section headers are stripped from lyrics.
genius = lyricsgenius.Genius(
    os.environ['GENIUS_TOKEN'],
    verbose=False,
    remove_section_headers=True,
)

In [None]:
# Walk the Billboard Hot 100 backwards in time, resolve every charted song to a
# Spotify track URI via a text search, and record its peak chart position.
MAX_WEEKS = 100  # how many weekly charts to walk back through

chart = billboard.ChartData('hot-100')

# Keys ("<title> <artist>") that have already been searched. A set keeps the
# membership test O(1) instead of scanning an ever-growing list.
tracks_scraped = set()
track_positions = {}  # Spotify track URI -> peak chart position
weeks = 0

while chart.previousDate and weeks < MAX_WEEKS:
    for song in chart:
        song_key = song.title + ' ' + song.artist
        if song_key in tracks_scraped:
            continue

        # Strip collaboration markers from the artist credit before searching.
        artist = song.artist.replace('Featuring ', '').replace('feat. ', '').replace('feat ', '').replace('& ', '')
        search_results = spotify.search(q=song.title + ' ' + artist, limit=1, type='track', market='US')
        tracks_scraped.add(song_key)

        items = search_results['tracks']['items']
        if not items:  # If search didn't return anything
            print(search_results)
            continue

        uri = items[0]['uri']
        peak = int(song.peakPos)
        # Different chart entries may resolve to the same track URI; keep the
        # best (lowest) peak instead of letting an older chart overwrite it.
        track_positions[uri] = min(peak, track_positions.get(uri, peak))

    chart = billboard.ChartData('hot-100', chart.previousDate)
    weeks += 1

len(track_positions)

In [None]:
# Fetch Spotify audio features for every resolved track, in the same order as
# track_positions so the rows line up with the chart positions added later.
AUDIO_FEATURES_BATCH_SIZE = 100  # spotipy documents a maximum of 100 ids per call

track_uris = list(track_positions)
features = []
for start in range(0, len(track_uris), AUDIO_FEATURES_BATCH_SIZE):
    batch = track_uris[start:start + AUDIO_FEATURES_BATCH_SIZE]
    # One request per batch instead of one request per track.
    batch_features = spotify.audio_features(tracks=batch)
    # A missing entry (None) becomes an empty record so that the DataFrame
    # built from this list keeps one row per track (filled with NaN).
    features.extend(entry or {} for entry in batch_features)

In [None]:
# Build the per-track table from the audio-feature records and show which
# columns came back before trimming them.
full_df = pd.DataFrame(features)
print(full_df.columns)

# Identifier / link columns that carry no audio information.
identifier_columns = ['type', 'id', 'uri', 'track_href', 'analysis_url']
full_df = full_df.drop(columns=identifier_columns)

# `features` was built by iterating track_positions, so the values are in row order.
full_df['chart_position'] = track_positions.values()

In [None]:
# Attach the lyric sentiment results to the feature table.
# NOTE(review): `polarity`, `subjectivity` and `lyrics` are only defined by the
# lyrics/sentiment cell further down the notebook, so this cell fails on a
# fresh kernel unless that cell has been run first.
sentiment_columns = (
    ('polarity', polarity),
    ('subjectivity', subjectivity),
    ('lyrics', lyrics),
)
for column_name, column_values in sentiment_columns:
    full_df[column_name] = column_values

full_df.head()

In [None]:
# Songs whose peak chart position is 10 or better, with summary statistics.
in_top_10 = full_df['chart_position'].le(10)
top_10_df = full_df[in_top_10]
top_10_df.describe()

In [None]:
# Songs whose peak chart position is worse than 10, with summary statistics.
below_top_10 = full_df['chart_position'].gt(10)
bottom_90_df = full_df[below_top_10]
bottom_90_df.describe()

In [None]:
# Independent two-sample t-test of every numeric column: top-10 songs vs. the rest.
from scipy.stats import ttest_ind  # not imported by any cell visible above

# Only numeric columns can be cast to float; text columns such as 'lyrics'
# would make astype(float) raise ValueError, so they are skipped.
numeric_columns = full_df.select_dtypes(include='number').columns

for column in numeric_columns:
    t = ttest_ind(
        top_10_df[column].astype(float),
        bottom_90_df[column].astype(float),
        # Sentiment columns hold NaN for songs whose lyrics lookup failed;
        # omit those rows instead of letting them turn the result into nan.
        nan_policy='omit',
    )
    print(column + ' - T: ' + str(t[0]) + ', p: ' + str(t[1]))

In [None]:
import re
from textblob import TextBlob

# Look up each song's lyrics on Genius and score them with TextBlob.
# The three lists stay aligned with the rows of full_df: a row whose lookup
# fails gets None in all three.

# NOTE(review): the cells visible in this notebook build full_df from Spotify
# audio features only, which gives it no 'song' / 'artist' columns. Fail fast
# with a clear message instead of silently recording None for every row.
missing_columns = [name for name in ('song', 'artist') if name not in full_df.columns]
if missing_columns:
    raise KeyError('full_df is missing column(s) required for the Genius lookup: {}'.format(missing_columns))

polarity = []
subjectivity = []
lyrics = []

for row_index, row in full_df.iterrows():
    song_lyrics = None
    sentiment = None
    try:
        genius_song = genius.search_song(row['song'], row['artist'])
        if genius_song is not None:  # search_song yields None when nothing matches
            # Drop "[Chorus]"-style header lines, then flatten to a single line.
            song_lyrics = re.sub(r'\[.*?\]\n', '', genius_song.lyrics).replace('\n', ' ')
            sentiment = TextBlob(song_lyrics).sentiment
    except Exception as exc:
        # Best effort: one failed lookup must not abort the whole run, but it
        # should be visible. (A bare `except:` would also hide KeyboardInterrupt.)
        print('Lyrics lookup failed for row {}: {!r}'.format(row_index, exc))
        song_lyrics = None
        sentiment = None

    # sentiment is a (polarity, subjectivity) pair when the lookup succeeded.
    polarity.append(sentiment[0] if sentiment is not None else None)
    subjectivity.append(sentiment[1] if sentiment is not None else None)
    lyrics.append(song_lyrics)

full_df['polarity'] = polarity
full_df['subjectivity'] = subjectivity
full_df['lyrics'] = lyrics

In [None]:
import re
from textblob import TextBlob

# One-off sanity check of the lyrics -> sentiment pipeline on a single song.
# Uses its own variable names so it does not overwrite the `lyrics` list that
# other cells assign to full_df['lyrics'].
sample_song = genius.search_song('Nicotine', 'Keaton Conrad')

if sample_song is None:  # nothing matched on Genius
    print('No Genius result for the sample song')
else:
    # Drop "[Chorus]"-style header lines before scoring.
    sample_lyrics = re.sub(r'\[.*?\]\n', '', sample_song.lyrics)
    sample_blob = TextBlob(sample_lyrics)
    print(sample_blob.sentiment)