In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import psycopg2
from sqlalchemy import create_engine
import re
from config import db_pwd

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
# Import Spotify dataset
spotify_df = pd.read_csv('../Resources/Spotify_Data_1921-2020.csv')
spotify_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,['Carl Woitschach'],0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"['Robert Schumann', 'Vladimir Horowitz']",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,['Seweryn Goszczyński'],0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,['Francisco Canaro'],0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"['Frédéric Chopin', 'Vladimir Horowitz']",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [3]:
# Row count
len(spotify_df)

169909

In [4]:
# Test of artist formatting
test = "['Robert Schumann', 'Vladimir Horowitz']"
test_cleaner = test.replace("'", "").strip("[").strip("\]")
test_cleaner

'Robert Schumann, Vladimir Horowitz'

In [5]:
# Apply formatting to artists column
spotify_df['artists'] = spotify_df['artists'].str.replace("'", "").str.replace("[", "").str.replace("]", "")
spotify_df.head()

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year
0,0.995,Carl Woitschach,0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928
1,0.994,"Robert Schumann, Vladimir Horowitz",0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928
2,0.604,Seweryn Goszczyński,0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928
3,0.995,Francisco Canaro,0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928
4,0.99,"Frédéric Chopin, Vladimir Horowitz",0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928


In [6]:
# Concatenate artist column by comma
spotify_df=pd.concat([spotify_df,spotify_df.artists.str.split(',',expand=True)],1)

# keep only the top 3 artists
spotify_df = spotify_df.drop(spotify_df.columns[22:59], axis=1)

#rename the columns
spotify_df = spotify_df.rename(columns={0: 'artist', 1: 'artist_2', 2: 'artist_3'})

# drop columns artists, artist_2, artist_3
spotify_df=spotify_df.drop(columns=["artists", "artist_2", "artist_3"], axis=1)

spotify_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,artist
0,0.995,0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,1928,0.0506,118.469,0.779,1928,Carl Woitschach
1,0.994,0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,1928,0.0462,83.972,0.0767,1928,Robert Schumann
2,0.604,0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,1928,0.929,107.177,0.88,1928,Seweryn Goszczyński
3,0.995,0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,1928-09-25,0.0926,108.003,0.72,1928,Francisco Canaro
4,0.99,0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,1928,0.0424,62.149,0.0693,1928,Frédéric Chopin


In [7]:
# drop unnecessary column "release_date"
spotify_df=spotify_df.drop(columns=["release_date"], axis=1)

In [8]:
# Import Genre dataset
genre_df = pd.read_csv('../Resources/data_w_genres.csv')
genre_df = genre_df[['artists', 'genres']]
genre_df = genre_df.rename(columns={'artists': 'genre_artists'})
genre_df.head()

Unnamed: 0,genre_artists,genres
0,"""Cats"" 1981 Original London Cast",['show tunes']
1,"""Cats"" 1983 Broadway Cast",[]
2,"""Fiddler On The Roof” Motion Picture Chorus",[]
3,"""Fiddler On The Roof” Motion Picture Orchestra",[]
4,"""Joseph And The Amazing Technicolor Dreamcoat""...",[]


In [9]:
genre_df.shape

(27621, 2)

In [10]:
# Merge the genres to the spotify data
spotify_df = spotify_df.merge(genre_df[['genre_artists', 'genres']], 
                              left_on='artist', 
                              right_on='genre_artists', 
                              how='left').drop('genre_artists',axis=1)
spotify_df.head()

Unnamed: 0,acousticness,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,speechiness,tempo,valence,year,artist,genres
0,0.995,0.708,158648,0.195,0,6KbQ3uYMLKb5jDxLF7wYDD,0.563,10,0.151,-12.428,1,Singende Bataillone 1. Teil,0,0.0506,118.469,0.779,1928,Carl Woitschach,[]
1,0.994,0.379,282133,0.0135,0,6KuQTIu1KoTTkLXKrwlLPV,0.901,8,0.0763,-28.454,1,"Fantasiestücke, Op. 111: Più tosto lento",0,0.0462,83.972,0.0767,1928,Robert Schumann,"['classical', 'early romantic era']"
2,0.604,0.749,104300,0.22,0,6L63VW0PibdM1HDSBoqnoM,0.0,5,0.119,-19.924,0,Chapter 1.18 - Zamek kaniowski,0,0.929,107.177,0.88,1928,Seweryn Goszczyński,[]
3,0.995,0.781,180760,0.13,0,6M94FkXd15sOAOQYRnWPN8,0.887,1,0.111,-14.734,0,Bebamos Juntos - Instrumental (Remasterizado),0,0.0926,108.003,0.72,1928,Francisco Canaro,"['tango', 'vintage tango']"
4,0.99,0.21,687733,0.204,0,6N6tiFZ9vLTSOIxkj8qKrd,0.908,11,0.098,-16.829,1,"Polonaise-Fantaisie in A-Flat Major, Op. 61",1,0.0424,62.149,0.0693,1928,Frédéric Chopin,"['classical', 'early romantic era', 'polish cl..."


In [11]:
# Reorder columns
spotify_df = spotify_df[['id', 
                         'artist',
                         'name',
                         'genres',
                         'popularity', 
                         'year',  
                         'duration_ms', 
                         'tempo', 
                         'key', 
                         'mode',
                         'acousticness', 
                         'instrumentalness', 
                         'danceability', 
                         'energy', 
                         'liveness',
                         'loudness',
                         'speechiness', 
                         'valence', 
                         'explicit'
                         ]]
spotify_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
0,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,Singende Bataillone 1. Teil,[],0,1928,158648,118.469,10,1,0.995,0.563,0.708,0.195,0.151,-12.428,0.0506,0.779,0
1,6KuQTIu1KoTTkLXKrwlLPV,Robert Schumann,"Fantasiestücke, Op. 111: Più tosto lento","['classical', 'early romantic era']",0,1928,282133,83.972,8,1,0.994,0.901,0.379,0.0135,0.0763,-28.454,0.0462,0.0767,0
2,6L63VW0PibdM1HDSBoqnoM,Seweryn Goszczyński,Chapter 1.18 - Zamek kaniowski,[],0,1928,104300,107.177,5,0,0.604,0.0,0.749,0.22,0.119,-19.924,0.929,0.88,0
3,6M94FkXd15sOAOQYRnWPN8,Francisco Canaro,Bebamos Juntos - Instrumental (Remasterizado),"['tango', 'vintage tango']",0,1928,180760,108.003,1,0,0.995,0.887,0.781,0.13,0.111,-14.734,0.0926,0.72,0
4,6N6tiFZ9vLTSOIxkj8qKrd,Frédéric Chopin,"Polonaise-Fantaisie in A-Flat Major, Op. 61","['classical', 'early romantic era', 'polish cl...",1,1928,687733,62.149,11,1,0.99,0.908,0.21,0.204,0.098,-16.829,0.0424,0.0693,0


In [12]:
# Apply formatting to genre column
spotify_df['genres'] = spotify_df['genres'].str.replace("'", "").str.replace("[", "").str.replace("]", "")
spotify_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
0,6KbQ3uYMLKb5jDxLF7wYDD,Carl Woitschach,Singende Bataillone 1. Teil,,0,1928,158648,118.469,10,1,0.995,0.563,0.708,0.195,0.151,-12.428,0.0506,0.779,0
1,6KuQTIu1KoTTkLXKrwlLPV,Robert Schumann,"Fantasiestücke, Op. 111: Più tosto lento","classical, early romantic era",0,1928,282133,83.972,8,1,0.994,0.901,0.379,0.0135,0.0763,-28.454,0.0462,0.0767,0
2,6L63VW0PibdM1HDSBoqnoM,Seweryn Goszczyński,Chapter 1.18 - Zamek kaniowski,,0,1928,104300,107.177,5,0,0.604,0.0,0.749,0.22,0.119,-19.924,0.929,0.88,0
3,6M94FkXd15sOAOQYRnWPN8,Francisco Canaro,Bebamos Juntos - Instrumental (Remasterizado),"tango, vintage tango",0,1928,180760,108.003,1,0,0.995,0.887,0.781,0.13,0.111,-14.734,0.0926,0.72,0
4,6N6tiFZ9vLTSOIxkj8qKrd,Frédéric Chopin,"Polonaise-Fantaisie in A-Flat Major, Op. 61","classical, early romantic era, polish classical",1,1928,687733,62.149,11,1,0.99,0.908,0.21,0.204,0.098,-16.829,0.0424,0.0693,0


In [13]:
#test = spotify_df[~spotify_df['genres'].str.contains('classical', na=False)]
# https://stackoverflow.com/questions/52297740/typeerror-bad-operand-type-for-unary-float/52297788

In [14]:
# Replace blank values in genre column with NaN
spotify_df['genres'].replace('', np.nan, inplace=True)
# https://stackoverflow.com/questions/29314033/drop-rows-containing-empty-cells-from-a-pandas-dataframe

# Drop the rows when genre = NaN
spotify_df = spotify_df.dropna(subset=['genres'])

spotify_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
1,6KuQTIu1KoTTkLXKrwlLPV,Robert Schumann,"Fantasiestücke, Op. 111: Più tosto lento","classical, early romantic era",0,1928,282133,83.972,8,1,0.994,0.901,0.379,0.0135,0.0763,-28.454,0.0462,0.0767,0
3,6M94FkXd15sOAOQYRnWPN8,Francisco Canaro,Bebamos Juntos - Instrumental (Remasterizado),"tango, vintage tango",0,1928,180760,108.003,1,0,0.995,0.887,0.781,0.13,0.111,-14.734,0.0926,0.72,0
4,6N6tiFZ9vLTSOIxkj8qKrd,Frédéric Chopin,"Polonaise-Fantaisie in A-Flat Major, Op. 61","classical, early romantic era, polish classical",1,1928,687733,62.149,11,1,0.99,0.908,0.21,0.204,0.098,-16.829,0.0424,0.0693,0
5,6NxAf7M8DNHOBTmEd3JSO5,Felix Mendelssohn,Scherzo a capriccio: Presto,"classical, early romantic era",0,1928,352600,63.521,6,0,0.995,0.911,0.424,0.12,0.0915,-19.242,0.0593,0.266,0
6,6O0puPuyrxPjDTHDUgsWI7,Franz Liszt,"Valse oubliée No. 1 in F-Sharp Major, S. 215/1","classical, late romantic era",0,1928,136627,80.495,11,1,0.956,0.435,0.444,0.197,0.0744,-17.226,0.04,0.305,0


In [15]:
spotify_df.shape

(151765, 19)

In [16]:
# Create a list of genres keywords to keep
keep_list = ['rap','rock', 'pop', 'country', 'blues', 'jazz', 'punk', 'metal', 'folk', 'jam', 'electronic', 'edm', 'reggae', 'rhythm and blues']

# Create a new df to fill
spotify_filtered_df = spotify_df[spotify_df['genres'].str.contains("hip hop", na=False)]

# Loop through the genre list and add those rows to the 
for genre in keep_list:
    test2 = spotify_df[spotify_df['genres'].str.contains(genre, na=False)]
    spotify_filtered_df = spotify_filtered_df.append(test2)

### IS THERE A WAY TO APPEND ROWS ONLY IF UNIQUE SO THE DROP DUPLICATES IS NOT NEEDED??
    
# Drop duplicate rows 
spotify_filtered_df = spotify_filtered_df.drop_duplicates(subset=['artist', 'name'])

# Sort by ID
spotify_filtered_df = spotify_filtered_df.sort_values(by=['id'])
spotify_filtered_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
165680,000G1xMMuwxNHmwVsBdtj1,Blondie,Will Anything Happen,"candy pop, dance rock, new romantic, new wave,...",27,1978,182347,191.307,2,1,0.0131,0.000106,0.256,0.895,0.0821,-4.86,0.0707,0.555,0
9542,000ZxLGm7jDlWCHtcXSeBe,"Meade ""Lux"" Lewis",Torpedo Juice,"boogie-woogie, piano blues, ragtime, stride",1,1954,314667,97.694,2,0,0.795,0.878,0.685,0.483,0.113,-10.202,0.0337,0.854,0
123062,000u1dTg7y1XCDXi80hbBX,Texas,I Don't Want A Lover,"country, country road, country rock",61,1989,300600,120.484,7,1,0.196,0.000487,0.756,0.47,0.126,-12.615,0.0394,0.43,0
139777,000x2qE0ZI3hodeVrnJK8A,Heart,(Love Me Like Music) I'll Be Your Song,"album rock, classic rock, country rock, folk r...",34,1975,200627,134.248,0,1,0.339,0.0,0.507,0.356,0.18,-14.24,0.0306,0.472,0
77749,000xQL6tZNLJzIrtIgxqSl,ZAYN,Still Got Time (feat. PARTYNEXTDOOR),"dance pop, electropop, pop, post-teen pop, uk pop",64,2017,188491,120.963,7,1,0.131,0.0,0.748,0.627,0.0852,-6.029,0.0644,0.524,0


In [17]:
len(spotify_filtered_df)

102356

In [18]:
# lower case artist and title columns
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.lower()
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.lower()
spotify_filtered_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
165680,000G1xMMuwxNHmwVsBdtj1,blondie,will anything happen,"candy pop, dance rock, new romantic, new wave,...",27,1978,182347,191.307,2,1,0.0131,0.000106,0.256,0.895,0.0821,-4.86,0.0707,0.555,0
9542,000ZxLGm7jDlWCHtcXSeBe,"meade ""lux"" lewis",torpedo juice,"boogie-woogie, piano blues, ragtime, stride",1,1954,314667,97.694,2,0,0.795,0.878,0.685,0.483,0.113,-10.202,0.0337,0.854,0
123062,000u1dTg7y1XCDXi80hbBX,texas,i don't want a lover,"country, country road, country rock",61,1989,300600,120.484,7,1,0.196,0.000487,0.756,0.47,0.126,-12.615,0.0394,0.43,0
139777,000x2qE0ZI3hodeVrnJK8A,heart,(love me like music) i'll be your song,"album rock, classic rock, country rock, folk r...",34,1975,200627,134.248,0,1,0.339,0.0,0.507,0.356,0.18,-14.24,0.0306,0.472,0
77749,000xQL6tZNLJzIrtIgxqSl,zayn,still got time (feat. partynextdoor),"dance pop, electropop, pop, post-teen pop, uk pop",64,2017,188491,120.963,7,1,0.131,0.0,0.748,0.627,0.0852,-6.029,0.0644,0.524,0


In [19]:
# Test of song formatting - to remove all (), [] and everything inside (letters, numbers, space, .)
test = [{"name":"(hello hello) title of the song - version (extra. & % stuff) [other thing]", "artist" : "artist1"},
        {"name":"(hi) title of the another song [extra stuff] (other thing)", "artist" : "artist2"}]
test_df = pd.DataFrame(test)
test_df

Unnamed: 0,name,artist
0,(hello hello) title of the song - version (ext...,artist1
1,(hi) title of the another song [extra stuff] (...,artist2


In [20]:
#test_df["name"] = test_df["name"].str.replace(r'\([\w\s]+\)', '', regex=True)
#test_df["name"] = test_df["name"].str.replace(r'\[[\w\s]+\]', '', regex=True)
#test_df["name"] = test_df["name"].str.replace(r'^\s+', '', regex=True)
#test_df["name"] = test_df["name"].str.replace(r'\s+$', '', regex=True)
test_df["name"] = test_df["name"].str.replace(r'\([^\(\)]+\)', '', regex=True)
test_df["name"] = test_df["name"].str.replace(r'\[[^\[\]]+\]', '', regex=True)
test_df["name"] = test_df["name"].str.replace(r'\s-.+', '', regex=True) 
test_df["name"] = test_df["name"].str.replace(r'^\s+', '', regex=True)
test_df["name"] = test_df["name"].str.replace(r'\s+$', '', regex=True)


test_df

Unnamed: 0,name,artist
0,title of the song,artist1
1,title of the another song,artist2


In [21]:
# Create regex to find "Featuring" artists
feat_string = r'\s[f][e][a][t][u][r][e][s].+$|\s[f][e][a][t][u][r][i][n][g].+$|\s[f][e][a][t]\..+$'

In [22]:
# Apply formatting/removing to title column 
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\([^\(\)]+\)', '', regex=True) #(...)
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\[[^\[\]]+\]', '', regex=True) #[...] 
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\s-.+', '', regex=True) # -...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\s&.+', '', regex=True)  # &...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(feat_string, '', regex=True) # feat...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\/.+', '', regex=True) # /...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\\.+', '', regex=True) # \...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r';.+', '',  regex=True) # ;...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'_.+', '',  regex=True) # _...
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\s\s+', ' ',  regex=True ) # more than one blank space into one
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'^\s+', '', regex=True) # blank spaces at begiining
spotify_filtered_df['name'] = spotify_filtered_df['name'].str.replace(r'\s+$', '', regex=True) # blank spaces at end


spotify_filtered_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
165680,000G1xMMuwxNHmwVsBdtj1,blondie,will anything happen,"candy pop, dance rock, new romantic, new wave,...",27,1978,182347,191.307,2,1,0.0131,0.000106,0.256,0.895,0.0821,-4.86,0.0707,0.555,0
9542,000ZxLGm7jDlWCHtcXSeBe,"meade ""lux"" lewis",torpedo juice,"boogie-woogie, piano blues, ragtime, stride",1,1954,314667,97.694,2,0,0.795,0.878,0.685,0.483,0.113,-10.202,0.0337,0.854,0
123062,000u1dTg7y1XCDXi80hbBX,texas,i don't want a lover,"country, country road, country rock",61,1989,300600,120.484,7,1,0.196,0.000487,0.756,0.47,0.126,-12.615,0.0394,0.43,0
139777,000x2qE0ZI3hodeVrnJK8A,heart,i'll be your song,"album rock, classic rock, country rock, folk r...",34,1975,200627,134.248,0,1,0.339,0.0,0.507,0.356,0.18,-14.24,0.0306,0.472,0
77749,000xQL6tZNLJzIrtIgxqSl,zayn,still got time,"dance pop, electropop, pop, post-teen pop, uk pop",64,2017,188491,120.963,7,1,0.131,0.0,0.748,0.627,0.0852,-6.029,0.0644,0.524,0


In [23]:
# Apply same formatting/removing to artist column 
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\([^\(\)]+\)', '', regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\[[^\[\]]+\]', '', regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\s-.+', '', regex=True) 
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\s&.+', '', regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(feat_string, '', regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\/.+', '', regex=True) 
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\\.+', '', regex=True) 
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r';.+', '',  regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'_.+', '',  regex=True) 
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\s\s+', ' ',  regex=True )
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'^\s+', '', regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'\s+$', '', regex=True)
spotify_filtered_df['artist'] = spotify_filtered_df['artist'].str.replace(r'"', '', regex=True) # remove "" in artist name

In [24]:
# Remove rows with non-ascii characters
# https://stackoverflow.com/questions/46094325/python-removing-non-ascii-characters-from-csv-file-using-pandas
spotify_filtered_df = spotify_filtered_df[~spotify_filtered_df['artist'].str.contains(r'[^\x00-\x7F]+')]
spotify_filtered_df = spotify_filtered_df[~spotify_filtered_df['name'].str.contains(r'[^\x00-\x7F]+')]
spotify_filtered_df.head()

Unnamed: 0,id,artist,name,genres,popularity,year,duration_ms,tempo,key,mode,acousticness,instrumentalness,danceability,energy,liveness,loudness,speechiness,valence,explicit
165680,000G1xMMuwxNHmwVsBdtj1,blondie,will anything happen,"candy pop, dance rock, new romantic, new wave,...",27,1978,182347,191.307,2,1,0.0131,0.000106,0.256,0.895,0.0821,-4.86,0.0707,0.555,0
9542,000ZxLGm7jDlWCHtcXSeBe,meade lux lewis,torpedo juice,"boogie-woogie, piano blues, ragtime, stride",1,1954,314667,97.694,2,0,0.795,0.878,0.685,0.483,0.113,-10.202,0.0337,0.854,0
123062,000u1dTg7y1XCDXi80hbBX,texas,i don't want a lover,"country, country road, country rock",61,1989,300600,120.484,7,1,0.196,0.000487,0.756,0.47,0.126,-12.615,0.0394,0.43,0
139777,000x2qE0ZI3hodeVrnJK8A,heart,i'll be your song,"album rock, classic rock, country rock, folk r...",34,1975,200627,134.248,0,1,0.339,0.0,0.507,0.356,0.18,-14.24,0.0306,0.472,0
77749,000xQL6tZNLJzIrtIgxqSl,zayn,still got time,"dance pop, electropop, pop, post-teen pop, uk pop",64,2017,188491,120.963,7,1,0.131,0.0,0.748,0.627,0.0852,-6.029,0.0644,0.524,0


In [25]:
len(spotify_filtered_df)

97754

In [26]:
new_column_names = {
    "id": "spot_id",
    "artist": "artist_name",
    "name": "song_title",
    "genres": "features_genres",
    "popularity": "feature_popularity",
    "year": "song_year",
    "duration_ms": "feature_duration",
    "tempo": "feature_tempo",
    "key": "feature_key",
    "mode": "feature_mode",
    "acousticness": "feature_acousticness",
    "instrumentalness": "feature_instrumentalness",
    "danceability": "feature_danceability",
    "energy": "feature_energy",
    "liveness": "feature_liveness",
    "loudness": "feature_loudness",
    "speechiness": "feature_speechiness",
    "valence": "feature_valence",
    "explicit": "feature_explicit"
}

In [27]:
spotify_filtered_df = spotify_filtered_df.rename(columns = new_column_names)
spotify_filtered_df.head()

Unnamed: 0,spot_id,artist_name,song_title,features_genres,feature_popularity,song_year,feature_duration,feature_tempo,feature_key,feature_mode,feature_acousticness,feature_instrumentalness,feature_danceability,feature_energy,feature_liveness,feature_loudness,feature_speechiness,feature_valence,feature_explicit
165680,000G1xMMuwxNHmwVsBdtj1,blondie,will anything happen,"candy pop, dance rock, new romantic, new wave,...",27,1978,182347,191.307,2,1,0.0131,0.000106,0.256,0.895,0.0821,-4.86,0.0707,0.555,0
9542,000ZxLGm7jDlWCHtcXSeBe,meade lux lewis,torpedo juice,"boogie-woogie, piano blues, ragtime, stride",1,1954,314667,97.694,2,0,0.795,0.878,0.685,0.483,0.113,-10.202,0.0337,0.854,0
123062,000u1dTg7y1XCDXi80hbBX,texas,i don't want a lover,"country, country road, country rock",61,1989,300600,120.484,7,1,0.196,0.000487,0.756,0.47,0.126,-12.615,0.0394,0.43,0
139777,000x2qE0ZI3hodeVrnJK8A,heart,i'll be your song,"album rock, classic rock, country rock, folk r...",34,1975,200627,134.248,0,1,0.339,0.0,0.507,0.356,0.18,-14.24,0.0306,0.472,0
77749,000xQL6tZNLJzIrtIgxqSl,zayn,still got time,"dance pop, electropop, pop, post-teen pop, uk pop",64,2017,188491,120.963,7,1,0.131,0.0,0.748,0.627,0.0852,-6.029,0.0644,0.524,0


In [28]:
# extract the "features_genres" column into a DF, and drop it from spotify_filtered_df
genres_df = spotify_filtered_df[["spot_id", "features_genres"]]
spotify_filtered_df = spotify_filtered_df.drop(columns=["features_genres"], axis=1)
genres_df = genres_df.reset_index(drop=True)
genres_df.head()

Unnamed: 0,spot_id,features_genres
0,000G1xMMuwxNHmwVsBdtj1,"candy pop, dance rock, new romantic, new wave,..."
1,000ZxLGm7jDlWCHtcXSeBe,"boogie-woogie, piano blues, ragtime, stride"
2,000u1dTg7y1XCDXi80hbBX,"country, country road, country rock"
3,000x2qE0ZI3hodeVrnJK8A,"album rock, classic rock, country rock, folk r..."
4,000xQL6tZNLJzIrtIgxqSl,"dance pop, electropop, pop, post-teen pop, uk pop"


In [29]:
# getting the main genre (from the genre_list) out of "genres" column into new column "genre_1"

genre_list = ['rap','rock', 'pop', 'country', 'blues', 'hip hop', 'jazz', 'punk', 'metal', 
              'folk', 'jam', 'electronic', 'edm', 'reggae', 'rhythm and blues', 'rnb', 'latin', 'world']

for i in range(len(genres_df)) : 
    
    dico={}
    
    for genre in genre_list:
        if genres_df.loc[i, "features_genres"].find(genre) !=-1:
            dico[genre]= genres_df.loc[i, "features_genres"].find(genre)
                    
    if len(dico)==1:
        genres_df.loc[i, "genre_1"]= list(dico.keys())[0]
            
    elif len(dico)>1:
        j = 1
        first = 0
        while j <(len(dico)):
            if list(dico.values())[j]<list(dico.values())[j-1]:
                first = j
            j=j+1
        genres_df.loc[i, "genre_1"]= list(dico.keys())[first]    
        
        
genres_df.head(10)  

Unnamed: 0,spot_id,features_genres,genre_1
0,000G1xMMuwxNHmwVsBdtj1,"candy pop, dance rock, new romantic, new wave,...",pop
1,000ZxLGm7jDlWCHtcXSeBe,"boogie-woogie, piano blues, ragtime, stride",blues
2,000u1dTg7y1XCDXi80hbBX,"country, country road, country rock",country
3,000x2qE0ZI3hodeVrnJK8A,"album rock, classic rock, country rock, folk r...",country
4,000xQL6tZNLJzIrtIgxqSl,"dance pop, electropop, pop, post-teen pop, uk pop",pop
5,001UkMQHw4zXfFNdKpwXAF,"contemporary country, country, country road",country
6,0024tEymsoc9FyKUauQngQ,"cyberpunk, new age",punk
7,002CcxKpBE1tfKOy2CRaWr,"chutney, classic bollywood, desi pop, filmi, g...",pop
8,002aR3zqP6SvscCnPT44on,"jazz funk, smooth jazz",jazz
9,002zOHMdBKYgNGtmmHSE2D,"adult standards, lounge, torch song, vocal jazz",jazz


In [30]:
genres_df.genre_1.value_counts()

rock                21964
pop                 20716
jazz                12864
country              8818
hip hop              6221
folk                 5872
metal                5128
blues                5011
rap                  3023
punk                 2440
latin                2435
reggae               1208
rhythm and blues      674
electronic            546
edm                   539
jam                   233
world                  62
Name: genre_1, dtype: int64

In [31]:
genres_df["genre_1"].isnull().sum()

0

In [32]:
# add the "genre_1" column to spotify_filtered_df, and rename "genre_1"

spotify_filtered_df = spotify_filtered_df.merge(genres_df[["spot_id","genre_1"]], left_on="spot_id", right_on="spot_id")
#spotify_filtered_df = spotify_filtered_df.drop("features_genres", axis=1)
spotify_filtered_df = spotify_filtered_df.rename(columns={"genre_1": "feature_genre"})

spotify_filtered_df.head()

Unnamed: 0,spot_id,artist_name,song_title,feature_popularity,song_year,feature_duration,feature_tempo,feature_key,feature_mode,feature_acousticness,feature_instrumentalness,feature_danceability,feature_energy,feature_liveness,feature_loudness,feature_speechiness,feature_valence,feature_explicit,feature_genre
0,000G1xMMuwxNHmwVsBdtj1,blondie,will anything happen,27,1978,182347,191.307,2,1,0.0131,0.000106,0.256,0.895,0.0821,-4.86,0.0707,0.555,0,pop
1,000ZxLGm7jDlWCHtcXSeBe,meade lux lewis,torpedo juice,1,1954,314667,97.694,2,0,0.795,0.878,0.685,0.483,0.113,-10.202,0.0337,0.854,0,blues
2,000u1dTg7y1XCDXi80hbBX,texas,i don't want a lover,61,1989,300600,120.484,7,1,0.196,0.000487,0.756,0.47,0.126,-12.615,0.0394,0.43,0,country
3,000x2qE0ZI3hodeVrnJK8A,heart,i'll be your song,34,1975,200627,134.248,0,1,0.339,0.0,0.507,0.356,0.18,-14.24,0.0306,0.472,0,country
4,000xQL6tZNLJzIrtIgxqSl,zayn,still got time,64,2017,188491,120.963,7,1,0.131,0.0,0.748,0.627,0.0852,-6.029,0.0644,0.524,0,pop


In [33]:
spotify_filtered_df.shape

(97754, 19)

In [34]:
# reorder columns
spotify_filtered_df = spotify_filtered_df[[
    "spot_id",
    "artist_name",
    "song_title",
    "song_year",
    "feature_genre",
    "feature_popularity",
    "feature_duration",
    "feature_key",
    "feature_acousticness",
    "feature_instrumentalness",
    "feature_tempo",
    "feature_mode",
    "feature_danceability",
    "feature_energy",
    "feature_liveness",
    "feature_loudness",
    "feature_speechiness",
    "feature_valence",
    "feature_explicit"   
]]
spotify_filtered_df.head()

Unnamed: 0,spot_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,feature_tempo,feature_mode,feature_danceability,feature_energy,feature_liveness,feature_loudness,feature_speechiness,feature_valence,feature_explicit
0,000G1xMMuwxNHmwVsBdtj1,blondie,will anything happen,1978,pop,27,182347,2,0.0131,0.000106,191.307,1,0.256,0.895,0.0821,-4.86,0.0707,0.555,0
1,000ZxLGm7jDlWCHtcXSeBe,meade lux lewis,torpedo juice,1954,blues,1,314667,2,0.795,0.878,97.694,0,0.685,0.483,0.113,-10.202,0.0337,0.854,0
2,000u1dTg7y1XCDXi80hbBX,texas,i don't want a lover,1989,country,61,300600,7,0.196,0.000487,120.484,1,0.756,0.47,0.126,-12.615,0.0394,0.43,0
3,000x2qE0ZI3hodeVrnJK8A,heart,i'll be your song,1975,country,34,200627,0,0.339,0.0,134.248,1,0.507,0.356,0.18,-14.24,0.0306,0.472,0
4,000xQL6tZNLJzIrtIgxqSl,zayn,still got time,2017,pop,64,188491,7,0.131,0.0,120.963,1,0.748,0.627,0.0852,-6.029,0.0644,0.524,0


## export to Postgres - create table "spotify" in Postgres first

In [35]:
# Connect to database (Note: The package psychopg2 is required for Postgres to work with SQLAlchemy)
db_string = f"postgres://postgres:{db_pwd}@127.0.0.1:5432/Platinum_Lyrics"

engine = create_engine(db_string)

In [36]:
for ind in range(0,97):
    spotify_filtered_df[ind*1000:ind*1000+1000].to_sql(name='spotify', con=engine, index=False, 
          if_exists='append')
    print(f"Chunk {ind*1000}-{ind*1000+1000} exported")
    print("---------------------")

spotify_filtered_df[97000:97755].to_sql(name='spotify', con=engine, index=False, if_exists='append') 
print(f"Chunk 97000-97755 exported")

print("Export Successful") 

Chunk 0-1000 exported
---------------------
Chunk 1000-2000 exported
---------------------
Chunk 2000-3000 exported
---------------------
Chunk 3000-4000 exported
---------------------
Chunk 4000-5000 exported
---------------------
Chunk 5000-6000 exported
---------------------
Chunk 6000-7000 exported
---------------------
Chunk 7000-8000 exported
---------------------
Chunk 8000-9000 exported
---------------------
Chunk 9000-10000 exported
---------------------
Chunk 10000-11000 exported
---------------------
Chunk 11000-12000 exported
---------------------
Chunk 12000-13000 exported
---------------------
Chunk 13000-14000 exported
---------------------
Chunk 14000-15000 exported
---------------------
Chunk 15000-16000 exported
---------------------
Chunk 16000-17000 exported
---------------------
Chunk 17000-18000 exported
---------------------
Chunk 18000-19000 exported
---------------------
Chunk 19000-20000 exported
---------------------
Chunk 20000-21000 exported
---------------

In [37]:
# Export cleaned spotify data
spotify_filtered_df.to_csv("../Resources/spotify_cleaned.csv", index=False)