In [None]:
import pandas as pd
import numpy as np
import psycopg2
import re
from sqlalchemy import create_engine
from config import db_pwd, RDS_pwd

In [None]:
# update Postgres Connection before importing : local or RDS

In [2]:
# Connect to local database:
#db_string = f"postgres://postgres:{db_pwd}@127.0.0.1:5432/Platinum_Lyrics"

# Connect to RDS Database:
db_string = f"postgres://postgres:{RDS_pwd}@platinum-rds.cbu3an3ywyth.us-east-2.rds.amazonaws.com/Platinum_Lyrics"

engine = create_engine(db_string)

## platinum lyrics

In [3]:
platinum_lyrics = pd.read_sql_table("platinum_lyrics",engine)
platinum_lyrics.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,word_abov,word_accept,word_ach,...,word_yeah,word_year,word_yellow,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth
0,TRAAAAV128F421A322,western addiction,a poor recipe for civic cohesion,2005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TRAAABD128F429CF47,the box tops,soul deep,1969,1,18,14,0,0,0,...,4,0,0,0,0,0,0,0,0,0
2,TRAAAGF12903CEC202,halvdan sivertsen,smã¥ ord,2005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1984,1,29,95,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TRAABEV12903CC53A4,suicide commando,blood in face,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
platinum_lyrics.shape

(39094, 1554)

In [5]:
column_names = platinum_lyrics.columns.values
column_names

array(['track_id', 'artist_name', 'song_title', ..., 'word_young',
       'word_yourself', 'word_youth'], dtype=object)

In [6]:
# new column names (removing "word_")
new_column_names = {}
for column in column_names:
    new_column_names[column] = column.replace('word_', '')

new_column_names    

{'track_id': 'track_id',
 'artist_name': 'artist_name',
 'song_title': 'song_title',
 'song_year': 'song_year',
 'target_success': 'target_success',
 'target_weeks': 'target_weeks',
 'target_peak': 'target_peak',
 'word_abov': 'abov',
 'word_accept': 'accept',
 'word_ach': 'ach',
 'word_across': 'across',
 'word_act': 'act',
 'word_action': 'action',
 'word_addict': 'addict',
 'word_admit': 'admit',
 'word_ador': 'ador',
 'word_afraid': 'afraid',
 'word_against': 'against',
 'word_age': 'age',
 'word_ago': 'ago',
 'word_ahead': 'ahead',
 'word_aim': 'aim',
 'word_air': 'air',
 'word_album': 'album',
 'word_algo': 'algo',
 'word_aliv': 'aliv',
 'word_alla': 'alla',
 'word_alma': 'alma',
 'word_almost': 'almost',
 'word_alon': 'alon',
 'word_along': 'along',
 'word_alreadi': 'alreadi',
 'word_alright': 'alright',
 'word_although': 'although',
 'word_alway': 'alway',
 'word_amaz': 'amaz',
 'word_america': 'america',
 'word_american': 'american',
 'word_amigo': 'amigo',
 'word_amo': 'amo',

In [7]:
platinum_lyrics = platinum_lyrics.rename(columns=new_column_names)
platinum_lyrics.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,abov,accept,ach,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,TRAAAAV128F421A322,western addiction,a poor recipe for civic cohesion,2005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TRAAABD128F429CF47,the box tops,soul deep,1969,1,18,14,0,0,0,...,4,0,0,0,0,0,0,0,0,0
2,TRAAAGF12903CEC202,halvdan sivertsen,smã¥ ord,2005,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1984,1,29,95,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TRAABEV12903CC53A4,suicide commando,blood in face,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# export to csv
platinum_lyrics.to_csv("../Resources/platinum_lyrics.csv")

## platinum lyrics features

In [9]:
platinum_lyrics_features = pd.read_sql_table("platinum_lyrics_features",engine)
platinum_lyrics_features.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,word_yeah,word_year,word_yellow,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth
0,TRAAABD128F429CF47,the box tops,soul deep,1969,pop,38,147760,9,0.195,3.5e-05,...,4,0,0,0,0,0,0,0,0,0
1,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1984,pop,78,184480,10,0.195,0.0,...,0,0,0,0,0,0,0,0,0,0
2,TRAADKW128E079503A,tracy chapman,fast car,1988,folk,78,296800,4,0.313,0.0,...,0,0,0,0,0,0,0,1,0,0
3,TRAAFEU128E078581C,r.e.m.,fall on me,1986,rock,48,169920,0,0.0711,0.000391,...,0,0,0,0,0,0,0,0,0,0
4,TRAAGMC128F4292D0F,little texas,my love,1994,country,44,245800,9,0.377,0.0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
platinum_lyrics_features.shape

(8692, 1569)

In [11]:
platinum_lyrics_features = platinum_lyrics_features.rename(columns=new_column_names)
platinum_lyrics_features.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,TRAAABD128F429CF47,the box tops,soul deep,1969,pop,38,147760,9,0.195,3.5e-05,...,4,0,0,0,0,0,0,0,0,0
1,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1984,pop,78,184480,10,0.195,0.0,...,0,0,0,0,0,0,0,0,0,0
2,TRAADKW128E079503A,tracy chapman,fast car,1988,folk,78,296800,4,0.313,0.0,...,0,0,0,0,0,0,0,1,0,0
3,TRAAFEU128E078581C,r.e.m.,fall on me,1986,rock,48,169920,0,0.0711,0.000391,...,0,0,0,0,0,0,0,0,0,0
4,TRAAGMC128F4292D0F,little texas,my love,1994,country,44,245800,9,0.377,0.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# export to csv
platinum_lyrics_features.to_csv("../Resources/platinum_lyrics_features.csv")

## platinum features

In [13]:
platinum_features = pd.read_sql_table("platinum_features",engine)

platinum_features.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,feature_danceability,feature_energy,feature_liveness,feature_loudness,feature_speechiness,feature_valence,feature_explicit,target_success,target_weeks,target_peak
0,TRRBOBU128F4293068,texas,i don't want a lover,1989,country,61,300600,7,0.196,0.000487,...,0.756,0.47,0.126,-12.615,0.0394,0.43,0,1,77,6
1,TRVCPQS128F4285928,the youngbloods,ride the wind,1988,rock,20,396600,2,0.91,0.651,...,0.558,0.307,0.0866,-20.492,0.0343,0.674,0,0,0,0
2,TRZRMWW128F426E797,babyface,tender lover,1990,pop,37,259267,5,0.226,0.000422,...,0.743,0.86,0.0513,-6.346,0.0445,0.687,0,1,17,89
3,TRVSRVI128F4261843,reo speedwagon,one lonely night,1984,country,31,201467,5,0.0561,0.000149,...,0.408,0.579,0.0712,-10.277,0.026,0.397,0,1,19,16
4,TRSHXOI128F146B1AE,john waite,change,1982,rock,49,196693,11,0.246,0.0,...,0.403,0.848,0.616,-11.615,0.0625,0.622,0,1,54,10


In [14]:
platinum_lyrics_features.shape

(8692, 1571)

In [15]:
# export to csv
platinum_features.to_csv("../Resources/platinum_features.csv")

## platinum spotify

In [29]:
platinum_spotify = pd.read_sql_table("platinum_spotify",engine)
platinum_spotify.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,word_yeah,word_year,word_yellow,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth
0,TRAAABD128F429CF47,the box tops,soul deep,1969,pop,38,147760,9,0.195,3.5e-05,...,4,0,0,0,0,0,0,0,0,0
1,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1983,pop,78,184480,10,0.195,0.0,...,0,0,0,0,0,0,0,0,0,0
2,TRAABHB12903CAFC2F,bauhaus,a god in an alcove,1980,rock,34,248267,4,0.0445,6e-06,...,0,0,0,0,0,0,0,0,0,0
3,TRAABJV128F1460C49,lionel richie,tonight will be alright,1986,rock,38,307413,8,0.0591,0.0141,...,0,0,0,0,0,0,0,0,0,0
4,TRAACER128F4290F96,the dillinger escape plan,setting fire to sleeping giants,2004,metal,43,207773,9,0.0093,0.000562,...,1,0,0,0,0,0,0,0,0,0


In [30]:
platinum_spotify.shape

(23724, 1568)

In [31]:
column_names = platinum_spotify.columns.values
column_names

array(['track_id', 'artist_name', 'song_title', ..., 'word_young',
       'word_yourself', 'word_youth'], dtype=object)

In [32]:
# new column names (removing "word_")
new_column_names = {}
for column in column_names:
    new_column_names[column] = column.replace('word_', '')

new_column_names   

{'track_id': 'track_id',
 'artist_name': 'artist_name',
 'song_title': 'song_title',
 'song_year': 'song_year',
 'feature_genre': 'feature_genre',
 'feature_popularity': 'feature_popularity',
 'feature_duration': 'feature_duration',
 'feature_key': 'feature_key',
 'feature_acousticness': 'feature_acousticness',
 'feature_instrumentalness': 'feature_instrumentalness',
 'feature_tempo': 'feature_tempo',
 'feature_mode': 'feature_mode',
 'feature_danceability': 'feature_danceability',
 'feature_energy': 'feature_energy',
 'feature_liveness': 'feature_liveness',
 'feature_loudness': 'feature_loudness',
 'feature_speechiness': 'feature_speechiness',
 'feature_valence': 'feature_valence',
 'feature_explicit': 'feature_explicit',
 'word_abov': 'abov',
 'word_accept': 'accept',
 'word_ach': 'ach',
 'word_across': 'across',
 'word_act': 'act',
 'word_action': 'action',
 'word_addict': 'addict',
 'word_admit': 'admit',
 'word_ador': 'ador',
 'word_afraid': 'afraid',
 'word_against': 'against',
 

In [33]:
platinum_spotify = platinum_spotify.rename(columns=new_column_names)
platinum_spotify = platinum_spotify.drop(columns=["cost", "oder"], axis=1)
platinum_spotify.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,TRAAABD128F429CF47,the box tops,soul deep,1969,pop,38,147760,9,0.195,3.5e-05,...,4,0,0,0,0,0,0,0,0,0
1,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1983,pop,78,184480,10,0.195,0.0,...,0,0,0,0,0,0,0,0,0,0
2,TRAABHB12903CAFC2F,bauhaus,a god in an alcove,1980,rock,34,248267,4,0.0445,6e-06,...,0,0,0,0,0,0,0,0,0,0
3,TRAABJV128F1460C49,lionel richie,tonight will be alright,1986,rock,38,307413,8,0.0591,0.0141,...,0,0,0,0,0,0,0,0,0,0
4,TRAACER128F4290F96,the dillinger escape plan,setting fire to sleeping giants,2004,metal,43,207773,9,0.0093,0.000562,...,1,0,0,0,0,0,0,0,0,0


In [34]:
# export to csv
platinum_spotify.to_csv("../Resources/platinum_spotify.csv")

## Platinum location

In [10]:
platinum_location = pd.read_sql_table("platinum_location",engine)
platinum_location.head()

Unnamed: 0,track_id,artist_name,song_title,lat,long,location,word_abov,word_accept,word_ach,word_across,...,word_yeah,word_year,word_yellow,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth
0,TRAAKXQ12903CC0143,art lund,don't cry,40.75952,-111.88823,"Salt Lake City, UT",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TRAALGY12903CBA247,echolyn,the cheese stands alone,40.99471,-77.60454,Pennsylvania,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,TRAEXRM12903CBA5A5,gibson,it's love,38.2589,-92.43659,Missouri,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
3,TRAGCBW12903CEA424,hibernation,hibernation,51.50632,-0.12714,UK - England - London,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TRAMDKP128F425BFD8,top of the fair,the san andreas fault,41.51776,-72.75753,Connecticut,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
column_names = platinum_location.columns.values
column_names

array(['track_id', 'artist_name', 'song_title', ..., 'word_young',
       'word_yourself', 'word_youth'], dtype=object)

In [12]:
# new column names (removing "word_")
new_column_names = {}
for column in column_names:
    new_column_names[column] = column.replace('word_', '')

new_column_names   

{'track_id': 'track_id',
 'artist_name': 'artist_name',
 'song_title': 'song_title',
 'lat': 'lat',
 'long': 'long',
 'location': 'location',
 'word_abov': 'abov',
 'word_accept': 'accept',
 'word_ach': 'ach',
 'word_across': 'across',
 'word_act': 'act',
 'word_action': 'action',
 'word_addict': 'addict',
 'word_admit': 'admit',
 'word_ador': 'ador',
 'word_afraid': 'afraid',
 'word_against': 'against',
 'word_age': 'age',
 'word_ago': 'ago',
 'word_ahead': 'ahead',
 'word_aim': 'aim',
 'word_air': 'air',
 'word_album': 'album',
 'word_algo': 'algo',
 'word_aliv': 'aliv',
 'word_alla': 'alla',
 'word_alma': 'alma',
 'word_almost': 'almost',
 'word_alon': 'alon',
 'word_along': 'along',
 'word_alreadi': 'alreadi',
 'word_alright': 'alright',
 'word_although': 'although',
 'word_alway': 'alway',
 'word_amaz': 'amaz',
 'word_america': 'america',
 'word_american': 'american',
 'word_amigo': 'amigo',
 'word_amo': 'amo',
 'word_amor': 'amor',
 'word_amour': 'amour',
 'word_angel': 'angel',


In [13]:
platinum_location = platinum_location.rename(columns=new_column_names)
platinum_location.head()

Unnamed: 0,track_id,artist_name,song_title,lat,long,location,abov,accept,ach,across,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,TRAAKXQ12903CC0143,art lund,don't cry,40.75952,-111.88823,"Salt Lake City, UT",0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,TRAALGY12903CBA247,echolyn,the cheese stands alone,40.99471,-77.60454,Pennsylvania,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,TRAEXRM12903CBA5A5,gibson,it's love,38.2589,-92.43659,Missouri,0,0,0,0,...,0,0,0,0,0,2,0,0,0,0
3,TRAGCBW12903CEA424,hibernation,hibernation,51.50632,-0.12714,UK - England - London,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,TRAMDKP128F425BFD8,top of the fair,the san andreas fault,41.51776,-72.75753,Connecticut,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# platinum_location = platinum_location.drop(columns=["cost", "oder"], axis=1)

In [18]:
# export to csv
platinum_location.to_csv("../Resources/platinum_location.csv")

In [16]:
platinum_lyrics_location = pd.read_sql_table("platinum_lyrics_location",engine)
platinum_lyrics_location.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,word_abov,word_accept,word_ach,...,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth,lat,long,location
0,TRAXRLU128F423274D,tka,louder than love,1991,1,11,88,0,0,0,...,0,0,0,0,0,0,0,46.71067,1.71819,FRANCE
1,TRBBHFE12903CA5EEB,paris,hip replacement,2002,0,0,0,0,0,0,...,0,0,0,0,0,0,0,43.07295,-89.38669,"Madison, WI"
2,TRBIQHG128F92F4F9F,ventana,the dying sound,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,41.50471,-81.69074,Cleveland
3,TRBMLWH128F1459C3E,the cox family,i am weary,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,41.26069,-95.93995,"OMAHA, Nebraska"
4,TRBNEET128F145AC89,russ ballard,the last time,1984,0,0,0,0,0,0,...,2,0,0,0,0,0,0,51.69051,-0.03381,"Waltham Cross, Hertfordshire, Eng"


In [21]:
platinum_lyrics_location = platinum_lyrics_location.rename(columns=new_column_names)
platinum_lyrics_location.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,abov,accept,ach,...,yes,yesterday,yet,york,young,yourself,youth,lat,long,location
0,TRAXRLU128F423274D,tka,louder than love,1991,1,11,88,0,0,0,...,0,0,0,0,0,0,0,46.71067,1.71819,FRANCE
1,TRBBHFE12903CA5EEB,paris,hip replacement,2002,0,0,0,0,0,0,...,0,0,0,0,0,0,0,43.07295,-89.38669,"Madison, WI"
2,TRBIQHG128F92F4F9F,ventana,the dying sound,2008,0,0,0,0,0,0,...,0,0,0,0,0,0,0,41.50471,-81.69074,Cleveland
3,TRBMLWH128F1459C3E,the cox family,i am weary,2000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,41.26069,-95.93995,"OMAHA, Nebraska"
4,TRBNEET128F145AC89,russ ballard,the last time,1984,0,0,0,0,0,0,...,2,0,0,0,0,0,0,51.69051,-0.03381,"Waltham Cross, Hertfordshire, Eng"


In [22]:
platinum_lyrics_location = platinum_lyrics_location.drop(columns=["cost", "oder"], axis=1)

In [27]:
platinum_lyrics_location.shape

(427, 1557)

In [23]:
# export to csv
platinum_lyrics_location.to_csv("../Resources/platinum_lyrics_location.csv")

In [24]:
platinum_spotify_location = pd.read_sql_table("platinum_spotify_location",engine)
platinum_spotify_location.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,word_yes,word_yesterday,word_yet,word_york,word_young,word_yourself,word_youth,lat,long,location
0,TRAXRLU128F423274D,tka,louder than love,1990,hip hop,42,318893,6,0.0941,0.109,...,0,0,0,0,0,0,0,46.71067,1.71819,FRANCE
1,TRBWEQF12903D0BB61,john paul young,love is in the air,2009,rock,63,210293,0,0.0254,0.000264,...,0,0,0,0,0,0,0,55.8578,-4.24251,"Glasgow, Scotland"
2,TRCEVIT128F932BA0E,zac brown band,highway 20 ride,2008,country,61,229280,3,0.551,1.4e-05,...,0,0,0,0,0,0,0,33.74831,-84.39111,"Atlanta, GA"
3,TRCFIDD128F932B011,james gang,ride the wind,1973,blues,31,226773,0,0.1,1e-06,...,0,0,0,0,0,0,0,41.50471,-81.69074,"Cleveland, OH"
4,TRCLRJF128F9349B59,rebelution,safe and sound,2007,reggae,60,229493,8,0.00545,0.0,...,0,0,0,0,0,0,0,34.41925,-119.69887,"Santa Barbara, CA"


In [25]:
platinum_spotify_location = platinum_spotify_location.rename(columns=new_column_names)
platinum_spotify_location = platinum_spotify_location.drop(columns=["cost", "oder"], axis=1)
platinum_spotify_location.head()

Unnamed: 0,track_id,artist_name,song_title,song_year,feature_genre,feature_popularity,feature_duration,feature_key,feature_acousticness,feature_instrumentalness,...,yes,yesterday,yet,york,young,yourself,youth,lat,long,location
0,TRAXRLU128F423274D,tka,louder than love,1990,hip hop,42,318893,6,0.0941,0.109,...,0,0,0,0,0,0,0,46.71067,1.71819,FRANCE
1,TRBWEQF12903D0BB61,john paul young,love is in the air,2009,rock,63,210293,0,0.0254,0.000264,...,0,0,0,0,0,0,0,55.8578,-4.24251,"Glasgow, Scotland"
2,TRCEVIT128F932BA0E,zac brown band,highway 20 ride,2008,country,61,229280,3,0.551,1.4e-05,...,0,0,0,0,0,0,0,33.74831,-84.39111,"Atlanta, GA"
3,TRCFIDD128F932B011,james gang,ride the wind,1973,blues,31,226773,0,0.1,1e-06,...,0,0,0,0,0,0,0,41.50471,-81.69074,"Cleveland, OH"
4,TRCLRJF128F9349B59,rebelution,safe and sound,2007,reggae,60,229493,8,0.00545,0.0,...,0,0,0,0,0,0,0,34.41925,-119.69887,"Santa Barbara, CA"


In [26]:
platinum_spotify_location.shape

(223, 1569)

In [28]:
# export to csv
platinum_spotify_location.to_csv("../Resources/platinum_spotify_location.csv")