In [1]:
from urllib.parse import quote

import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from config import db_pwd, RDS_pwd

In [2]:
# Load the lyrics dataset from the S3-hosted CSV and preview the first rows
lyrics_csv_url = "https://platinum-lyric-bucket.s3.us-east-2.amazonaws.com/platinum_lyrics.csv"
platinum_lyrics_df = pd.read_csv(lyrics_csv_url)
platinum_lyrics_df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artist_name,song_title,song_year,target_success,target_weeks,target_peak,abov,accept,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
0,0,TRAAAAV128F421A322,western addiction,a poor recipe for civic cohesion,2005,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,TRAAABD128F429CF47,the box tops,soul deep,1969,1,18,14,0,0,...,4,0,0,0,0,0,0,0,0,0
2,2,TRAAAGF12903CEC202,halvdan sivertsen,smã¥ ord,2005,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,TRAAAZF12903CCCF6B,matthew wilder,break my stride,1984,1,29,95,0,0,...,0,0,0,0,0,0,0,0,0,0
4,4,TRAABEV12903CC53A4,suicide commando,blood in face,2000,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Class balance: number of songs per target_success value
platinum_lyrics_df["target_success"].value_counts()

0    32301
1     6793
Name: target_success, dtype: int64

In [4]:
# Total occurrences of every word, summed separately for each target_success
# class (the result has one row per class and one column per word).
#
# Every column that is not a word count is dropped explicitly. The text
# identifier columns in particular must not reach groupby().sum(): older pandas
# versions silently discard them, while newer versions try to "sum" the strings,
# which breaks the numeric row totals computed later in the notebook.
non_word_columns = [
    'Unnamed: 0',                              # unnamed column read from the CSV
    'track_id', 'artist_name', 'song_title',   # text identifiers
    'song_year', 'target_weeks', 'target_peak',
]
success_words_df = (
    platinum_lyrics_df
    .drop(columns=non_word_columns)
    .groupby('target_success')
    .sum()
)
success_words_df.head()

Unnamed: 0_level_0,abov,accept,ach,across,act,action,addict,admit,ador,afraid,...,yeah,year,yellow,yes,yesterday,yet,york,young,yourself,youth
target_success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1602,290,398,1128,1251,500,277,319,266,1358,...,19361,3349,367,4072,703,1384,709,2277,2592,459
1,446,45,102,333,333,98,107,110,108,377,...,9233,831,89,1668,249,264,212,745,842,85


## Add totals and percent frequencies

In [5]:
# Add a total column ("total_wordfreq") and a total row ("total").
#
# The totals are always rebuilt from the raw per-class count rows, so running
# this cell more than once gives the same result. Summing the frame as-is would
# fold a previously added total column / total row (and any derived percentage
# rows) back into the new totals and inflate them.
derived_rows = ["total", "freq_unsuccessful", "freq_successful", "freq_total"]
class_counts = (
    success_words_df
    .drop(index=derived_rows, errors="ignore")
    .drop(columns="total_wordfreq", errors="ignore")
)

# Row-wise sum: number of counted words per class
success_words_df = class_counts.assign(total_wordfreq=class_counts.sum(axis=1))
# Column-wise sum: occurrences of each word across all classes
success_words_df.loc["total"] = success_words_df.sum(axis=0)
success_words_df.head()

Unnamed: 0_level_0,abov,accept,ach,across,act,action,addict,admit,ador,afraid,...,year,yellow,yes,yesterday,yet,york,young,yourself,youth,total_wordfreq
target_success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1602,290,398,1128,1251,500,277,319,266,1358,...,3349,367,4072,703,1384,709,2277,2592,459,2859089
1,446,45,102,333,333,98,107,110,108,377,...,831,89,1668,249,264,212,745,842,85,874048
total,2048,335,500,1461,1584,598,384,429,374,1735,...,4180,456,5740,952,1648,921,3022,3434,544,3733137


In [6]:
# Append one percentage row per count row: each value is that word's share
# (in percent) of the row's own total_wordfreq.
percent_rows = {
    'freq_unsuccessful': 0,   # first count row
    'freq_successful': 1,     # second count row
    'freq_total': 2,          # the "total" row
}
for freq_label, row_position in percent_rows.items():
    count_row = success_words_df.iloc[row_position]
    success_words_df.loc[freq_label] = (count_row / count_row['total_wordfreq']) * 100
success_words_df.head(10)

Unnamed: 0_level_0,abov,accept,ach,across,act,action,addict,admit,ador,afraid,...,year,yellow,yes,yesterday,yet,york,young,yourself,youth,total_wordfreq
target_success,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1602.0,290.0,398.0,1128.0,1251.0,500.0,277.0,319.0,266.0,1358.0,...,3349.0,367.0,4072.0,703.0,1384.0,709.0,2277.0,2592.0,459.0,2859089.0
1,446.0,45.0,102.0,333.0,333.0,98.0,107.0,110.0,108.0,377.0,...,831.0,89.0,1668.0,249.0,264.0,212.0,745.0,842.0,85.0,874048.0
total,2048.0,335.0,500.0,1461.0,1584.0,598.0,384.0,429.0,374.0,1735.0,...,4180.0,456.0,5740.0,952.0,1648.0,921.0,3022.0,3434.0,544.0,3733137.0
freq_unsuccessful,0.056032,0.010143,0.013921,0.039453,0.043755,0.017488,0.009688,0.011157,0.009304,0.047498,...,0.117135,0.012836,0.142423,0.024588,0.048407,0.024798,0.079641,0.090658,0.016054,100.0
freq_successful,0.051027,0.005148,0.01167,0.038099,0.038099,0.011212,0.012242,0.012585,0.012356,0.043133,...,0.095075,0.010183,0.190836,0.028488,0.030204,0.024255,0.085236,0.096333,0.009725,100.0
freq_total,0.05486,0.008974,0.013394,0.039136,0.042431,0.016019,0.010286,0.011492,0.010018,0.046476,...,0.11197,0.012215,0.153758,0.025501,0.044145,0.024671,0.080951,0.091987,0.014572,100.0


In [7]:
# Create final dataframe with word frequencies: one row per word, with the
# rounded percentage and the raw count for each success class.
#
# The helper column total_wordfreq is removed by name rather than by a
# hard-coded column position, so the cell keeps working if the vocabulary
# size changes.
word_stats = success_words_df.drop(columns='total_wordfreq')

joined_df = pd.DataFrame({
    'freq_unsuccessful': word_stats.loc['freq_unsuccessful'].round(2),
    'freq_successful': word_stats.loc['freq_successful'].round(2),
    # Count rows are labelled by their target_success value (0 / 1)
    'count_unsuccessful': word_stats.loc[0],
    'count_successful': word_stats.loc[1],
})
joined_df.index.name = 'words'
# Display only: the sorted view is not assigned back to joined_df
joined_df.sort_index()

Unnamed: 0_level_0,freq_unsuccessful,freq_successful,count_unsuccessful,count_successful
words,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
abov,0.06,0.05,1602.0,446.0
accept,0.01,0.01,290.0,45.0
ach,0.01,0.01,398.0,102.0
across,0.04,0.04,1128.0,333.0
act,0.04,0.04,1251.0,333.0
...,...,...,...,...
yet,0.05,0.03,1384.0,264.0
york,0.02,0.02,709.0,212.0
young,0.08,0.09,2277.0,745.0
yourself,0.09,0.10,2592.0,842.0


In [10]:
# Write the word-frequency table to a local CSV file that can be uploaded to S3
word_freq_csv_path = '../Resources/word_freq.csv'
joined_df.to_csv(word_freq_csv_path)

In [8]:
# Build the SQLAlchemy engine used for the database export.
#
# The URL scheme must be "postgresql://": the legacy "postgres://" dialect name
# is rejected by SQLAlchemy 1.4 and later. The password is percent-encoded so
# that characters such as "@", "/" or ":" cannot corrupt the connection URL.

# Connect to local database:
#db_string = f"postgresql://postgres:{quote(db_pwd, safe='')}@127.0.0.1:5432/Platinum_Lyrics"

# Connect to RDS Database:
db_string = f"postgresql://postgres:{quote(RDS_pwd, safe='')}@platinum-rds.cbu3an3ywyth.us-east-2.rds.amazonaws.com/Platinum_Lyrics"

engine = create_engine(db_string)

In [9]:
# Upload joined_df to the word_freq table in fixed-size batches.
#
# Batch boundaries are derived from the actual number of rows, so no row is
# skipped if the table grows or shrinks.
# NOTE: if_exists='append' adds rows to any existing word_freq table, so
# re-running this cell inserts the same rows again.
CHUNK_SIZE = 100
total_rows = len(joined_df)

for start in range(0, total_rows, CHUNK_SIZE):
    stop = min(start + CHUNK_SIZE, total_rows)
    joined_df.iloc[start:stop].to_sql(name='word_freq', con=engine, index=True,
                                      if_exists='append')
    print(f"Chunk {start}-{stop} exported")
    if stop < total_rows:
        print("---------------------")

print("Export Successful")

Chunk 0-100 exported
---------------------
Chunk 100-200 exported
---------------------
Chunk 200-300 exported
---------------------
Chunk 300-400 exported
---------------------
Chunk 400-500 exported
---------------------
Chunk 500-600 exported
---------------------
Chunk 600-700 exported
---------------------
Chunk 700-800 exported
---------------------
Chunk 800-900 exported
---------------------
Chunk 900-1000 exported
---------------------
Chunk 1000-1100 exported
---------------------
Chunk 1100-1200 exported
---------------------
Chunk 1200-1300 exported
---------------------
Chunk 1300-1400 exported
---------------------
Chunk 1400-1500 exported
---------------------
Chunk 1500-1546 exported
Export Successful
