In [1]:
# Imports and Options
import pandas as pd
import numpy as np
import re
import random

# Seed value intended for the randomised steps of this notebook
random_seed = 42

In [2]:
# Loading one dataframe per scraper from its csv output (paths are relative
# to this notebook). The frames carry an 'Unnamed: 0' column, presumably the
# index saved by the scraper; it is dropped later in the notebook.
scraper_csvs = [
    '../../data/contemplator_songs.csv',
    '../../data/karaoke_lyrics_songs.csv',
    '../../data/sailor_songs_songs.csv',
    '../../data/shanty_net_songs.csv',
    '../../data/traditional_music_songs.csv',
]
df1, df2, df3, df4, df5 = (pd.read_csv(csv_path) for csv_path in scraper_csvs)

In [3]:
# Stacking the five scraper frames on top of each other into one frame.
# Row labels are not renumbered here, so they repeat across the sources.
dataframes = [df1, df2, df3, df4, df5]

shanties = pd.concat(objs=dataframes, axis='index')
shanties.head(n=5)

Unnamed: 0.1,Unnamed: 0,title,lyrics
0,0,A Hundred Years Ago,"A hundred years is a very long time,\nHo, yes,..."
1,1,"Aweigh, Santy Ano","From Boston Town we're bound away,\nHeave awei..."
2,2,The Black Ball Line,In the Black Ball Line I serv'd my time\nHurra...
3,3,Blow the Man Down,"Come all ye young fellows that follow the sea,..."
4,4,"Blow Boys, Blow","A Yankee ship came down the river\nBlow, boys,..."


In [4]:
# Checking shape of the merged frame as (rows, columns)
shanties.shape

(846, 3)

In [5]:
# Some lyrics pages were empty, so counting the missing values in each column
shanties.isna().sum()

Unnamed: 0    0
title         0
lyrics        6
dtype: int64

In [6]:
# Removing every row that has a missing value in any column
shanties.dropna(axis='index', how='any', inplace=True)
shanties.shape

(840, 3)

In [7]:
# Cleaning index
# Duplicate songs will be dropped by title further down; for each title only
# the first occurrence in row order is kept and the later ones are dropped.
# In order to limit bias towards whichever scraper was concatenated first,
# the dataframe will be reindexed with a shuffled number and sorted by it.

# Creating a shuffled list of row positions. A dedicated generator seeded with
# random_seed makes the shuffle reproducible, and re-running this cell alone
# always yields the same order (the module-level random state is not touched).
random_range = list(range(shanties.shape[0]))
random.Random(random_seed).shuffle(random_range)
random_range[:10]

[835, 364, 336, 786, 368, 378, 637, 468, 4, 413]

In [8]:
# Using the shuffled list as the row index, named 'rand'
shuffled_index = pd.Index(random_range, name='rand')
shanties.set_index(shuffled_index, inplace=True)

In [9]:
# Putting the rows into the shuffled order given by the 'rand' index
shanties.sort_index(inplace=True)

# Dropping the leftover 'Unnamed: 0' column so only title and lyrics remain.
# `columns=` already selects the axis, so no `axis` argument is passed.
shanties.drop(columns=['Unnamed: 0'], inplace=True)

In [10]:
# Removing all punctuation and special characters from both titles and lyrics

# Function to remove all but letters and spaces and lowercase all remaining characters
def formatter(string):
    """Reduce text to lowercase ASCII letters separated by single spaces.

    Every character other than A-Z, a-z or a space is replaced by a space
    (so an apostrophe splits a word: "it's" -> "it s"), runs of whitespace
    are collapsed to one space, surrounding whitespace is stripped and the
    result is lowercased.

    Parameters
    ----------
    string : str
        Raw title or lyrics text.

    Returns
    -------
    str
        The cleaned text; empty if the input holds no ASCII letters.
    """
    string = re.sub(r'[^A-Za-z ]', ' ', string)
    # Raw string literal: '\s' in a plain literal is an invalid escape
    # sequence and raises a warning on current Python versions.
    string = re.sub(r'\s+', ' ', string).strip().lower()
    return string

In [11]:
# Running every title and every lyric through formatter
for text_column in ('title', 'lyrics'):
    shanties[text_column] = shanties[text_column].map(formatter)
shanties.head()

Unnamed: 0_level_0,title,lyrics
rand,Unnamed: 1_level_1,Unnamed: 2_level_1
0,three score and ten,refrain and it s three score and ten boys and ...
1,bully in the alley,help me bob i m bully in the alleywey hey bull...
2,farewell to tarwathie,farewell to tarwathie adieu mormond hill and t...
3,captain tall,captain tall best man i met lively lads look l...
4,the dreadnought,there s a saucy wild packet and a packet of fa...


In [12]:
# Checking shape before dropping duplicates, to compare with the count afterwards
shanties.shape

(840, 2)

In [13]:
# Dropping duplicate songs by title; the first row of each title
# (in the current shuffled order) is the one that is kept
shanties.drop_duplicates(subset=['title'], keep='first', inplace=True)
shanties.shape

(605, 2)

In [14]:
# Resetting Index
# drop=True discards the old 'rand' index instead of turning it into a column
# that then has to be removed; this also keeps the cell safe to re-run.
shanties.reset_index(drop=True, inplace=True)
shanties.shape

(605, 2)

In [15]:
# Some song lyrics pages only had parts of a song or nothing.
# Searching for very short lyrics (50 characters or fewer) and dropping them

too_short = shanties['lyrics'].str.len() <= 50
shanties.drop(index=shanties[too_short].index, inplace=True)
shanties.shape

(602, 2)

In [16]:
# Exporting the cleaned titles and lyrics to csv
# NOTE(review): index=False is not passed, so the row index is written as well
# and comes back as an extra unnamed column when the file is re-read — confirm
# whether downstream readers rely on that before changing it.
shanties.to_csv('../../data/shanties_df.csv')

In [18]:
# Joining every song's lyrics with single spaces and saving the result
# as one large text corpus
with open('../../data/shanties_all.txt', 'w') as corpus_file:
    all_lyrics = ' '.join(shanties['lyrics'])
    corpus_file.write(all_lyrics)