In [1]:
# Imports
import pandas as pd
import numpy as np
import re
import random

# Options
# Seed Python's `random` module so that the random.shuffle used to re-index
# the rows produces the same order on every fresh Restart & Run All.
random_seed = 42
random.seed(random_seed)

In [2]:
# Load each scraper's CSV output into its own dataframe.
# Paths are relative to the directory the notebook is run from.
df1, df2, df3, df4, df5 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../../data/contemplator_songs.csv',
        '../../data/karaoke_lyrics_songs.csv',
        '../../data/sailor_songs_songs.csv',
        '../../data/shanty_net_songs.csv',
        '../../data/traditional_music_songs.csv',
    )
)

In [3]:
# Stack the five scraper dataframes row-wise into one master dataframe
dataframes = [df1, df2, df3, df4, df5]
shanties = pd.concat(dataframes)  # rows are appended (axis=0 is the default)

shanties.head()

Unnamed: 0.1,Unnamed: 0,title,lyrics
0,0,A Hundred Years Ago,"A hundred years is a very long time,\r\nHo, ye..."
1,1,"Aweigh, Santy Ano","From Boston Town we're bound away,\r\nHeave aw..."
2,2,The Black Ball Line,In the Black Ball Line I serv'd my time\r\nHur...
3,3,Blow the Man Down,"Come all ye young fellows that follow the sea,..."
4,4,"Blow Boys, Blow","A Yankee ship came down the river\r\nBlow, boy..."


In [4]:
# Checking shape (rows, columns) of the combined dataframe
shanties.shape

(846, 3)

In [5]:
# Some lyrics pages were empty, so count the missing values in each column
shanties.isna().sum()

Unnamed: 0    0
title         0
lyrics        6
dtype: int64

In [6]:
# Remove every row that has a missing value, then confirm the new shape
shanties = shanties.dropna()
shanties.shape

(840, 3)

In [7]:
# Cleaning index
# Duplicate songs are dropped by title further down. drop_duplicates keeps the
# first occurrence of each title, so row order decides which copy survives.
# To limit bias toward whichever scraper was concatenated first, each row is
# given a random position before de-duplication.

# One integer per row (0 .. n-1), shuffled in place
random_range = [*range(len(shanties))]
random.shuffle(random_range)
random_range[:10]

[253, 215, 545, 838, 211, 184, 630, 701, 739, 788]

In [8]:
# Attach the shuffled integers as a 'rand' column and promote it to the index
shanties = shanties.assign(rand=random_range).set_index('rand')

In [9]:
# Put the rows in ascending order of their random index, then discard the
# leftover 'Unnamed: 0' column so only 'title' and 'lyrics' remain
shanties = shanties.sort_index().drop(columns=['Unnamed: 0'])

In [10]:
# Removing all punctuation and special characters from both titles and lyrics

# Function to remove all but letters and spaces and lowercase all remaining characters
def formatter(string):
    """Normalize text to lowercase ASCII letters separated by single spaces.

    Every character that is not A-Z, a-z or a space (punctuation, digits,
    line breaks, accented letters, ...) is replaced by a space, so
    "sailor's" becomes "sailor s". Runs of whitespace are then collapsed to
    one space, leading/trailing whitespace is stripped, and the result is
    lowercased.

    Parameters
    ----------
    string : str
        Raw title or lyrics text.

    Returns
    -------
    str
        The cleaned text; an empty string if no letters remain.
    """
    # Raw strings keep the regex escapes (e.g. \s) from being parsed as
    # invalid Python string escape sequences.
    string = re.sub(r'[^A-Za-z ]', ' ', string)
    string = re.sub(r'\s+', ' ', string).strip().lower()
    return string

In [11]:
# Clean the text of both columns with formatter, then preview the result
for text_column in ('title', 'lyrics'):
    shanties[text_column] = shanties[text_column].map(formatter)
shanties.head()

Unnamed: 0_level_0,title,lyrics
rand,Unnamed: 1_level_1,Unnamed: 2_level_1
0,les filles de la rochelle,ah la feuille s envole s envole ah la feuille ...
1,auckland to the bluff,i left the city when just a lad times were har...
2,the cumberland and the merrimack,come all my jolly seamen likewise you landsmen...
3,maid of amsterdam a rovin,in amsterdam i met a maidmark well what i do s...
4,the sailor s loves,the maiden oh the maiden oh the sailor loves t...


In [12]:
# De-duplicate songs on their cleaned title, keeping the first occurrence,
# and report the row count on either side of the operation
rows_before = len(shanties)
print(f'Rows before dropping duplicates: {rows_before}')
shanties = shanties.drop_duplicates(subset='title', keep='first')
rows_after = len(shanties)
print(f'Rows after dropping duplicates:  {rows_after}')

Rows before dropping duplicates: 840
Rows after dropping duplicates:  605


In [13]:
# Replace the random index with a fresh 0..n-1 index; drop=True discards the
# old 'rand' values instead of keeping them as a column
shanties = shanties.reset_index(drop=True)
shanties.shape

(605, 2)

In [14]:
# Some song lyrics pages only had part of a song, or nothing at all.
# Flag lyrics of 50 characters or fewer and keep only the remaining rows.
too_short = shanties['lyrics'].str.len() <= 50
shanties = shanties[~too_short]
shanties.shape

(602, 2)

In [15]:
# Write the cleaned dataframe to CSV. The row index is written as the first
# column (made explicit here; it is also the pandas default).
shanties.to_csv('../../data/shanties_df.csv', index=True)

In [16]:
# Join every song's lyrics, separated by single spaces, into one text corpus
# and save it to disk
corpus_text = ' '.join(shanties['lyrics'])
with open('../../data/shanties_all.txt', 'w') as corpus_file:
    corpus_file.write(corpus_text)