# Dolly Parton NLP Exploratory Data Analysis

In [55]:
# project setup

import os
import re
import pandas as pd

import spacy
import nltk
import seaborn as sns

from spacy.lang.en import English

# Module-level spaCy English pipeline; clean_lyrics() calls it to tokenise text.
NLP = English()

In [None]:
# Substrings that mark a line as metadata (songwriter credits, section labels,
# copyright notices) rather than lyric text; consulted by filter_lyrics().
EXCLUDE_WORDS = [
    'Dolly Parton',
    'Frank Daycus',
    'Rachel Dennison',
    'Randy Parton',
    'Jimmie Rodgers',
    'Bill Owens',
    'George Morgan',
    'Ernie Ford',
    "O'Hearn",
    'Jean Ritchie',
    'Music Pub Co.',
    'verse',
    'chorus',
    '©',
    'copyright',
    'Myra Brooks Welch',
    'R E S P O N S I B I L I T Y',
]

In [53]:
# import data

path = '/Users/laraehrenhofer/Documents/Coding_Projects/git_repos/dolly-parton-song-generator/corpus/dolly-parton'

# os.listdir() returns entries in arbitrary order; sorting keeps `files`,
# `songnames` and `lyrics` in the same, reproducible order on every run.
files = sorted(file for file in os.listdir(path) if file.startswith('dolly-parton_'))

# One re.findall() result (a list of matches) per file name. The dot is escaped
# so that only a literal ".txt" extension is matched, not any character.
songnames = [re.findall(r'(\w+)\.txt', file) for file in files]

# Full text of each song file, in the same order as `files`.
lyrics = []
for file in files:
    # Explicit encoding: the exclusion list filters on '©', so the corpus
    # presumably contains non-ASCII text and should not depend on the
    # platform's default encoding.
    with open(os.path.join(path, file), encoding='utf-8') as f:
        text = f.read()
    lyrics.append(text)

## Cleanup

In [56]:
def filter_lyrics(line, exclude_words=None):
    '''
    Decide whether a line from a song file is lyric text that should be kept.

    Returns False (drop the line) when the line
    - is empty or contains only whitespace,
    - contains "[" (stage directions like "[chorus]"), or
    - contains any of the excluded substrings (e.g. the name of the songwriter).
    Returns True (keep the line) otherwise.

    line: a single line of text (str)
    exclude_words: optional iterable of substrings to reject; defaults to the
        module-level EXCLUDE_WORDS

    Matching is a case-sensitive substring test, so an excluded entry such as
    "verse" also rejects lines containing e.g. "universe".
    '''
    if exclude_words is None:
        exclude_words = EXCLUDE_WORDS

    # Blank separator lines carry no lyrics. (The previous check compared the
    # string against the list [' '], which can never be equal.)
    if not line.strip():
        return False

    if '[' in line:
        return False

    return not any(word in line for word in exclude_words)

In [57]:
def clean_lyrics(raw):
    '''
    Text preprocessing: turn a string of lyrics into a list of lowercase words.

    1. tokenise the text with the module-level spaCy pipeline NLP
    2. drop punctuation tokens and whitespace tokens
    3. lowercase every remaining token

    Newlines are not replaced here; any whitespace token (extra spaces,
    newlines, tabs) is simply dropped from the result.

    raw: the lyrics as a single string
    returns: list of lowercase token strings
    '''
    doc = NLP(raw)
    # is_space covers whitespace tokens of any length, not only a single ' ',
    # so runs of several spaces or stray newlines do not end up in the output.
    return [
        token.lower_
        for token in doc
        if not (token.is_punct or token.is_space)
    ]

In [None]:
# The file handle `f` from the import cell is closed as soon as its `with`
# block ends, so reading from it here would raise ValueError. Use `text`
# (the contents of the last file loaded by that cell) instead.
# Keep only non-empty lines accepted by filter_lyrics(), trimmed and joined
# into one space-separated string.
raw = ' '.join(
    line.strip()
    for line in text.split('\n')
    if line and filter_lyrics(line)
)

## Basic questions

First step of exploration:
- How many songs are in this corpus?
- How many words are in each song?
- What is the distribution of song lengths?

In [48]:
# Number of words per song. Each entry of `lyrics` is a string, so len(song)
# would count characters; split on whitespace to count words instead.
lengths = [len(song.split()) for song in lyrics]

In [49]:
# Echo the corpus so the loaded song texts can be inspected in the cell output.
lyrics

["Send me the pillow that you dream on Don't you know that I still care for you? Send me the pillow that you dream on So darling, I can dream on it too  Each night while I'm sleeping, oh so lonely I share your love in dreams that once were true Send me the pillow that you dream on So darling, I can dream on it too  I've waited so long for you to write me But just a memory's all that's left of you Send me the pillow that you dream on So darling, I can dream on it too So darling, I can dream on it too",
 'Bows and flows of angel hair and ice cream castles in the air And feather canyons everywhere, I\'ve looked at clouds that way But now they only block the sun they rain and snow on everyone So many things I would have done, but clouds got in my way  I\'ve looked at clouds from both sides now From up and down and still somehow It\'s cloud\'s illusions I recall I really don\'t know clouds at all  Moons and Junes and Ferris wheels the dizzy dancing way you feel As every fairy tale comes rea