# Collecting, Organizing & Building my Corpus
## Jessica Pasik

## Setup

In [None]:
#Additional modules
import os
import pandas as pd
import re
import json
import requests
from bs4 import BeautifulSoup
import lyricsgenius
from collections import Counter
import nltk
from nltk import Text
nltk.download('stopwords')
from nltk.corpus import stopwords
# Start from NLTK's English stop-word list, then add lyric-sheet section
# labels and artist names that should also be ignored.
stop_words = stopwords.words('english')
sect_stoppers = ['pre-chorus','refrain','chorus','verse','intro','outro','bridge','verse 1','verse 2','verse 3','verse 4','1','2','3','4','Tim McGraw','Faith Hill','Tim McGraw & Faith Hill']
# NOTE(review): process_chart tokenizes with lowercase=True, so capitalised or
# multi-word entries such as 'Tim McGraw' may never match a single token --
# confirm how stop_words is compared in the analysis notebooks.
# extend() adds every marker in one call and does not leave a stray loop
# variable behind in the notebook namespace.
stop_words.extend(sect_stoppers)
# pos tagging
from nltk import pos_tag, pos_tag_sents, FreqDist, ConditionalFreqDist

## Functions

In [2]:
%run functions.ipynb

In [3]:
# Checking stop_words 
stop_words

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

## Description 

This notebook details the process used to collect, organize, and analyze the lyrics of top country music songs by male and female artists of the 1990s and 2010s. By comparing songs across decades and across artist gender, I hope to identify any significant similarities and/or differences in their lyrics. Because no pre-existing chart organized exactly the data I wished to analyze, I created an Excel chart that recorded the decade, song title, artist(s), and gender for each song. Each song on this list was either labeled a top song for its time period by reputable news sources/websites, nominated for an award, or actually won one. The chart also contains a diversity of artists in order to capture an array of lyrical topics/content. Once the chart was finalized, I uploaded it to JupyterHub and transformed the rows of the data frame into separate dictionaries. I then used the Genius API to pull the lyrics for all relevant songs, which allowed me to further separate and organize my data. The corpus draws lyrical data from a combined 80 songs (40 per decade; 20 per gender per decade).

In [4]:
pwd

'/Commjhub/jupyterhub/comm318_fall2019/jpasik123/comm313_S21/comm313_S21_Final_Project/data_analysis'

In [5]:
ls -la

total 305
drwxr-xr-x. 1 jpasik123 jpasik123      0 May  7 12:10 [0m[01;34m.[0m/
drwxr-xr-x. 1 jpasik123 jpasik123      0 May  7 09:33 [01;34m..[0m/
-rw-r--r--. 1 jpasik123 jpasik123 144166 May  3 16:28 Building_and_organizing_corpus.ipynb
-rw-r--r--. 1 jpasik123 jpasik123  70882 May  6 17:57 frequency_ngram_analysis.ipynb
-rw-r--r--. 1 jpasik123 jpasik123  10829 May  7 12:10 functions.ipynb
drwxr-xr-x. 1 jpasik123 jpasik123      0 May  3 13:38 [01;34m.ipynb_checkpoints[0m/
-rw-r--r--. 1 jpasik123 jpasik123  74861 May  7 10:11 keyness_analysis.ipynb
-rw-r--r--. 1 jpasik123 jpasik123   7345 May  7 12:10 kwic_collocation_analysis.ipynb
-rw-r--r--. 1 jpasik123 jpasik123    555 May  6 19:33 partofspeech_tagging_analysis.ipynb
-rw-r--r--. 1 jpasik123 jpasik123   1327 May  6 19:24 README.md


In [6]:
# Load the hand-built song chart exactly as exported (no cleaning yet)
csv_path = '../data/JBPFinalProjectData.csv'
raw_song_df = pd.read_csv(csv_path)
raw_song_df

Unnamed: 0,Decade,Title,Artist,Gender,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11
0,1990s,She's In Love With the Boy,Trisha Yearwood,Female,,,,,,,,
1,1990s,Fancy,Reba McEntire,Female,,,,,,,,
2,1990s,How Do I Live,LeAnn Rimes,Female,,,,,,,,
3,1990s,Independence Day,Martina McBride,Female,,,,,,,,
4,1990s,Man! I Feel Like A Woman!,Shania Twain,Female,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
75,2010s,Drunk on a Plane,Dierks Bentley,Male,,,,,,,,
76,2010s,A Man Don't Have to Die,Brad Paisley,Male,,,,,,,,
77,2010s,Wagon Wheel,Darius Rucker,Male,,,,,,,,
78,2010s,Broken Halos,Chris Stapleton,Male,,,,,,,,


In [7]:
# Keep only the four named columns from the raw chart
keep_columns = ["Decade", "Title", "Artist", "Gender"]
song_df = raw_song_df[keep_columns]
song_df

Unnamed: 0,Decade,Title,Artist,Gender
0,1990s,She's In Love With the Boy,Trisha Yearwood,Female
1,1990s,Fancy,Reba McEntire,Female
2,1990s,How Do I Live,LeAnn Rimes,Female
3,1990s,Independence Day,Martina McBride,Female
4,1990s,Man! I Feel Like A Woman!,Shania Twain,Female
...,...,...,...,...
75,2010s,Drunk on a Plane,Dierks Bentley,Male
76,2010s,A Man Don't Have to Die,Brad Paisley,Male
77,2010s,Wagon Wheel,Darius Rucker,Male
78,2010s,Broken Halos,Chris Stapleton,Male


In [8]:
## one dictionary per data frame row (column name -> cell value)
all_songs = song_df.to_dict("records")

In [9]:
all_songs[0]

{'Artist': 'Trisha Yearwood',
 'Decade': '1990s',
 'Gender': 'Female',
 'Title': "She's In Love With the Boy"}

In [10]:
all_songs[27]

{'Artist': 'Garth Brooks',
 'Decade': '1990s',
 'Gender': 'Male',
 'Title': 'Friends In Low Places'}

In [11]:
all_songs[17]['Artist']

'Faith Hill'

In [12]:
## generating Genius API to retrieve lyrics
# The access token is a credential, so it is read from the environment rather
# than stored in the notebook. Any token that was previously committed in this
# cell should be revoked and regenerated on genius.com.
MY_CLIENT_ACCESS_TOKEN = os.environ.get('GENIUS_ACCESS_TOKEN')
if not MY_CLIENT_ACCESS_TOKEN:
    raise RuntimeError(
        'Set the GENIUS_ACCESS_TOKEN environment variable to your Genius API '
        'client access token before running this cell.'
    )
genius = lyricsgenius.Genius(MY_CLIENT_ACCESS_TOKEN)

For each item in your list `all_songs`:
    * Use the Genius API `search_song` function to get the matching song object, e.g.
        * `song_API = genius.search_song(song['Title'], song['Artist'])`
    * If there is a match add a key-value pair to the dictionary with:
        * a key called `lyrics`
        * where the value is the `str` value for the lyrics, e.g. `song['lyrics'] = song_API.lyrics`
    * If no matching song is found then add the current item to the `error_processing` list

In [13]:
# Songs that Genius could not match are collected here for manual follow-up
error_processing = []

for song in all_songs:
    song_artist = song['Artist']
    song_title = song['Title']
    # Skip songs that already have lyrics so the cell can be re-run
    # without querying Genius again for them
    if song.get('lyrics'):
        continue
    song_API = genius.search_song(song_title, song_artist) #Query Genius for song and return lyrics
    # Test the search result, not the chart entry: `song` is a non-empty dict
    # and therefore always truthy, whereas a failed search yields no object
    # to read `.lyrics` from.
    if song_API is not None:
        song['lyrics'] = song_API.lyrics #Update the current song dictionary with the lyrics
    else:
        #If no results are returned from Genius, add the current item to the `error_processing` list
        error_processing.append(song)

if len(error_processing) > 0:
    print('There were {} items in your chart that genius.com could not find'.format(len(error_processing)))

Searching for "She's In Love With the Boy" by Trisha Yearwood...
Done.
Searching for "Fancy" by Reba McEntire...
Done.
Searching for "How Do I Live" by LeAnn Rimes...
Done.
Searching for "Independence Day" by Martina McBride...
Done.
Searching for "Man! I Feel Like A Woman!" by Shania Twain...
Done.
Searching for "When You Say Nothing At All" by Alison Krauss (& Union Station)...
Done.
Searching for "Wide Open Spaces" by Dixie Chicks...
Done.
Searching for "It Matters To Me" by Faith Hill...
Done.
Searching for "Little Good-Byes" by SHeDAISY...
Done.
Searching for "Single White Female" by Chely Wright...
Done.
Searching for "Strawberry Wine" by Deana Carter...
Done.
Searching for "Maybe It Was Memphis" by Pam Tillis...
Done.
Searching for "Watermelon Crawl" by Tracy Byrd...
Done.
Searching for "I Try to Think About Elvis" by Patty Loveless...
Done.
Searching for "Heads Carolina, Tails California" by Jo Dee Messina...
Done.
Searching for "Alibis" by Tracy Lawrence...
Done.
Searching for

In [14]:
# Split all_songs into four charts, one per decade/gender combination.
female_90s, male_90s, female_2010s, male_2010s = [], [], [], []

# Each (decade, gender) pair points at the list that collects its songs
charts_by_group = {
    ('1990s', 'Female'): female_90s,
    ('1990s', 'Male'): male_90s,
    ('2010s', 'Female'): female_2010s,
    ('2010s', 'Male'): male_2010s,
}

for song in all_songs:
    target = charts_by_group.get((song['Decade'], song['Gender']))
    # Songs whose decade/gender pair is not listed above go nowhere
    if target is not None:
        target.append(song)

In [15]:
female_90s

[{'Artist': 'Trisha Yearwood',
  'Decade': '1990s',
  'Gender': 'Female',
  'Title': "She's In Love With the Boy",
  'lyrics': "[Verse 1]\nKatie's sittin' on her old front porch\nWatchin' the chickens peck the ground\nThere ain't a whole lot goin' on tonight\nIn this one-horse town\n\n[Verse 2]\nOver yonder comin' up the road\nIn a beat-up Chevy truck\nHer boyfriend Tommy\nHe's layin' on the horn\nSplashin' through the mud and the muck\n\n[Bridge]\nHer daddy says he ain't worth a lick\nWhen it comes to brains\nHe got the short end of the stick\nBut Katie's young and man\nShe just don't care\nShe'd follow Tommy anywhere\n\n[Chorus]\nShe's in love with the boy\nShe's in love with the boy\nShe's in love with the boy\nAnd even if they have to run away\nShe's gonna marry that boy someday\n\n[Verse 3]\nKatie and Tommy at the drive-in movie\nParked in the very last row\nThey're too busy holdin' on to one another\nTo even care about the show\n\n[Verse 4]\nLater on outside the Tastee Freeze\nTo

In [16]:
male_2010s

[{'Artist': 'Luke Bryan',
  'Decade': '2010s',
  'Gender': 'Male',
  'Title': 'Drink a Beer',
  'lyrics': "[Verse 1]\nWhen I got the news today\nI didn't know what to say\nSo I just hung up the phone\nI took a walk to clear my head\nThis is where the walking lead\nCan't believe you're really gone\nDon't feel like going home\n\n[Chorus]\nSo I'm gonna sit right here\nOn the edge of this pier\nWatch the sunset disappear\nAnd drink a beer\n\n[Verse 2]\nFunny how the good ones go\nToo soon, but the good Lord knows\nThe reasons why, I guess\nSometimes the greater plan\nIs kinda hard to understand\nRight now it don't make sense\nI can't make it all make sense\n\n[Chorus]\nSo I'm gonna sit right here\nOn the edge of this pier\nWatch the sunset disappear\nAnd drink a beer\n\n[Bridge]\nSo long my friend\n'till we meet again\nI'll remember you\nAnd all the times that we used to...\n\n[Chorus]\nSit right here\nOn the edge of this pier\nWatch the sunset disappear\nAnd drink a beer\nDrink a beer, ye

In [17]:
# Boolean mask marking the rows whose Decade is '1990s', then show those rows
filter_90s = song_df['Decade'].eq('1990s')
song_df.loc[filter_90s]

Unnamed: 0,Decade,Title,Artist,Gender
0,1990s,She's In Love With the Boy,Trisha Yearwood,Female
1,1990s,Fancy,Reba McEntire,Female
2,1990s,How Do I Live,LeAnn Rimes,Female
3,1990s,Independence Day,Martina McBride,Female
4,1990s,Man! I Feel Like A Woman!,Shania Twain,Female
5,1990s,When You Say Nothing At All,Alison Krauss (& Union Station),Female
6,1990s,Wide Open Spaces,Dixie Chicks,Female
7,1990s,It Matters To Me,Faith Hill,Female
8,1990s,Little Good-Byes,SHeDAISY,Female
9,1990s,Single White Female,Chely Wright,Female


In [18]:
# Boolean mask marking the rows whose Decade is '2010s', then show those rows
filter_2010s = song_df['Decade'].eq('2010s')
song_df.loc[filter_2010s]

Unnamed: 0,Decade,Title,Artist,Gender
40,2010s,Peter Pan,Kelsea Ballerini,Female
41,2010s,Blown Away,Carrie Underwood,Female
42,2010s,Mine,Taylor Swift,Female
43,2010s,Heart Like Mine,Miranda Lambert,Female
44,2010s,Burning House,Cam,Female
45,2010s,My Church,Maren Morris,Female
46,2010s,The Joke,Brandi Carlile,Female
47,2010s,Girl Goin' Nowhere,Ashley McBryde,Female
48,2010s,Follow Your Arrow,Kacey Musgraves,Female
49,2010s,Road Less Traveled,Lauren Alaina,Female


In [19]:
# Per-decade charts that combine male and female artists

all_90s = []
all_2010s = []

for song in all_songs:
    decade = song['Decade']
    if decade == '1990s':
        all_90s.append(song)
    elif decade == '2010s':
        all_2010s.append(song)

In [20]:
len(all_90s) # showing 40 songs total for that decade 

40

In [21]:
len(all_2010s) # showing 40 songs total for that decade 

40

In [22]:
## Per-gender charts that combine both decades

all_female = []
all_male = []

for song in all_songs:
    gender = song['Gender']
    if gender == 'Female':
        all_female.append(song)
    elif gender == 'Male':
        all_male.append(song)

In [23]:
len(all_female)

40

In [24]:
len(all_male)

40

In [25]:
all_female[1]['Title']

'Fancy'

The cells below define helper functions for cleaning and tokenizing the lyrics, then save all of the filtered charts created above as JSON files. 

In [26]:
# Punctuation characters handed to tokenize() as `strip_chars` in process_chart
char_to_strip = '.,!][?;$"-()'

In [47]:
def strip_section_markers(text):
    """Return `text` without the lines that begin with '['.

    The text is split on newline characters, every line whose first
    character is '[' (e.g. "[Chorus]") is dropped, and the remaining
    lines are joined back together with newlines.
    """
    return '\n'.join(
        row for row in text.split('\n') if row[:1] != '['
    )

In [48]:
def process_chart(songs):
    """Add a 'tokens' entry to every song dictionary in `songs`, in place.

    For each song, the section-marker lines are removed from its 'lyrics'
    text and the result is passed to tokenize() with lowercasing enabled
    and `char_to_strip` as the characters to strip. Nothing is returned.
    """
    # NOTE(review): tokenize is not defined in this notebook; presumably it
    # comes from functions.ipynb via %run -- confirm.
    for entry in songs:
        lyrics_only = strip_section_markers(text=entry['lyrics'])
        entry['tokens'] = tokenize(
            lyrics_only, lowercase=True, strip_chars=char_to_strip
        )

In [49]:
# Saving `all_songs` and every filtered chart as its own JSON file.
# One loop replaces nine copy-pasted cells, so the file name always
# matches the chart variable it was written from.
charts_to_save = {
    'all_songs': all_songs,
    'all_90s': all_90s,
    'all_2010s': all_2010s,
    'all_female': all_female,
    'all_male': all_male,
    'female_90s': female_90s,
    'male_90s': male_90s,
    'female_2010s': female_2010s,
    'male_2010s': male_2010s,
}

# Make sure the output folder exists so the notebook runs on a fresh checkout
os.makedirs('../data/charts', exist_ok=True)

for chart_name, chart_songs in charts_to_save.items():
    with open('../data/charts/{}.json'.format(chart_name), 'w') as out:
        json.dump(chart_songs, out, indent=4)

In [66]:
#Pointing to list of pilot data files (all the JSON files mentioned above)
# 'all_charts.json' is written into this same folder further down; it holds a
# dictionary of charts rather than a list of songs, so it is skipped here to
# keep a re-run from feeding it to process_chart. Sorting makes the order
# independent of how the file system lists the directory.
chart_files = sorted(
    f for f in os.listdir('../data/charts')
    if f.endswith('.json') and f != 'all_charts.json'
)

In [67]:
#Creating mass dictionary that runs through all files
# Keys are the file names without their '.json' suffix; values are the parsed
# contents of each file.
chart_dict = {}
for chart in chart_files:
    # The with-block closes each file as soon as it has been parsed
    with open('../data/charts/{}'.format(chart)) as chart_file:
        chart_dict[chart[:-5]] = json.load(chart_file)

In [68]:
# Tokenize the lyrics of every chart loaded into chart_dict
# (process_chart adds a 'tokens' entry to each song in place)
for chart, chart_songs in chart_dict.items():
    process_chart(chart_songs)

In [69]:
# Write the whole chart dictionary to a single combined JSON file
with open('../data/charts/all_charts.json', 'w') as combined_file:
    json.dump(chart_dict, combined_file, indent=4)

In [70]:
## number of songs stored under each chart name
for chart in chart_dict:
    print(chart, len(chart_dict[chart]))

all_2010s 40
all_90s 40
all_female 40
all_male 40
all_songs 80
female_2010s 20
female_90s 20
male_2010s 20
male_90s 20


Now, I am going to run code that will clean up the lyrical data and make it easier for analysis. 

In [71]:
chart_dict['all_songs'][0].keys()

dict_keys(['Decade', 'Title', 'Artist', 'Gender', 'lyrics', 'tokens'])

In [72]:
chart_dict['all_songs'][0]['tokens']

["katie's",
 "sittin'",
 'on',
 'her',
 'old',
 'front',
 'porch',
 "watchin'",
 'the',
 'chickens',
 'peck',
 'the',
 'ground',
 'there',
 "ain't",
 'a',
 'whole',
 'lot',
 "goin'",
 'on',
 'tonight',
 'in',
 'this',
 'onehorse',
 'town',
 'over',
 'yonder',
 "comin'",
 'up',
 'the',
 'road',
 'in',
 'a',
 'beatup',
 'chevy',
 'truck',
 'her',
 'boyfriend',
 'tommy',
 "he's",
 "layin'",
 'on',
 'the',
 'horn',
 "splashin'",
 'through',
 'the',
 'mud',
 'and',
 'the',
 'muck',
 'her',
 'daddy',
 'says',
 'he',
 "ain't",
 'worth',
 'a',
 'lick',
 'when',
 'it',
 'comes',
 'to',
 'brains',
 'he',
 'got',
 'the',
 'short',
 'end',
 'of',
 'the',
 'stick',
 'but',
 "katie's",
 'young',
 'and',
 'man',
 'she',
 'just',
 "don't",
 'care',
 "she'd",
 'follow',
 'tommy',
 'anywhere',
 "she's",
 'in',
 'love',
 'with',
 'the',
 'boy',
 "she's",
 'in',
 'love',
 'with',
 'the',
 'boy',
 "she's",
 'in',
 'love',
 'with',
 'the',
 'boy',
 'and',
 'even',
 'if',
 'they',
 'have',
 'to',
 'run',
 'a

In [73]:
len(chart_dict['female_90s']) ##length of each chart is 20 songs (per decade/gender)

20

In [74]:
len(chart_dict['male_90s'])##length of each chart is 20 songs (per decade/gender)

20

In [75]:
len(chart_dict['male_2010s']) ##length of each chart is 20 songs (per decade/gender)

20

In [76]:
len(chart_dict['female_2010s']) ##length of each chart is 20 songs (per decade/gender)

20

In [77]:
len(chart_dict['all_female'])##length of all female songs 

40

In [78]:
len(chart_dict['all_male']) ## length of all male songs 

40

In this notebook, I have printed every step I took to collect, organize and build my corpus for this final project. I hope you found this helpful in giving some background and insight on my thought processes and coding efforts when working on this project!