## Part-of-Speech Analysis

### In this notebook, you will find:
- Loaded corpora from JSON files of various song dictionaries 
- Detailed text analysis of lyrics, separated by section headers; since part-of-speech tagging does not play a primary role in my analysis, I have decided to only include code for the comprehensive data charts for each decade and gender
- Part-of-speech analysis is used to evaluate the distribution of lyrics as nouns, adjectives and verbs

In [11]:
%run functions.ipynb

In [28]:
%run frequency_ngram_analysis.ipynb

[nltk_data] Downloading package stopwords to /Commjhub/jupyterhub/comm
[nltk_data]     318_fall2019/jpasik123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 50 words in your `all_90s` corpus
[('love', 108), ("i'm", 75), ('know', 74), ('oh', 70), ('yeah', 56), ('like', 55), ('got', 50), ('get', 47), ('let', 47), ('little', 45), ('wanna', 44), ("ain't", 42), ('one', 38), ('never', 38), ('way', 37), ('tell', 36), ('girl', 35), ("i've", 35), ('take', 34), ('ya', 34), ('come', 33), ('say', 33), ('go', 33), ('boy', 32), ('said', 32), ('make', 32), ('baby', 30), ('heart', 30), ('away', 29), ('gonna', 28), ('think', 28), ('well', 27), ('right', 27), ('maria', 27), ('time', 26), ('man', 25), ('ever', 25), ('feel', 25), ('want', 24), ('night', 22), ('would', 22), ('knows', 22), ('world', 21), ('hey', 21), ("can't", 21), ('kiss', 21), ('maybe', 19), ('really', 19), ('passionate', 19), ('kisses', 19)]
Top 50 # of songs each type occurs in your `all_90s` corpus
[('know', 22), ("i'm", 21), ('like', 20), ('love', 19), ('oh', 19), ('night', 19), ('one', 18), ('got', 18), ('said', 17), ('never', 17), ('get', 16), ("ain't", 15), ('right', 15), ("i've", 

Top 50 words in your `all_male` corpus
[("i'm", 114), ('like', 98), ('love', 78), ('know', 70), ('yeah', 60), ('got', 57), ('wanna', 57), ('get', 55), ("ain't", 55), ('oh', 51), ('baby', 51), ('ya', 48), ('back', 46), ('make', 44), ('little', 43), ('girl', 40), ('never', 39), ('think', 38), ('take', 37), ('right', 37), ('oooh', 36), ('one', 35), ('see', 34), ('heart', 33), ('go', 33), ('gonna', 33), ('come', 32), ('tell', 31), ('rock', 30), ('night', 29), ("can't", 29), ("i've", 28), ('way', 28), ('mama', 28), ("'em", 28), ('need', 28), ('maria', 27), ('hey', 26), ('say', 25), ('man', 25), ('around', 25), ("i'll", 24), ('whiskey', 23), ('world', 23), ('beautiful', 23), ('let', 22), ('away', 21), ('time', 21), ('good', 21), ('would', 20)]
Top 50 # of songs each type occurs in your `all_male` corpus
[('know', 24), ("i'm", 24), ('like', 22), ('night', 19), ('get', 19), ('got', 19), ('go', 18), ('never', 17), ('yeah', 17), ("ain't", 17), ('right', 16), ('love', 15), ('oh', 14), ('take', 14

Top 50 words in your `male_2010s` corpus
[('like', 72), ("i'm", 57), ('back', 44), ('baby', 40), ('oooh', 36), ('yeah', 31), ('need', 28), ('rock', 28), ('got', 27), ('right', 27), ("ain't", 27), ('little', 26), ("'em", 26), ('mama', 26), ('make', 25), ('think', 25), ('know', 24), ('get', 24), ('gonna', 23), ('go', 23), ('wanna', 23), ('see', 21), ('hey', 20), ('dirt', 19), ('one', 19), ('good', 18), ('way', 17), ('whiskey', 17), ('used', 16), ('road', 16), ("i'ma", 16), ('tequila', 16), ('around', 15), ('night', 15), ('hell', 15), ('free', 15), ('drink', 14), ('take', 14), ("'cause", 14), ('man', 14), ('ya', 14), ('crazy', 14), ('always', 14), ('country', 13), ('never', 13), ('glasses', 13), ('drunk', 13), ("can't", 12), ('feel', 12), ('shine', 12)]
Top 50 # of songs each type occurs in your `male_2010s` corpus
[('go', 12), ('like', 12), ('back', 12), ("i'm", 11), ('yeah', 11), ('know', 11), ('get', 11), ("ain't", 10), ('got', 9), ('baby', 9), ('right', 8), ('take', 8), ('way', 8), ('

## Additional Modules

In [12]:
#Additional modules
import os
import pandas as pd
import re
import json
import requests
from bs4 import BeautifulSoup
import lyricsgenius
from collections import Counter
import nltk
from nltk import Text
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
sect_stoppers = ['pre-chorus','refrain','chorus','verse','intro','outro','bridge','verse 1','verse 2','verse 3','verse 4','1','2','3','4','Tim McGraw','Faith Hill','Tim McGraw & Faith Hill']
for x in sect_stoppers:
    stop_words.append(x)
# pos tagging
from nltk import pos_tag, pos_tag_sents, FreqDist, ConditionalFreqDist

[nltk_data] Downloading package stopwords to /Commjhub/jupyterhub/comm
[nltk_data]     318_fall2019/jpasik123/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
char_to_strip = '.,!][?;$"-()'

In [14]:
all_charts = json.load(open('../data/charts/all_charts.json'))

## Part of Speech Tagging 

## All 2010s

In [16]:
# tagging `all_2010s`

all_2010s_tagged = []

for song in range(len(all_charts['all_2010s'])):
    txt = tokenize((all_charts['all_2010s'][song]['lyrics']), lowercase=True, strip_chars=char_to_strip)
    all_2010s_tagged.append(nltk.pos_tag(txt))

### Verbs

In [17]:
verb_2010s = []
for song in range(len(all_2010s_tagged)):
    for tags in range(len(all_2010s_tagged[song])):
        if all_2010s_tagged[song][tags][1].startswith('V'):
            verb_2010s.append(all_2010s_tagged[song][tags][0])

In [18]:
# all 2010s verb frequency
verb_2010s_freq = Counter(verb_2010s)
verb_2010s_freq.most_common(20)

[('i', 77),
 ('be', 73),
 ('get', 60),
 ('got', 57),
 ('know', 46),
 ('is', 46),
 ("don't", 45),
 ('take', 40),
 ("ain't", 36),
 ('see', 34),
 ('make', 34),
 ('go', 34),
 ('have', 32),
 ('hope', 32),
 ('think', 31),
 ('been', 30),
 ('need', 30),
 ("i'm", 29),
 ("it's", 27),
 ('do', 27)]

### Adjectives

In [19]:
# picking adjectives for 2010s
adj_2010s = []
for song in range(len(all_2010s_tagged)):
    for tags in range(len(all_2010s_tagged[song])):
        if all_2010s_tagged[song][tags][1].startswith('J'):
            adj_2010s.append(all_2010s_tagged[song][tags][0])

In [20]:
# all 2010s adj frequency
adj_2010s_freq = Counter(adj_2010s)
adj_2010s_freq.most_common(20)

[('i', 94),
 ('little', 62),
 ("i'm", 35),
 ('good', 33),
 ('free', 22),
 ('same', 18),
 ('whiskey', 16),
 ('stronger', 16),
 ('wanna', 16),
 ('old', 15),
 ("i'll", 15),
 ('verse', 14),
 ("don't", 14),
 ('less', 14),
 ('oh', 14),
 ('more', 14),
 ('high', 14),
 ('yeah', 13),
 ('new', 13),
 ('afraid', 12)]

### Observations:

- Interesting combination of verbs: "make", "take", "hope", "think", "have", "need"
- More variability with adjectives: "little", "stronger", "whiskey", "high"

In [29]:
# what percentage of all tokens in 2010s corpus are verbs/adjectives
## divide sum of each counter by sum of total tokens counter

verb_2010s_perc = (sum(verb_2010s_freq.values()) / sum(word_freq_2010s.values()))* 100
adj_2010s_perc = (sum(adj_2010s_freq.values()) / sum(word_freq_2010s.values()))* 100

print('{} percent of the tokens in all 2010s songs were verbs'.format(verb_2010s_perc ))
print('{} percent of the tokens in all 2010s songs were adjectives'.format(adj_2010s_perc))


39.8196270253745 percent of the tokens in all 2010s songs were verbs
16.70742892081932 percent of the tokens in all 2010s songs were adjectives


## All 1990s 

In [22]:
all_90s_tagged = []

for song in range(len(all_charts['all_90s'])):
    txt_90s = tokenize((all_charts['all_90s'][song]['lyrics']), lowercase=True, strip_chars=char_to_strip)
    all_90s_tagged.append(nltk.pos_tag(txt_90s))
    

### Verbs

In [23]:
verb_90s = []
for song in range(len(all_90s_tagged)):
    for tags in range(len(all_90s_tagged[song])):
        if all_90s_tagged[song][tags][1].startswith('V'):
            verb_90s.append(all_90s_tagged[song][tags][0])


In [24]:
verb_90s_freq = Counter(verb_90s)
verb_90s_freq.most_common(20)

[('i', 112),
 ('do', 75),
 ('was', 75),
 ('know', 70),
 ('be', 69),
 ('love', 60),
 ("don't", 52),
 ('got', 50),
 ('is', 49),
 ('get', 43),
 ('let', 42),
 ('have', 36),
 ('been', 36),
 ("it's", 34),
 ('say', 33),
 ('take', 33),
 ('go', 33),
 ('come', 32),
 ('said', 32),
 ('tell', 32)]

### Adjectives 

In [25]:
adj_90s = []
for song in range(len(all_90s_tagged)):
    for tags in range(len(all_90s_tagged[song])):
        if all_90s_tagged[song][tags][1].startswith('J'):
            adj_90s.append(all_90s_tagged[song][tags][0])



In [26]:
adj_90s_freq = Counter(adj_90s)
adj_90s_freq.most_common(20)

[('i', 84),
 ('little', 45),
 ('oh', 22),
 ("she's", 17),
 ('right', 15),
 ('beautiful', 15),
 ('verse', 14),
 ('best', 13),
 ('goodbyes', 13),
 ('last', 12),
 ("it's", 12),
 ("i'm", 12),
 ('old', 11),
 ('good', 11),
 ('big', 11),
 ('sweet', 11),
 ('whole', 10),
 ('high', 10),
 ('wrong', 10),
 ('true', 10)]

### Observations:

- As expected, "love" occurred as a verb most frequently in the 1990s chart
- "Little" appeared as adjective in both the 2010s and 1990s charts
- More variety in adjectives from the 1990s: "beautiful", "old", "sweet", "wrong", "goodbyes", "best", "last"

In [30]:
## what percentage of all tokens in 90s corpus are verbs/adjectives
## divide sum of each counter by sum of total tokens counter

verb_90s_perc = (sum(verb_90s_freq.values()) / sum(word_freq_90s.values()))* 100
adj_90s_perc = (sum(adj_90s_freq.values()) / sum(word_freq_90s.values()))* 100

print('{} percent of the tokens in all 90s songs were verbs'.format(verb_90s_perc))
print('{} percent of the tokens in all 90s songs were adjectives'.format(adj_90s_perc))

42.479946524064175 percent of the tokens in all 90s songs were verbs
17.34625668449198 percent of the tokens in all 90s songs were adjectives


## All Female

In [31]:
all_female_tagged = []

for song in range(len(all_charts['all_female'])):
    txt_female = tokenize((all_charts['all_female'][song]['lyrics']), lowercase=True, strip_chars=char_to_strip)
    all_female_tagged.append(nltk.pos_tag(txt_female)) 

### Verbs 

In [33]:
verb_female = []
for song in range(len(all_female_tagged)):
    for tags in range(len(all_female_tagged[song])):
        if all_female_tagged[song][tags][1].startswith('V'):
            verb_female.append(all_female_tagged[song][tags][0])

In [36]:
verb_female_freq = Counter(verb_female)
verb_female_freq.most_common(20)

[('i', 102),
 ('be', 82),
 ('was', 66),
 ('do', 57),
 ('got', 50),
 ('know', 50),
 ('get', 48),
 ("don't", 42),
 ('have', 42),
 ('let', 39),
 ('take', 37),
 ('go', 35),
 ('said', 34),
 ('hope', 34),
 ('been', 33),
 ("it's", 32),
 ('is', 32),
 ("ain't", 29),
 ('had', 27),
 ('feel', 25)]

### Adjectives

In [35]:
adj_female = []
for song in range(len(all_female_tagged)):
    for tags in range(len(all_female_tagged[song])):
        if all_female_tagged[song][tags][1].startswith('J'):
            adj_female.append(all_female_tagged[song][tags][0])

In [37]:
adj_female_freq = Counter(adj_female)
adj_female_freq.most_common(20)

[('i', 101),
 ('little', 64),
 ('oh', 24),
 ('good', 23),
 ('best', 18),
 ('stronger', 16),
 ('high', 14),
 ('right', 14),
 ('yeah', 14),
 ('less', 14),
 ('last', 13),
 ('goodbyes', 13),
 ("i'm", 13),
 ("don't", 12),
 ('first', 12),
 ('verse', 11),
 ('wrong', 10),
 ('white', 10),
 ("it's", 10),
 ('free', 10)]

### Observations:

- Variety in adjectives in songs written by female artists
- No standout verb

In [45]:
verb_female_perc = (sum(verb_female_freq.values()) / sum(f_word_freq.values()))* 100
adj_female_perc = (sum(adj_female_freq.values()) / sum(f_word_freq.values()))* 100

print('{} percent of the tokens in all female songs were verbs'.format(verb_female_perc))
print('{} percent of the tokens in all female songs were adjectives'.format(adj_female_perc))

43.145624692168774 percent of the tokens in all female songs were verbs
17.271384009193895 percent of the tokens in all female songs were adjectives


## All Male

In [40]:
all_male_tagged = []

for song in range(len(all_charts['all_male'])):
    txt_male = tokenize((all_charts['all_male'][song]['lyrics']), lowercase=True, strip_chars=char_to_strip)
    all_male_tagged.append(nltk.pos_tag(txt_male)) 

### Verbs

In [41]:
verb_male = []
for song in range(len(all_male_tagged)):
    for tags in range(len(all_male_tagged[song])):
        if all_male_tagged[song][tags][1].startswith('V'):
            verb_male.append(all_male_tagged[song][tags][0])

In [42]:
verb_male_freq = Counter(verb_male)
verb_male_freq.most_common(20)

[('i', 87),
 ('know', 66),
 ('is', 63),
 ('be', 60),
 ('got', 57),
 ("don't", 55),
 ('get', 55),
 ('love', 48),
 ('do', 45),
 ('make', 44),
 ('think', 37),
 ('take', 36),
 ('been', 33),
 ('see', 33),
 ("ain't", 33),
 ('go', 32),
 ('come', 31),
 ('was', 30),
 ('tell', 30),
 ("i'm", 30)]

### Adjectives

In [43]:
adj_male = []
for song in range(len(all_male_tagged)):
    for tags in range(len(all_male_tagged[song])):
        if all_male_tagged[song][tags][1].startswith('J'):
            adj_male.append(all_male_tagged[song][tags][0])

In [44]:
adj_male_freq = Counter(adj_male)
adj_male_freq.most_common(20)

[('i', 77),
 ('little', 43),
 ("i'm", 34),
 ('good', 21),
 ('more', 21),
 ('old', 18),
 ('free', 18),
 ('verse', 17),
 ('same', 15),
 ("i'll", 15),
 ('beautiful', 15),
 ('wanna', 15),
 ('new', 14),
 ('whiskey', 14),
 ('oh', 12),
 ("don't", 11),
 ('true', 11),
 ('right', 10),
 ('high', 10),
 ("ain't", 10)]

### Observations:

- "Beautiful" and "whiskey" are top adjectives among songs written by male artists
- "Love" is standout verb for `all_male` chart

In [47]:
verb_male_perc = (sum(verb_male_freq.values()) / sum(m_word_freq.values()))* 100
adj_male_perc = (sum(adj_male_freq.values()) / sum(m_word_freq.values()))* 100

print('{} percent of the tokens in all male songs were verbs'.format(verb_male_perc))
print('{} percent of the tokens in all male songs were adjectives'.format(adj_male_perc))

39.14529914529915 percent of the tokens in all male songs were verbs
16.767676767676768 percent of the tokens in all male songs were adjectives
