In [None]:
# Web scraping, pickle imports
import requests
from bs4 import BeautifulSoup
import pickle
from urllib.request import urlopen
# Scrapes transcript data from songlyrics.com
def url_to_transcript(url, timeout=30):
    '''Download a songlyrics.com page and return its lyric text.

    Parameters:
        url: address of the lyrics page to fetch.
        timeout: seconds to wait for the server before giving up (default 30).

    Returns:
        A list with the text of every element whose class is
        "songLyricsV14 iComment-text" (empty if the page has none).

    Raises:
        requests.exceptions.RequestException: on timeout, connection failure
        or an HTTP error status, so a bad URL is not silently turned into an
        empty transcript.
    '''
    # Without a timeout a stalled server would hang the notebook indefinitely
    response = requests.get(url, timeout=timeout)
    # Surface 4xx/5xx responses instead of parsing the error page
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "lxml")
    text = [p.text for p in soup.find_all(class_="songLyricsV14 iComment-text")]
    # Progress indicator: one line per fetched page
    print(url)
    return text

# URLs of the lyrics pages in scope. The order matters: entry i here is paired
# by position with entry i of `singers` when the scraped lyrics are saved.
urls = ['http://www.songlyrics.com/rihanna/we-found-love-lyrics/',
        'http://www.songlyrics.com/katy-perry/teenage-dream-lyrics/',
        'http://www.songlyrics.com/daft-punk-feat-pharrell-williams/get-lucky-radio-edit-lyrics/',
        'http://www.songlyrics.com/rolling-in-the-deep/rolling-in-the-deep-lyrics/',
        'http://www.songlyrics.com/niggas-in-paris/niggas-in-paris-lyrics/',
        'http://www.songlyrics.com/miley-cyrus/we-cant-stop-lyrics/',
        'http://www.songlyrics.com/drake/hold-on-were-going-home-lyrics/',
        'http://www.songlyrics.com/beyonc/countdown-lyrics/',
        'http://www.songlyrics.com/robyn/dancing-on-my-own-lyrics/',
        'http://www.songlyrics.com/miguel/adorn-lyrics/',
        'http://www.songlyrics.com/nicki-minaj/super-bass-lyrics/',
        'http://www.songlyrics.com/kacey-musgraves/follow-your-arrow-lyrics/',
        'http://www.songlyrics.com/lady-gaga/the-edge-of-glory-lyrics/',
        'http://www.songlyrics.com/sia/chandelier-lyrics/',
        'http://www.songlyrics.com/fun/some-nights-lyrics/']

# Short singer identifiers, in the same order as `urls`; they are used as the
# pickle file names and as the keys of the loaded dictionary
singers = ['rihanna', 'katy_perry', 'daft_punk', 'adele', 'jay_z', 'miley_cyrus', 'drake', 'beyonce', 'robyn', 'miguel', 'nicki_minaj', 'kacey','lady_gaga','sia','fun']

In [None]:
 # Fetch the lyrics for every entry of `urls`, one request per URL (takes a few minutes to run)
transcripts = [url_to_transcript(u) for u in urls]

In [None]:
# Pickle each singer's scraped lyrics for later use
import os

# Create the output directory in pure Python; exist_ok makes the cell safe to
# re-run and independent of the shell that `!mkdir` would need
os.makedirs("transcripts", exist_ok=True)

# The two lists are paired by position, so a length mismatch would mislabel files
if len(singers) != len(transcripts):
    raise ValueError("singers and transcripts must have the same length")

for singer, transcript in zip(singers, transcripts):
    with open("transcripts/" + singer + ".txt", "wb") as file:
        pickle.dump(transcript, file)

In [None]:
# Read every singer's pickled lyrics back into a dict keyed by singer name
data = {}
for singer in singers:
    pickle_path = "transcripts/" + singer + ".txt"
    with open(pickle_path, "rb") as handle:
        data[singer] = pickle.load(handle)

In [None]:
# Show the singer names that key the loaded dictionary
data.keys()

In [None]:
# Peek at the first two scraped text items stored for one singer
data['rihanna'][:2]

In [None]:
# First key of the dictionary
next(iter(data.keys()))

In [None]:
# First value of the dictionary (the list of text items for the first key)
next(iter(data.values()))

In [None]:
# We are going to change this to key: singer, value: string format
def combine_text(list_of_text):
    '''Join the strings in list_of_text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

In [None]:
# Collapse each singer's list of text items into a one-element list holding a single string
data_combined = {
    singer: [combine_text(fragments)]
    for singer, fragments in data.items()
}

In [None]:
# We can either keep it in dictionary format or put it into a pandas dataframe
import pandas as pd
# Show up to 150 characters per cell when a dataframe is displayed
pd.set_option('max_colwidth',150)

# One row per singer, sorted by singer name, with the combined lyrics in 'transcript'
data_df = pd.DataFrame.from_dict(
    data_combined, orient='index', columns=['transcript']
).sort_index()
data_df

In [None]:
# Apply a first round of text cleaning techniques
import re
import string

def clean_text_round1(text):
    '''First cleaning pass over a transcript string.

    Lowercases the text, removes anything enclosed in square brackets
    (brackets included), strips every character in string.punctuation and
    deletes any word that contains a digit. Returns the cleaned string.
    '''
    text = text.lower()
    # Raw strings keep the regex backslashes intact; '\[' and '\w' in a normal
    # literal are invalid escape sequences and warn on recent Python versions
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    return text

# Name used when applying the first cleaning pass to the dataframe column
round1 = clean_text_round1

In [None]:
# Run the first cleaning pass over every transcript and keep it as a one-column dataframe
data_clean = data_df.transcript.apply(round1).to_frame()
data_clean

In [None]:
# Apply a second round of cleaning
def clean_text_round2(text):
    '''Second cleaning pass: drop curly quotes and ellipsis characters, then turn newlines into spaces.'''
    without_marks = re.compile('[‘’“”…]').sub('', text)
    return re.compile('\n').sub(' ', without_marks)

def round2(x):
    '''Apply the second cleaning pass to a single transcript.'''
    return clean_text_round2(x)

In [None]:
# Run the second cleaning pass over the already-cleaned transcripts
data_clean = data_clean.transcript.apply(round2).to_frame()
data_clean

In [None]:
# Display the corpus dataframe; the cleaning passes wrote to data_clean, so data_df still holds the uncleaned transcripts
data_df

In [None]:
# Song title for each singer, listed in the alphabetical order of the singer
# keys (adele, beyonce, daft_punk, drake, fun, jay_z, kacey, katy_perry,
# lady_gaga, miguel, miley_cyrus, nicki_minaj, rihanna, robyn, sia) so the
# list lines up with the sorted dataframe index.
# Double-quoted literals avoid the backslash escapes that previously left
# stray quote characters, a tab and leading spaces inside some titles.
song = ["Rolling In the Deep", "Countdown", "Get Lucky", "Hold On, We're Going Home", "Some Nights", "Niggas in Paris",
        "Follow Your Arrow", "Teenage Dream", "The Edge of Glory", "Adorn", "We Can't Stop",
        "Super Bass", "We Found Love", "Dancing On My Own", "Chandelier"]

# Attach the titles as a new column; values are assigned by position, so
# `song` must be in the same order as the rows of data_df
data_df['song'] = song
data_df

In [None]:
# Save the corpus dataframe to corpus.pkl for later use
data_df.to_pickle("corpus.pkl")

In [None]:
# We are going to create a document-term matrix using CountVectorizer, and exclude common English stop words
from sklearn.feature_extraction.text import CountVectorizer

# Count word occurrences per transcript, ignoring common English stop words
cv = CountVectorizer(stop_words='english')
data_cv = cv.fit_transform(data_clean.transcript)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

# Document-term matrix: one row per singer, one column per vocabulary term
data_dtm = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_dtm.index = data_clean.index
data_dtm

In [None]:
# Save the document-term matrix to dtm.pkl for later use
data_dtm.to_pickle("dtm.pkl")

In [None]:
# Also pickle the cleaned data (before it was put in document-term matrix format) and the CountVectorizer object
data_clean.to_pickle('data_clean.pkl')
# The context manager closes the file handle once the dump has finished
with open("cv.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)

In [None]:
# Read in the document-term matrix
import pandas as pd

# Load the saved document-term matrix and flip it so terms are rows and singers are columns
data = pd.read_pickle('dtm.pkl').transpose()
data.head()

In [None]:
# Map each singer to a list of their 30 most frequent (word, count) pairs
top_dict = {}
for singer in data.columns:
    ranked = data[singer].sort_values(ascending=False).iloc[:30]
    top_dict[singer] = list(zip(ranked.index, ranked.values))

top_dict

In [None]:
# Print the top 15 words sung by each singer
for singer, top_words in top_dict.items():
    print(singer)
    # The slice end is exclusive, so [:15] yields exactly the 15 words promised above
    print(', '.join(word for word, count in top_words[:15]))
    print('---')

In [None]:
# Look at the most common top words --> add them to the stop word list
from collections import Counter

# Flatten every singer's top-30 list into one list of words (counts are dropped)
words = [
    word
    for singer in data.columns
    for word, _count in top_dict[singer]
]

words

In [None]:
# Aggregate the list to see the most common words and how many singers have each one among their top words
Counter(words).most_common()

In [None]:
# If more than half of the singers have a word among their top words, add it to the extra stop-word list
# `data` has one column per singer, so the cut-off follows the number of
# singers instead of a hard-coded constant
majority = len(data.columns) / 2
add_stop_words = [word for word, count in Counter(words).most_common() if count > majority]
add_stop_words

In [None]:
# Let's update our document-term matrix with the new list of stop words
from sklearn.feature_extraction import text 
from sklearn.feature_extraction.text import CountVectorizer

# Read in cleaned data
data_clean = pd.read_pickle('data_clean.pkl')

# Add new stop words
stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)

# Recreate document-term matrix
# Recent scikit-learn validates stop_words and accepts only 'english', a list
# or None, so the frozenset is passed as a (sorted, hence deterministic) list
cv = CountVectorizer(stop_words=sorted(stop_words))
data_cv = cv.fit_transform(data_clean.transcript)

# get_feature_names() was removed in scikit-learn 1.2; fall back only on old versions
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()

data_stop = pd.DataFrame(data_cv.toarray(), columns=feature_names)
data_stop.index = data_clean.index

# Pickle it for later use
import pickle
# The context manager closes the file handle once the dump has finished
with open("cv_stop.pkl", "wb") as cv_file:
    pickle.dump(cv, cv_file)
data_stop.to_pickle("dtm_stop.pkl")

In [None]:
# Let's make some word clouds!
from wordcloud import WordCloud

# One WordCloud generator reused for every song. It is given the extended
# stop-word set; random_state is pinned, presumably so repeated runs produce
# the same layout (confirm against the wordcloud documentation).
wc = WordCloud(stopwords=stop_words, background_color="white", colormap="Dark2",
               max_font_size=150, random_state=42)

In [None]:
# Reset the output dimensions
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [16, 8]

# Song title for each singer, in the same (alphabetical) order as data.columns.
# Double-quoted literals avoid the backslash escapes that previously left
# stray quote characters, a tab and leading spaces inside some titles.
song = ["Rolling In the Deep", "Countdown", "Get Lucky", "Hold On, We're Going Home", "Some Nights", "Niggas in Paris",
        "Follow Your Arrow", "Teenage Dream", "The Edge of Glory", "Adorn", "We Can't Stop",
        "Super Bass", "We Found Love", "Dancing On My Own", "Chandelier"]

# Create subplots for each singer: a 3 x 5 grid, one word cloud per song
for index, singer in enumerate(data.columns):
    wc.generate(data_clean.transcript[singer])

    plt.subplot(3, 5, index+1)
    plt.imshow(wc, interpolation="bilinear")
    plt.axis("off")
    plt.title(song[index])

plt.show()

'We Found Love' has fewer repeated words than the other songs.

In [None]:
# Revisit profanity: look again at the most common top words across singers
Counter(words).most_common()

In [None]:
# Pull the profanity columns out of the singer-by-term view and total the f-word variants
f_word_variants = ['fucked', 'fuck', 'mothafucka', 'mothafuckas']
data_bad_words = data.transpose()[f_word_variants + ['shit']]
data_profanity = pd.DataFrame({
    'f_word': data_bad_words[f_word_variants].sum(axis=1),
    's_word': data_bad_words['shit'],
})
data_profanity

'Niggas in Paris' contains the most profanity of all the songs: 8 's words' and 6 'f words'.

In [None]:
# Let's create a scatter plot of our findings
plt.rcParams['figure.figsize'] = [14, 12]

# One labelled point per song: x = f-word count, y = s-word count.
# `song` is indexed by position, so it must be in the same order as data_profanity.index
for i, singer in enumerate(data_profanity.index):
    x = data_profanity.f_word.loc[singer]
    y = data_profanity.s_word.loc[singer]
    plt.scatter(x, y, color='blue')
    plt.text(x+.1, y+.1, song[i], fontsize=10)

# The axis limits do not depend on the point being drawn, so set them once
plt.xlim(-5, 20)

plt.title('Number of Bad Words Used in Song', fontsize=20)
plt.xlabel('Number of F Words', fontsize=15)
plt.ylabel('Number of S Words', fontsize=15)

plt.show()

'Niggas in Paris' contains the most profanity of all the songs: 8 's words' and 6 'f words'.

In [None]:
# We'll start by reading in the corpus, which preserves word order
import pandas as pd

# Load the corpus pickle (transcript and song columns); note this rebinds `data`,
# which previously held the transposed document-term matrix
data = pd.read_pickle('corpus.pkl')
data

In [None]:
# Helper functions that score the polarity and subjectivity of each song's lyrics
from textblob import TextBlob

def pol(x):
    '''Return the TextBlob sentiment polarity of the text x.'''
    return TextBlob(x).sentiment.polarity

def sub(x):
    '''Return the TextBlob sentiment subjectivity of the text x.'''
    return TextBlob(x).sentiment.subjectivity

# Add one score column per measure, computed from the transcript column
data['polarity'] = data['transcript'].apply(pol)
data['subjectivity'] = data['transcript'].apply(sub)
data

In [None]:
# Let's plot the results
import matplotlib.pyplot as plt

plt.rcParams['figure.figsize'] = [8, 10]

# One labelled point per song: x = polarity, y = subjectivity
for singer in data.index:
    x = data.polarity.loc[singer]
    y = data.subjectivity.loc[singer]
    plt.scatter(x, y, color='blue')
    # Look the title up by label: `data` is indexed by singer name, and an
    # integer key on a label-indexed Series is a deprecated positional fallback
    plt.text(x+.001, y+.001, data['song'].loc[singer], fontsize=15)

# The axis limits do not depend on the point being drawn, so set them once
plt.xlim(-.01, .12)

plt.title('Sentiment Analysis', fontsize=20)
plt.xlabel('<-- Negative -------- Positive -->', fontsize=20)
plt.ylabel('<-- Facts -------- Opinions -->', fontsize=20)

plt.show()

'We Found Love' has a more positive sentiment score than the other songs.

In [None]:

# Split each routine into 10 parts
import numpy as np
import math

def split_text(text, n=10):
    '''Split a string into n consecutive pieces of equal length.

    Parameters:
        text: the string to split.
        n: number of pieces to return (default 10); must be at least 1.

    Returns:
        A list of exactly n strings. Each piece is len(text) // n characters
        long; any leftover characters at the end of the text are not included.
        When the text has fewer than n characters, each character becomes its
        own piece and the remaining pieces are empty strings.

    Raises:
        ValueError: if n is smaller than 1.
    '''
    if n < 1:
        raise ValueError("n must be at least 1")

    # Integer division gives the chunk size without going through floats
    size = len(text) // n

    # A chunk size of 0 cannot be used as a stride, so short texts are
    # handled one character per piece (slices past the end are simply '')
    if size == 0:
        return [text[i:i + 1] for i in range(n)]

    # Piece k starts at k * size and is `size` characters long
    return [text[k * size:(k + 1) * size] for k in range(n)]

In [None]:
# Inspect the corpus dataframe again; it now also carries the polarity and subjectivity columns
data

In [None]:
# Split every transcript into pieces; the result holds one inner list per song
list_pieces = [split_text(transcript) for transcript in data.transcript]

list_pieces

In [None]:
# Score the polarity of every piece, keeping one inner list of scores per song
polarity_transcript = [
    [TextBlob(piece).sentiment.polarity for piece in pieces]
    for pieces in list_pieces
]

polarity_transcript

In [None]:
# Plot the polarity curve of the first row, titled with that row's index label
first_label = data.index[0]
plt.plot(polarity_transcript[0])
plt.title(first_label)
plt.show()

In [None]:
# Show the plot for all singers: a 3 x 5 grid, one polarity curve per song
plt.rcParams['figure.figsize'] = [20,18]

for index, singer in enumerate(data.index):
    plt.subplot(3, 5, index+1)
    plt.plot(polarity_transcript[index])
    # Zero reference line, as long as the number of pieces actually scored
    n_pieces = len(polarity_transcript[index])
    plt.plot(np.arange(0, n_pieces), np.zeros(n_pieces))
    # Look the title up by label: `data` is indexed by singer name, and an
    # integer key on a label-indexed Series is a deprecated positional fallback
    plt.title(data['song'].loc[singer])
    # Same y-range on every panel so the curves are comparable
    plt.ylim(-.6, .7)

plt.show()

The polarity of 'We Found Love' stays more consistent over the course of the song than that of the other songs.