# Documentation for exam paper in Social Data Science 2018

## Libraries
Here are all the libraries that we use in our project.

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import requests
import time
import json
import tqdm
import re
import nltk
from numpy import NaN
from bs4 import BeautifulSoup
from langdetect import detect
from nltk import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from nltk.corpus import stopwords
import gensim
import gensim.parsing.preprocessing
from gensim.parsing.preprocessing import preprocess_string
from gensim.models import CoherenceModel
from nltk.tokenize import TweetTokenizer
from sklearn.model_selection import train_test_split
from wordcloud import WordCloud
from multiprocessing import Pool
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics

## Web scraping

The code block below contains all functions for scraping lyrics from genius.com.



In [None]:
# function that generates n pages for a given tag
def page_urls(n, tag):
    """Build the listing-page urls of one genius.com tag.

    Returns a list of ``n`` urls; the ``page`` query value runs from 0 up
    to ``n - 1``, in that order.
    """
    # everything but the page number is shared by all urls of the tag
    prefix = 'https://genius.com/tags/' + tag + '/all?page='
    return [prefix + str(page_no) for page_no in range(n)]

# function that gets all links from a given page
def get_links(html):
    """Extract the links found in the song list of one tag page.

    The page is parsed with the built-in ``html.parser`` (the same parser
    ``get_lyrics`` uses), so the result does not depend on which optional
    parser libraries happen to be installed.

    Returns a set with the value of every ``href="..."`` attribute inside
    the ``ul`` whose class is ``search_results song_list primary_list``.
    An empty set is returned when the page has no such list.
    """
    soup = BeautifulSoup(html, "html.parser")
    song_list = soup.find('ul', attrs={'class': 'search_results song_list primary_list'})

    # no result list on the page -> nothing to collect
    if song_list is None:
        return set()

    # every piece after the first split starts right behind an opening href="
    fragments = str(song_list).split('href="')[1:]

    # the link ends at the closing quote; the set removes duplicates
    return {fragment.split('"')[0] for fragment in fragments}

# rate limit function that sleeps one second by default
def ratelimit(seconds=1):
    """Pause before the next request.

    ``seconds`` is the length of the pause; the default of one second
    keeps the behaviour of a plain ``ratelimit()`` call unchanged.
    """
    time.sleep(seconds)
    
# reliable requests
def get(url, iterations=10, check=lambda x: x.ok):
    """Request ``url`` until a response passes ``check``.

    Every attempt is preceded by ``ratelimit()``. At most ``iterations``
    attempts are made.

    Returns the first response for which ``check(response)`` is true, or
    ``None`` when no attempt succeeded.
    """
    for _ in range(iterations):
        try:
            ratelimit()
            response = requests.get(url)
            if check(response):
                return response
        # requests.exceptions is a module; RequestException is the base
        # class of the errors requests raises, so failures are printed and
        # retried instead of ending in a TypeError
        except requests.exceptions.RequestException as e:
            print(e)
    return None

# function that gets the lyrics in text format from link
def get_lyrics(html):
    """Read the lyrics text out of the html of a song page.

    The html is parsed with ``html.parser`` and the text of the first
    ``div`` with class ``lyrics`` is taken.

    Returns a dict with the single key ``'lyrics'``.
    """
    lyrics_div = BeautifulSoup(html, "html.parser").find("div", class_="lyrics")
    return {'lyrics': lyrics_div.get_text()}

# function that merges two dicts
def combine_dicts(x, y):
    """Merge ``x`` and ``y`` into a new dict without changing ``x``.

    Where both hold the same key, the value from ``y`` is kept.
    """
    merged = x.copy()  # work on a copy so the caller's dict stays untouched
    merged.update(y)
    return merged

# function that extracts the release date of the song
def get_year(html):
    """Read the release date that is embedded in the html of a song page.

    The text behind the first ``&quot;release_date&quot;`` marker is cut at
    the ``&quot;`` entities and the piece between the first two of them is
    returned unchanged, so the value is whatever text sits there and not
    only a year.

    Returns a dict with the single key ``'year'``.
    """
    after_marker = html.split('&quot;release_date&quot;')[1]
    return {'year': after_marker.split('&quot;')[1]}

# function song meta data
def get_details(html):
    """Collect the meta data of a song from the html of its page.

    The text behind ``TRACKING_DATA =`` up to the first ``}`` is parsed as
    json, and the result of ``get_year(html)`` is merged into it.

    Returns the merged dict.
    """
    # text that follows the first TRACKING_DATA assignment
    after_marker = html.split('TRACKING_DATA =')[1]
    # the object ends at the first closing brace, which split() removed
    tracking_json = after_marker.split('}')[0] + '}'
    tracking = json.loads(tracking_json.strip())
    # release date is added on top of the tracking data
    return combine_dicts(tracking, get_year(html))

# function that can filter dictionaries
def dictfilt(x, y):
    """Return a new dict with the items of ``x`` whose key is in ``y``.

    ``x`` is the dict to filter and ``y`` an iterable of the keys to keep.
    The order of ``x`` is preserved; keys of ``y`` missing in ``x`` are
    ignored.
    """
    # build the lookup set once instead of once per key of x
    wanted = set(y)
    return {key: x[key] for key in x if key in wanted}

# function that scrapes all the wanted data and returns it as a dictionary
def get_final(link):
    """Scrape one song page and return the wanted fields as a dict.

    The outcome is appended to ``log.txt`` as ``[link,True],`` or
    ``[link,False],``.

    Returns ``None`` when the link does not respond, otherwise a dict
    limited to the keys listed in ``keep``.
    """
    # fetch once and reuse the response; a second request would double the
    # waiting time and could fail even though the first one succeeded
    response = get(link)

    # if the link does not respond, log the link as False
    if response is None:
        with open('log.txt', 'a') as f:
            f.write('[' + str(link) + ',' + str(False) + '],')
        # quitting such that the program doesnt stop due to non responding links
        return None

    # the link does respond: extract the data, log the link as True
    html = response.text
    lyrics = get_lyrics(html)
    details = get_details(html)
    keep = {'Annotatable Type', 'Lyrics Language', 'Primary Artist', 'Primary Tag',
            'Song ID', 'Title', 'lyrics', 'year'}
    combined = combine_dicts(lyrics, details)
    with open('log.txt', 'a') as f:
        f.write('[' + str(link) + ',' + str(True) + '],')
    return dictfilt(combined, keep)

We now generate all the page URLs that are used to scrape the individual song lyrics.

In [None]:
# 51 listing-page urls for each of the four genre tags
pages_rock, pages_rap, pages_country, pages_pop = (
    page_urls(51, genre) for genre in ('rock', 'rap', 'country', 'pop')
)

# one list with all page urls, genre after genre
pages = [*pages_rock, *pages_rap, *pages_country, *pages_pop]

The scraper now iterates through all page URLs to collect all song URLs. The song URLs are collected in a set to avoid duplicates.

In [None]:
song_links = set() # container that gets rid of duplicates
for page in tqdm.tqdm(pages): # iterates over all listing pages
    response = get(page) # requesting the page
    if response is None:
        # get() gives None when the page never answered; skip it instead
        # of failing on response.text
        print('No response from ' + str(page))
        continue
    song_links.update(get_links(response.text)) # extracting and storing links

# storing the unique links as a csv file
song_links = list(song_links)
df = pd.DataFrame(song_links)
df.to_csv('links.csv')

The scraper now iterates over all song URLs to extract the needed information from the HTML code. Finally, the list is converted to a pandas dataframe and saved as a csv file.

In [None]:
# the stored links are loaded again as a dataframe, one row per link
song_links = pd.read_csv('links.csv')
print(f'There is {len(song_links)} unique song links.')

In [None]:
# iterating a dataframe directly yields its column names, not its rows, so
# the link column (the last column of the loaded csv) is taken explicitly
if isinstance(song_links, pd.DataFrame):
    links_to_scrape = song_links.iloc[:, -1]
else:
    links_to_scrape = song_links

lyrics = []
for i in tqdm.tqdm(links_to_scrape):
    lyric = get_final(i)
    # get_final gives None for links that did not respond (see log.txt)
    if lyric is not None:
        lyrics.append(lyric)

# columns are sorted by name so the csv layout does not depend on the order
# in which the keys were scraped
df_lyrics = pd.DataFrame(lyrics).sort_index(axis=1)
# the scraped lyrics are saved, not the dataframe holding the links
df_lyrics.to_csv('data.csv')

## Cleaning the data

We load the data generated by the web scraping process above. We then clean, select and preprocess the data. The size of the final data set is reported when running the code.

In [None]:
# loading data
df = pd.read_csv('data.csv')

print('The loaded data set contains of ' + str(len(df)) + ' lyrics')

# fitting columns
df = df.drop(['Unnamed: 0'], axis = 1)
# the scraped columns are renamed by name, not by position, so a different
# column order in the csv cannot put a wrong label on a column
column_names = {
    'Annotatable Type': 'type',
    'Lyrics Language': 'language',
    'Primary Artist': 'artist',
    'Primary Tag': 'tag',
    'Song ID': 'id',
    'Title': 'title',
    'lyrics': 'lyrics',
    'year': 'release_date',
}
cols = list(column_names.values())
df = df.rename(columns=column_names)[cols]
print('The column names of the data set is ' + str(cols))

# slicing the release date to extract year
# values whose first four characters are 'rele' become year 0
# (presumably songs without a release date - verify against the scraper)
df['year'] = df['release_date'].apply(lambda i: int(re.sub('rele', '0', i[:4])))

# selection criteria
select_en = df.language == 'en' # select only english text
select_tag = df.tag != 'non-music' # select only music
select_cat = df.tag != 'r-b' # select only rock, pop, rap, country
select_rows = select_en & select_tag & select_cat # combining criteria

# applying criteria
df_select = df[select_rows].reset_index().copy()
lost = len(df)-len(df_select)
print('We loose ' + str(lost) + ' lyrics by applying the selection criteria, the data set now contain ' + str(len(df_select)))

# lower case and punctuation clean
# raw strings keep the backslashes of the patterns intact without
# invalid-escape warnings
df_select['lyrics'] = df_select.loc[:,'lyrics'].apply(lambda i: re.sub(r'\[[^\]]+\]', '', i)) # removes bracketed text such as [Chorus]
df_select['lyrics_clean'] = df_select.lyrics.apply(lambda i: i.lower()) # converts to lower case
df_select['lyrics_clean'] = df_select.lyrics_clean.apply(lambda i: re.sub(r"[^0-9a-z']+", ' ', i)) # replaces everything but digits, a-z and apostrophes with a space


### Tokenizing of the lyrics

We tokenize the words using the tweet tokenizer, since this package is better at handling words with apostrophes. Thereafter, stop words are removed. Finally, we also create a token list excluding some specially selected words.

In [None]:
tknzr = TweetTokenizer()

# every cleaned lyric is split into a list of tokens
df_select['word_tokens'] = df_select['lyrics_clean'].apply(tknzr.tokenize)

# tokens that are english stop words are dropped
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))
df_select['word_tokens2'] = df_select['word_tokens'].apply(
    lambda tokens: [token for token in tokens if token not in stopWords]
)

# sound words and single apostrophes are dropped as well
new_stopwords = {"oh", "ooh", "yeah", "na", "la", "uh", "ayy", "da", "hey", "yo", "yay", "ah", "aya", "\'"}
df_select['word_tokens3'] = df_select['word_tokens2'].apply(
    lambda tokens: [token for token in tokens if token not in new_stopwords]
)


We now create bigrams and trigrams that are used for our topic modelling.

In [None]:
# bigrams and trigrams
# phrase model trained on the filtered token lists; min_count and threshold
# control which neighbouring tokens are treated as one phrase
bigram = gensim.models.Phrases(df_select['word_tokens3'], min_count=5, threshold=100)
# second phrase model, trained on the documents after the bigram model was applied
trigram = gensim.models.Phrases(bigram[df_select['word_tokens3']], threshold=100)

# Phraser objects built from the two trained models; these are the ones
# used to transform the documents below
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

def make_bigrams(texts):
    """Apply the bigram model to every document in ``texts``."""
    return [bigram_mod[doc] for doc in texts]

def make_trigrams(texts):
    """Apply the bigram model and then the trigram model to every document in ``texts``."""
    return [trigram_mod[bigram_mod[doc]] for doc in texts]

df_select['bigrams'] = make_bigrams(df_select['word_tokens3'])
# NOTE(review): the input is the already bigram-transformed column, and
# make_trigrams applies bigram_mod once more before trigram_mod - confirm
# that this double application is intended
df_select['word_tokens4'] = make_trigrams(df_select['bigrams'])

In [None]:
# every token of the phrase-merged lists is reduced to its Porter stem
ps = PorterStemmer()
df_select['word_tokens5'] = df_select['word_tokens4'].apply(
    lambda tokens: [ps.stem(token) for token in tokens]
)

## Exploring the data

The following code contains a plot for the distribution of songs across genres and years.

In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# bar colours, darkest first
bar_colors = ['#004c6d', '#3d708f', '#6996b3', '#94bed9', '#c1e7ff']

# setting plot settings
fig, axes = plt.subplots(ncols=2, figsize = (10,5))
ax1, ax2 = axes.flatten()

# plot input for subplot 1: number of songs per genre tag
data = dict(df_select['tag'].value_counts())
tags = list(data.keys())
count = list(data.values())

# plot input for subplot 2: number of songs per year interval
# (year 0 marks songs without a usable release year and is left out)
year_dist = df_select.loc[df_select.year != 0, 'year']
x1 = int((year_dist >= 2015).sum())
x2 = int(((year_dist >= 2010) & (year_dist < 2015)).sum())
x3 = int(((year_dist >= 1990) & (year_dist < 2010)).sum())
x4 = int(((year_dist >= 1970) & (year_dist < 1990)).sum())
x5 = int((year_dist <= 1969).sum())
labels = ['1850-1969','1970-1989','1990-2009','2010-2014', '2015-2018']
count1 = [x5, x4, x3, x2, x1]

# shared look of both subplots
for ax, title in ((ax1, '(a)\nNumber of songs'),
                  (ax2, '(b)\nNumber of songs in year intervals')):
    # setting title
    ax.set_title(title, pad = 20)
    # removes plot frame
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    # removing y-axis
    ax.set_yticks([])
    # removes y-ticks and x-ticks
    ax.tick_params(axis='both', which='both', length=0)

# bar plot for subplot 1; the labels are taken from the counted tags so a
# bar can never carry the name of another genre
positions = list(range(len(tags)))
bars_genre = ax1.bar(positions, count)
ax1.set_xticks(positions)
ax1.set_xticklabels([str(tag).capitalize() for tag in tags])

# bar plot for subplot 2; ticks are set on ax2 itself instead of through
# plt.xticks, which only acts on the current axes
positions1 = list(range(len(count1)))
bars_year = ax2.bar(positions1, count1)
ax2.set_xticks(positions1)
ax2.set_xticklabels(labels, fontsize = 11)

# colours and value labels for all bars
for ax, bars in ((ax1, bars_genre), (ax2, bars_year)):
    for bar, bar_color in zip(bars, bar_colors):
        bar.set_facecolor(bar_color)
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2.0, height, '%d' % int(height), ha='center', va='bottom')

# saving and displaying figure (plt.show takes no axes arguments)
fig.tight_layout()
fig.savefig('dist_plot.png', dpi=500)
plt.show()

In [None]:
# number of different artists in the selected data
print(f"The number of unique artists is {df_select['artist'].nunique()}")

In [None]:
# number of different artists within each genre tag
df_select.groupby('tag')['artist'].nunique()

## Descriptives

Creating new variables for characterising the lyrics.

In [None]:
# unique words: a plain count per song (one number per row) instead of a
# dataframe built for every single song
df_select['unique_words'] = df_select['word_tokens'].apply(lambda tokens: len(set(tokens)))
# number of words
df_select['word_count'] = df_select['word_tokens'].apply(len)
# share of unique words, in percent
df_select['share_unique_words'] = df_select['unique_words']/df_select['word_count']*100
# characters in lyric, spaces not counted
df_select['characters'] = df_select['lyrics_clean'].apply(lambda text: len(text.replace(' ', '')))
# characters per word
df_select['characters_per_word'] = df_select['characters']/df_select['word_count']

In [None]:
# summary statistics of the share of unique words, per genre tag
df_select.groupby('tag')['share_unique_words'].describe()

In [None]:
# summary statistics of the number of words, per genre tag
df_select.groupby('tag')['word_count'].describe()

In [None]:
# summary statistics of the characters per word, per genre tag
df_select.groupby('tag')['characters_per_word'].describe()

The following code block generates a plot for the average word count and the average share of unique words across the four genres.

In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# bar colours, darkest first
bar_colors = ['#004c6d', '#3d708f', '#6996b3', '#94bed9']

# setting plot settings
fig, axes = plt.subplots(ncols=2, figsize = (10,5))
ax1, ax2 = axes.flatten()

# plot input for subplot 1: average word count per genre, largest first
mean = df_select.groupby(['tag'])['word_count'].mean()
mean_df = pd.DataFrame(mean).sort_values(by=['word_count'], ascending=False)
count = list(mean_df['word_count'])

# plot input for subplot 2: average share of unique words per genre, largest first
mean_uw = df_select.groupby(['tag'])['share_unique_words'].mean()
mean_uw_sort = pd.DataFrame(mean_uw).sort_values(by=['share_unique_words'], ascending=False)
count1 = list(mean_uw_sort['share_unique_words'])

# shared look of both subplots
for ax, title in ((ax1, '(a)\nAverage number of words'),
                  (ax2, '(b)\nShare of unique words')):
    # setting title
    ax.set_title(title, pad = 20)
    # removes plot frame
    for side in ('top', 'right', 'bottom', 'left'):
        ax.spines[side].set_visible(False)
    # removing y-axis
    ax.set_yticks([])
    # removes y-ticks and x-ticks
    ax.tick_params(axis='both', which='both', length=0)

# bar plots; the labels come from the sorted index, so every bar carries
# the genre its value belongs to, and the ticks are set on each axes
# directly instead of through plt.xticks (which only acts on the current axes)
positions = list(range(len(count)))
bars_words = ax1.bar(positions, count)
ax1.set_xticks(positions)
ax1.set_xticklabels([str(tag).capitalize() for tag in mean_df.index])

positions1 = list(range(len(count1)))
bars_share = ax2.bar(positions1, count1)
ax2.set_xticks(positions1)
ax2.set_xticklabels([str(tag).capitalize() for tag in mean_uw_sort.index])

# colours and value labels for subplot 1
for bar, bar_color in zip(bars_words, bar_colors):
    bar.set_facecolor(bar_color)
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2.0, height, '%d' % int(height), ha='center', va='bottom')

# colours and value labels (in percent) for subplot 2
for bar, bar_color in zip(bars_share, bar_colors):
    bar.set_facecolor(bar_color)
    height = bar.get_height()
    heighttxt = str(round(height,1)) + ' pct.'
    ax2.text(bar.get_x() + bar.get_width()/2.0, height, heighttxt, ha='center', va='bottom')

# saving and displaying figure (plt.show takes no axes arguments)
fig.tight_layout()
fig.savefig('word_plot.png', dpi=500)
plt.show()

Boxplot displaying the distribution of the average characters per word

In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# figure settings
fig, ax = plt.subplots(figsize=(10, 5))

# data for the boxplot: one array of characters per word for every genre,
# each variable holding the genre it is named after
data_to_plot = df_select[['characters_per_word','tag']]
genres = ['rap', 'pop', 'rock', 'country']
data_rap, data_pop, data_rock, data_country = (
    np.array(data_to_plot.loc[data_to_plot.tag == genre, 'characters_per_word'])
    for genre in genres
)
data_box = [data_rap, data_pop, data_rock, data_country]

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# patch_artist=True enables adding fill colors
bp = ax.boxplot(data_box, patch_artist=True)

# looping over all boxes to change the line color and fill
for box in bp['boxes']:
    box.set(color='#3d708f', linewidth=2)
    box.set(facecolor='#3d708f')

# looping over all whiskers and caps to change color
for line in bp['whiskers'] + bp['caps']:
    line.set(color='#004c6d', linewidth=2)

# looping over all medians to change color
for median in bp['medians']:
    median.set(color='#94bed9', linewidth=2)

# looping over all outlier marks to change color and appearance
for flier in bp['fliers']:
    flier.set(marker='o', color='#004c6d', alpha=0.2)

# setting xticklabels once, after boxplot() has placed one tick per box,
# in the same order as data_box
ax.set_xticklabels([genre.capitalize() for genre in genres])

# setting title
ax.set_title('Characters per word', pad = 20)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# saving figure
fig.savefig('box.png', dpi=500)

## Wordclouds

In [None]:
# function to generate wordcloud image
def plot_word_cloud(words, filename=None):
    """Draw a word cloud from word frequencies and show it.

    ``words`` is passed through ``dict()`` and used as the frequencies of
    the cloud. When ``filename`` is given, the figure is also saved under
    that name before it is shown.
    """
    cloud_settings = {
        'width': 800,
        'height': 600,
        'max_words': 500,
        'scale': 3,
        'background_color': 'white',
    }
    cloud = WordCloud(**cloud_settings)
    cloud.generate_from_frequencies(dict(words))

    # the cloud is drawn as an image without axes
    plt.figure(figsize=(15,20))
    plt.imshow(cloud)
    plt.axis("off")

    if filename is not None:
        plt.savefig(filename, bbox_inches='tight', dpi=500)

    plt.show()

In [None]:
# One boolean mask and one sub-dataframe for each of the four genres
select_rap = df_select.tag == 'rap'
select_pop = df_select.tag == 'pop'
select_rock = df_select.tag == 'rock'
select_country = df_select.tag == 'country'

df_rap = df_select[select_rap]
df_pop = df_select[select_pop]
df_rock = df_select[select_rock]
df_country = df_select[select_country]

### Rap

In [None]:
#Wordcloud for rap
# the stop words are handed over as a list, the collection type
# scikit-learn documents for this parameter
cv = CountVectorizer(max_features = 50, stop_words=list(stopWords), analyzer='word')
counts_rap = cv.fit_transform(df_rap['lyrics_clean'])
# get_feature_names() is gone in newer scikit-learn releases; the old
# name is only used when the new one does not exist
if hasattr(cv, 'get_feature_names_out'):
    words_rap = np.array(cv.get_feature_names_out())
else:
    words_rap = np.array(cv.get_feature_names())

# one column per word, one row per lyric
number_rap = pd.DataFrame(counts_rap.toarray(), columns=words_rap)

# total count of every word over all rap lyrics
comeon_rap = number_rap.sum(axis=0)
jeg_tror_den_er_her_rap=comeon_rap.to_dict()

plot_word_cloud(jeg_tror_den_er_her_rap, filename = "wordcloud_rap.png")

### Pop

In [None]:
#Wordcloud for pop
# the stop words are handed over as a list, the collection type
# scikit-learn documents for this parameter
cv = CountVectorizer(max_features = 50, stop_words=list(stopWords), analyzer='word')
counts_pop = cv.fit_transform(df_pop['lyrics_clean'])
# get_feature_names() is gone in newer scikit-learn releases; the old
# name is only used when the new one does not exist
if hasattr(cv, 'get_feature_names_out'):
    words_pop = np.array(cv.get_feature_names_out())
else:
    words_pop = np.array(cv.get_feature_names())

# one column per word, one row per lyric
number_pop = pd.DataFrame(counts_pop.toarray(), columns=words_pop)

# total count of every word over all pop lyrics
comeon_pop = number_pop.sum(axis=0)
jeg_tror_den_er_her_pop=comeon_pop.to_dict()

plot_word_cloud(jeg_tror_den_er_her_pop, filename = "wordcloud_pop.png")

### Rock

In [None]:
#Wordcloud for rock
# the stop words are handed over as a list, the collection type
# scikit-learn documents for this parameter
cv = CountVectorizer(max_features = 50, stop_words=list(stopWords), analyzer='word')
counts_rock = cv.fit_transform(df_rock['lyrics_clean'])
# get_feature_names() is gone in newer scikit-learn releases; the old
# name is only used when the new one does not exist
if hasattr(cv, 'get_feature_names_out'):
    words_rock = np.array(cv.get_feature_names_out())
else:
    words_rock = np.array(cv.get_feature_names())

# one column per word, one row per lyric
number_rock = pd.DataFrame(counts_rock.toarray(), columns=words_rock)

# total count of every word over all rock lyrics
comeon_rock = number_rock.sum(axis=0)
jeg_tror_den_er_her_rock=comeon_rock.to_dict()

plot_word_cloud(jeg_tror_den_er_her_rock, filename = "wordcloud_rock.png")

### Country

In [None]:
#Wordcloud for country
# the stop words are handed over as a list, the collection type
# scikit-learn documents for this parameter
cv = CountVectorizer(max_features = 50, stop_words=list(stopWords), analyzer='word')
counts_country = cv.fit_transform(df_country['lyrics_clean'])
# get_feature_names() is gone in newer scikit-learn releases; the old
# name is only used when the new one does not exist
if hasattr(cv, 'get_feature_names_out'):
    words_country = np.array(cv.get_feature_names_out())
else:
    words_country = np.array(cv.get_feature_names())

# one column per word, one row per lyric
number_country = pd.DataFrame(counts_country.toarray(), columns=words_country)

# total count of every word over all country lyrics
comeon_country = number_country.sum(axis=0)
jeg_tror_den_er_her_country=comeon_country.to_dict()

plot_word_cloud(jeg_tror_den_er_her_country, filename = "wordcloud_country.png")

## Topic modelling

### Rap

In [None]:
# phrase-merged token lists of the rap songs
processed_docs_rap = df_rap['word_tokens4']

# Topic model with gensim

# vocabulary of the rap songs
dictionary_rap = gensim.corpora.Dictionary(processed_docs_rap)
# the vocabulary is pruned with the thresholds below before the corpus is built
dictionary_rap.filter_extremes(no_below=15, no_above=0.5, keep_n = 10000)
# bag-of-words representation of every rap song
bow_corpus_rap = [dictionary_rap.doc2bow(doc) for doc in processed_docs_rap]
# NOTE(review): n_cores is assigned but not handed to LdaMulticore below
n_cores = 12
# number of topics
k = 2
lda_model_rap = gensim.models.LdaMulticore(bow_corpus_rap, num_topics=k, id2word=dictionary_rap, passes=2, random_state=42)

#lda_model_rap.save('rap.gensim')

In [None]:
# the ten highest weighted words of every rap topic, one topic per line
topics_rap = lda_model_rap.print_topics(num_words=10)
for topic in topics_rap:
    print(topic)

In [None]:
# Coherence Score
# c_v coherence of the rap topic model, computed on the rap token lists
coherence_model_lda_rap = CoherenceModel(model=lda_model_rap, texts=processed_docs_rap, dictionary=dictionary_rap, coherence='c_v')
coherence_lda_rap = coherence_model_lda_rap.get_coherence()
print('\nCoherence Score: ', coherence_lda_rap)

### Pop

In [None]:
# phrase-merged token lists of the pop songs
processed_docs_pop = df_pop['word_tokens4']

# Topic model with gensim

# vocabulary of the pop songs
dictionary_pop = gensim.corpora.Dictionary(processed_docs_pop)
# the vocabulary is pruned with the thresholds below before the corpus is built
# NOTE(review): no_above is 0.6 here but 0.5 for the other genres - confirm this is intended
dictionary_pop.filter_extremes(no_below=15, no_above=0.6, keep_n = 10000)
# bag-of-words representation of every pop song
bow_corpus_pop = [dictionary_pop.doc2bow(doc) for doc in processed_docs_pop]
# NOTE(review): n_cores is assigned but not handed to LdaMulticore below
n_cores = 12
# number of topics
k = 2
lda_model_pop = gensim.models.LdaMulticore(bow_corpus_pop, num_topics=k, id2word=dictionary_pop, passes=2, random_state = 42)

#lda_model_pop.save('pop.gensim')

In [None]:
# the ten highest weighted words of every pop topic, one topic per line
topics_pop = lda_model_pop.print_topics(num_words=10)
for topic in topics_pop:
    print(topic)

In [None]:
# Coherence Score
# c_v coherence of the pop topic model, computed on the pop token lists
coherence_model_lda_pop = CoherenceModel(model=lda_model_pop, texts=processed_docs_pop, dictionary=dictionary_pop, coherence='c_v')
coherence_lda_pop = coherence_model_lda_pop.get_coherence()
print('\nCoherence Score: ', coherence_lda_pop)

### Rock

In [None]:
# phrase-merged token lists of the rock songs
processed_docs_rock = df_rock['word_tokens4']

# Topic model with gensim

# vocabulary of the rock songs
dictionary_rock = gensim.corpora.Dictionary(processed_docs_rock)
# the vocabulary is pruned with the thresholds below before the corpus is built
dictionary_rock.filter_extremes(no_below=15, no_above=0.5, keep_n = 10000)
# bag-of-words representation of every rock song
bow_corpus_rock = [dictionary_rock.doc2bow(doc) for doc in processed_docs_rock]
# NOTE(review): n_cores is assigned but not handed to LdaMulticore below
n_cores = 12
# number of topics
k = 2
lda_model_rock = gensim.models.LdaMulticore(bow_corpus_rock, num_topics=k, id2word=dictionary_rock, passes=2, random_state = 42)

#lda_model_rock.save('rock.gensim')

In [None]:
# the ten highest weighted words of every rock topic, one topic per line
topics_rock = lda_model_rock.print_topics(num_words=10)
for topic in topics_rock:
    print(topic)

In [None]:
# Coherence Score
# c_v coherence of the rock topic model, computed on the rock token lists
coherence_model_lda_rock = CoherenceModel(model=lda_model_rock, texts=processed_docs_rock, dictionary=dictionary_rock, coherence='c_v')
coherence_lda_rock = coherence_model_lda_rock.get_coherence()
print('\nCoherence Score: ', coherence_lda_rock)

### Country

In [None]:
# phrase-merged token lists of the country songs
processed_docs_country = df_country['word_tokens4']

# Topic model with gensim

# vocabulary of the country songs
dictionary_country = gensim.corpora.Dictionary(processed_docs_country)
# the vocabulary is pruned with the thresholds below before the corpus is built
dictionary_country.filter_extremes(no_below=15, no_above=0.5, keep_n = 10000)
# bag-of-words representation of every country song
bow_corpus_country = [dictionary_country.doc2bow(doc) for doc in processed_docs_country]
# NOTE(review): n_cores is assigned but not handed to LdaMulticore below
n_cores = 12
# number of topics
k = 2
lda_model_country = gensim.models.LdaMulticore(bow_corpus_country, num_topics=k, id2word=dictionary_country, passes=2, random_state = 42)

# NOTE(review): if the line below is re-enabled, it would save under the
# same file name as the rock model
#lda_model_country.save('rock.gensim')

In [None]:
# the ten highest weighted words of every country topic, one topic per line
topics_country = lda_model_country.print_topics(num_words=10)
for topic in topics_country:
    print(topic)

In [None]:
# Coherence Score
# c_v coherence of the country topic model, computed on the country token lists
coherence_model_lda_country = CoherenceModel(model=lda_model_country, texts=processed_docs_country, dictionary=dictionary_country, coherence='c_v')
coherence_lda_country = coherence_model_lda_country.get_coherence()
print('\nCoherence Score: ', coherence_lda_country)

### Second topic modelling approach

In [None]:
# only the phrase-merged token lists and the genre tag are needed
X = df_select[['word_tokens4', 'tag']]

# the songs are split in two halves with a fixed seed; the topic model
# below is fitted on X_train and applied to X_test
X_train, X_test = train_test_split(X, test_size=0.5, random_state=42)

In [None]:
# number of songs in each half
print(len(X_train), len(X_test), sep='\n')

# token lists of the test songs, one Series per genre
X_test_rap = X_test.loc[X_test['tag'] == 'rap', 'word_tokens4']
X_test_pop = X_test.loc[X_test['tag'] == 'pop', 'word_tokens4']
X_test_rock = X_test.loc[X_test['tag'] == 'rock', 'word_tokens4']
X_test_country = X_test.loc[X_test['tag'] == 'country', 'word_tokens4']

In [None]:
# token lists of the training half
processed_docs = X_train['word_tokens4']

# vocabulary of the training half, pruned before the corpus is built
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n = 10000)
# bag-of-words representation of every training song
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#n_cores = 12
# number of topics
k = 2
# single model for all genres, fitted on the training half only
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=k, id2word=dictionary, passes=10, random_state=42)

#lda_model.save('train.gensim')

In [None]:
# the twenty highest weighted words of every topic, one topic per line
topics = lda_model.print_topics(num_words=20)
for topic in topics:
    print(topic)

In [None]:
# Topic distribution of every test song, one list per genre.
# minimum_probability=0 makes the model report every topic for every song;
# with the default cut-off, topics with a small share are left out, so
# per-topic averages would only cover the songs where the topic is large.
rap = [
    lda_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0)
    for doc in X_test_rap
]

rock = [
    lda_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0)
    for doc in X_test_rock
]

pop = [
    lda_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0)
    for doc in X_test_pop
]

country = [
    lda_model.get_document_topics(dictionary.doc2bow(doc), minimum_probability=0)
    for doc in X_test_country
]

#### Rap

In [None]:
# shares of topic 0 and topic 1 over the rap test songs; every entry of a
# song is a (topic id, share) pair
topic0 = [entry[1] for song in rap for entry in song if entry[0] == 0]
topic1 = [entry[1] for song in rap for entry in song if entry[0] == 1]

# average share of each topic
avg_topic0 = sum(topic0)/len(topic0)
avg_topic1 = sum(topic1)/len(topic1)

print(avg_topic0)
print(avg_topic1)



In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# figure settings
fig, ax = plt.subplots(figsize=(5, 4))

# data for the boxplot: the collected shares of topic 0 and topic 1
data_box = [topic0,topic1]

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# patch_artist=True enables adding fill colors
bp = ax.boxplot(data_box, widths=0.5, patch_artist=True)

# looping over all boxes to change the line color and fill
for box in bp['boxes']:
    box.set(color='#3d708f', linewidth=2)
    box.set(facecolor='#3d708f')

# looping over all whiskers and caps to change color
for line in bp['whiskers'] + bp['caps']:
    line.set(color='#004c6d', linewidth=2)

# looping over all medians to change color
for median in bp['medians']:
    median.set(color='#94bed9', linewidth=2)

# looping over all outlier marks to change color and appearance
for flier in bp['fliers']:
    flier.set(marker='o', color='#004c6d', alpha=0.2)

# setting xticklabels once, after boxplot() has placed one tick per box
ax.set_xticklabels(['Topic 0', 'Topic 1'])

# setting title
ax.set_title('Rap', pad = 20)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# saving figure
fig.savefig('box0.png', dpi=500)

#### Pop

In [None]:
# shares of topic 0 and topic 1 over the pop test songs; every entry of a
# song is a (topic id, share) pair
topic0 = [entry[1] for song in pop for entry in song if entry[0] == 0]
topic1 = [entry[1] for song in pop for entry in song if entry[0] == 1]

# average share of each topic
avg_topic0 = sum(topic0)/len(topic0)
avg_topic1 = sum(topic1)/len(topic1)

print(avg_topic0)
print(avg_topic1)



In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# figure settings
fig, ax = plt.subplots(figsize=(5, 4))

# data for the boxplot: the collected shares of topic 0 and topic 1
data_box = [topic0,topic1]

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# patch_artist=True enables adding fill colors
bp = ax.boxplot(data_box, widths=0.5, patch_artist=True)

# looping over all boxes to change the line color and fill
for box in bp['boxes']:
    box.set(color='#3d708f', linewidth=2)
    box.set(facecolor='#3d708f')

# looping over all whiskers and caps to change color
for line in bp['whiskers'] + bp['caps']:
    line.set(color='#004c6d', linewidth=2)

# looping over all medians to change color
for median in bp['medians']:
    median.set(color='#94bed9', linewidth=2)

# looping over all outlier marks to change color and appearance
for flier in bp['fliers']:
    flier.set(marker='o', color='#004c6d', alpha=0.2)

# setting xticklabels once, after boxplot() has placed one tick per box
ax.set_xticklabels(['Topic 0', 'Topic 1'])

# setting title
ax.set_title('Pop', pad = 20)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# saving figure
fig.savefig('box1.png', dpi=500)

#### Rock

In [None]:
# shares of topic 0 and topic 1 over the rock test songs; every entry of a
# song is a (topic id, share) pair
topic0 = [entry[1] for song in rock for entry in song if entry[0] == 0]
topic1 = [entry[1] for song in rock for entry in song if entry[0] == 1]

# average share of each topic
avg_topic0 = sum(topic0)/len(topic0)
avg_topic1 = sum(topic1)/len(topic1)

print(avg_topic0)
print(avg_topic1)



In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# figure settings
fig, ax = plt.subplots(figsize=(5, 4))

# data for the boxplot: the collected shares of topic 0 and topic 1
data_box = [topic0,topic1]

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# patch_artist=True enables adding fill colors
bp = ax.boxplot(data_box, widths=0.5, patch_artist=True)

# looping over all boxes to change the line color and fill
for box in bp['boxes']:
    box.set(color='#3d708f', linewidth=2)
    box.set(facecolor='#3d708f')

# looping over all whiskers and caps to change color
for line in bp['whiskers'] + bp['caps']:
    line.set(color='#004c6d', linewidth=2)

# looping over all medians to change color
for median in bp['medians']:
    median.set(color='#94bed9', linewidth=2)

# looping over all outlier marks to change color and appearance
for flier in bp['fliers']:
    flier.set(marker='o', color='#004c6d', alpha=0.2)

# setting xticklabels once, after boxplot() has placed one tick per box
ax.set_xticklabels(['Topic 0', 'Topic 1'])

# setting title
ax.set_title('Rock', pad = 20)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# saving figure
fig.savefig('box2.png', dpi=500)

#### Country

In [None]:
# shares of topic 0 and topic 1 over the country test songs; every entry
# of a song is a (topic id, share) pair
topic0 = [entry[1] for song in country for entry in song if entry[0] == 0]
topic1 = [entry[1] for song in country for entry in song if entry[0] == 1]

# average share of each topic
avg_topic0 = sum(topic0)/len(topic0)
avg_topic1 = sum(topic1)/len(topic1)

print(avg_topic0)
print(avg_topic1)



In [None]:
# font settings
font = {'family' : 'arial',
        'weight' : 'normal',
        'size'   : 16}
plt.rc('font', **font)

# figure settings
fig, ax = plt.subplots(figsize=(5, 4))

# data for the boxplot: the collected shares of topic 0 and topic 1
data_box = [topic0,topic1]

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# patch_artist=True enables adding fill colors
bp = ax.boxplot(data_box, widths=0.5, patch_artist=True)

# looping over all boxes to change the line color and fill
for box in bp['boxes']:
    box.set(color='#3d708f', linewidth=2)
    box.set(facecolor='#3d708f')

# looping over all whiskers and caps to change color
for line in bp['whiskers'] + bp['caps']:
    line.set(color='#004c6d', linewidth=2)

# looping over all medians to change color
for median in bp['medians']:
    median.set(color='#94bed9', linewidth=2)

# looping over all outlier marks to change color and appearance
for flier in bp['fliers']:
    flier.set(marker='o', color='#004c6d', alpha=0.2)

# setting xticklabels once, after boxplot() has placed one tick per box
ax.set_xticklabels(['Topic 0', 'Topic 1'])

# setting title
ax.set_title('Country', pad = 20)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# saving figure
fig.savefig('box3.png', dpi=500)

## Topic modelling with four topics

In [None]:
# token lists of the training half
processed_docs = X_train['word_tokens4']

# vocabulary of the training half, pruned before the corpus is built
dictionary = gensim.corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n = 10000)
# bag-of-words representation of every training song
bow_corpus = [dictionary.doc2bow(doc) for doc in processed_docs]
#n_cores = 12
# number of topics
k = 4
# replaces the earlier lda_model with a model that has four topics
lda_model = gensim.models.LdaModel(bow_corpus, num_topics=k, id2word=dictionary, passes=10, random_state=42)

#lda_model.save('train.gensim')

In [None]:
# Show the 20 highest-weighted words of every topic, one topic per line.
topics = lda_model.print_topics(num_words=20)
for topic_summary in topics:
    print(topic_summary)

In [None]:
def _topic_distributions(docs):
    """Return the LDA topic distribution of every tokenised document in ``docs``.

    Each document is turned into a bag-of-words vector with ``dictionary`` and
    passed to ``lda_model.get_document_topics``; the results are returned as a
    list in the same order as ``docs``.
    """
    return [lda_model.get_document_topics(dictionary.doc2bow(doc)) for doc in docs]


# topic distributions of the test songs, one list per genre
rap = _topic_distributions(X_test_rap)
rock = _topic_distributions(X_test_rock)
pop = _topic_distributions(X_test_pop)
country = _topic_distributions(X_test_country)

In [None]:
# Running lists of mean topic probabilities; each genre cell below appends
# one value per list, so this cell has to be re-run before repeating them.
avg_topic0, avg_topic1, avg_topic2, avg_topic3 = [], [], [], []

### Rap

In [None]:
# Rap: gather, per topic id, the probabilities reported for the rap test songs.
topic0, topic1, topic2, topic3 = [], [], [], []
by_topic = {0: topic0, 1: topic1, 2: topic2, 3: topic3}

for song_topics in rap:
    for topic_id, prob in song_topics:
        if topic_id in by_topic:
            by_topic[topic_id].append(prob)

# Append the mean probability of each topic to the running lists.
# NOTE(review): a topic that is never reported for this genre leaves its list
# empty and the division fails; get_document_topics presumably omits topics
# below the model's minimum probability, so each mean only covers the songs
# in which the topic was reported - confirm this is intended.
running_means = (avg_topic0, avg_topic1, avg_topic2, avg_topic3)
for means, probs in zip(running_means, (topic0, topic1, topic2, topic3)):
    means.append(sum(probs) / len(probs))

for means in running_means:
    print(means)

### Pop

In [None]:
# Pop: gather, per topic id, the probabilities reported for the pop test songs.
topic0, topic1, topic2, topic3 = [], [], [], []
by_topic = {0: topic0, 1: topic1, 2: topic2, 3: topic3}

for song_topics in pop:
    for topic_id, prob in song_topics:
        if topic_id in by_topic:
            by_topic[topic_id].append(prob)

# Append the mean probability of each topic to the running lists.
# NOTE(review): a topic that is never reported for this genre leaves its list
# empty and the division fails; get_document_topics presumably omits topics
# below the model's minimum probability, so each mean only covers the songs
# in which the topic was reported - confirm this is intended.
running_means = (avg_topic0, avg_topic1, avg_topic2, avg_topic3)
for means, probs in zip(running_means, (topic0, topic1, topic2, topic3)):
    means.append(sum(probs) / len(probs))

for means in running_means:
    print(means)

### Rock

In [None]:
# Rock: gather, per topic id, the probabilities reported for the rock test songs.
topic0, topic1, topic2, topic3 = [], [], [], []
by_topic = {0: topic0, 1: topic1, 2: topic2, 3: topic3}

for song_topics in rock:
    for topic_id, prob in song_topics:
        if topic_id in by_topic:
            by_topic[topic_id].append(prob)

# Append the mean probability of each topic to the running lists.
# NOTE(review): a topic that is never reported for this genre leaves its list
# empty and the division fails; get_document_topics presumably omits topics
# below the model's minimum probability, so each mean only covers the songs
# in which the topic was reported - confirm this is intended.
running_means = (avg_topic0, avg_topic1, avg_topic2, avg_topic3)
for means, probs in zip(running_means, (topic0, topic1, topic2, topic3)):
    means.append(sum(probs) / len(probs))

for means in running_means:
    print(means)

### Country

In [None]:
# Country: gather, per topic id, the probabilities reported for the country test songs.
topic0, topic1, topic2, topic3 = [], [], [], []
by_topic = {0: topic0, 1: topic1, 2: topic2, 3: topic3}

for song_topics in country:
    for topic_id, prob in song_topics:
        if topic_id in by_topic:
            by_topic[topic_id].append(prob)

# Append the mean probability of each topic to the running lists.
# NOTE(review): a topic that is never reported for this genre leaves its list
# empty and the division fails; get_document_topics presumably omits topics
# below the model's minimum probability, so each mean only covers the songs
# in which the topic was reported - confirm this is intended.
running_means = (avg_topic0, avg_topic1, avg_topic2, avg_topic3)
for means, probs in zip(running_means, (topic0, topic1, topic2, topic3)):
    means.append(sum(probs) / len(probs))

for means in running_means:
    print(means)

In [None]:
# One row per topic, one column per genre; the column order follows the order
# in which the genre cells appended their averages (Rap, Pop, Rock, Country).
cols = ['Rap','Pop','Rock','Country']
top = pd.DataFrame([avg_topic0, avg_topic1, avg_topic2, avg_topic3], columns=cols)

In [None]:
# Grouped bar chart of the mean topic probabilities in `top`
# (one group per topic, one bar per genre), saved as '4topics.png'.

# font settings
font = {'family': 'arial',
        'weight': 'normal',
        'size': 16}
plt.rc('font', **font)

fig, ax = plt.subplots(figsize=(10, 5))

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# one color per genre column, passed to the plot call directly instead of
# recoloring the axes children by position afterwards
genre_colors = ['#004c6d', '#3d708f', '#6996b3', '#94bed9']

# plot adjustments
top.plot(kind='bar', ax=ax, color=genre_colors)
ax.set_title('Topics', pad=20)  # setting title
ax.set_xlabel(xlabel='')
ax.set_xticklabels(['Topic 0', 'Topic 1', 'Topic 2', 'Topic 3'], rotation=0)
ax.legend(['Rap', 'Pop', 'Rock', 'Country'], frameon=False, loc='best')

# saving figure
fig.savefig('4topics.png', dpi=500)

## Sentiment analysis using VADER

In [None]:
# analyzer function
analyzer = SentimentIntensityAnalyzer()


def tag(i):
    """Return the VADER polarity scores of the text ``i``."""
    return analyzer.polarity_scores(i)


inputs = df_select['lyrics_clean']

# Score the lyrics in parallel with 12 worker processes. map() returns the
# results in input order, and the with-block shuts the workers down
# afterwards instead of leaving them alive.
with Pool(12) as p:
    outputs = p.map(tag, inputs)

# setting the results on the df_select dataframe
df_select['sentiment'] = outputs

Splitting the sentiment dictionary into four columns.

In [None]:
# Spread the score dictionary stored in 'sentiment' over one column per
# score, reading each value with a direct key lookup.
for score_name in ('neg', 'neu', 'pos', 'compound'):
    df_select[score_name] = df_select['sentiment'].apply(
        lambda scores, key=score_name: scores[key]
    )

Plot for the sentiment composition

In [None]:
# Stacked bar chart of the mean negative and positive sentiment score per
# genre tag, saved as 'sentiment.png'.

# Select the two score columns before averaging so that the text columns of
# df_select are never passed to mean().
data_sent = df_select.groupby('tag')[['neg', 'pos']].mean()

# font settings
font = {'family': 'arial',
        'weight': 'normal',
        'size': 16}
plt.rc('font', **font)

fig, ax = plt.subplots(figsize=(10, 5))

# removes frame
for side in ('top', 'right', 'bottom', 'left'):
    ax.spines[side].set_visible(False)

# removing y-ticks and x-ticks
ax.tick_params(axis='both', which='both', length=0)

# one color per stacked series (negative, positive), passed to the plot call
# directly instead of recoloring the axes children by position afterwards
sentiment_colors = ['#004c6d', '#94bed9']

# plot adjustments
data_sent.plot(kind='bar', stacked=True, ax=ax, color=sentiment_colors)
ax.set_title('Sentiment composition', pad=20)  # setting title
ax.set_xlabel(xlabel='')
ax.set_ylim(top=0.35)
ax.set_xticklabels(['Country', 'Pop', 'Rap', 'Rock'], rotation=0)
ax.legend(['Negative', 'Positive'], frameon=False)

# saving figure
fig.savefig('sentiment.png', dpi=500)

In [None]:
# summary statistics of the 'compound' sentiment score for each genre tag
df_select.groupby('tag')['compound'].describe()

## Prediction - Logistic Regression

### Baseline model - only BoW + no grid search

In [None]:
# selecting features (only 'lyrics_clean' is used by this baseline model)
X = df_select[['lyrics_clean','unique_words','word_count','neg', 'pos']]

# target variable
y = df_select['tag']

# splitting the data: 30 % is held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# bag-of-words counts of the lyrics; the vocabulary is fitted on the training
# split only and reused for the test split
# NOTE(review): DataFrameMapper is not among the imports in the Libraries cell
# (it is provided by the sklearn_pandas package) - confirm it is imported.
mapper = DataFrameMapper([
    ('lyrics_clean', CountVectorizer())
])
count_train = mapper.fit_transform(X_train)
count_test = mapper.transform(X_test)

# classifier and fitting
LogReg = LogisticRegression()
LogReg.fit(count_train, y_train)

# prediction
pred = LogReg.predict(count_test)

# fixed label order for the confusion matrix and the report
genre_labels = ['pop','rock','rap','country']

print('Accuracy score of:' + str(metrics.accuracy_score(y_test,pred)))
print(metrics.confusion_matrix(y_test, pred, labels=genre_labels))
# classification_report is reached through the imported sklearn.metrics module
print(metrics.classification_report(y_test, pred, labels=genre_labels))

In [None]:
# Confusion matrix of the current predictions as a labelled table
# (rows: true genre, columns: predicted genre), written to 'table1.csv'.
cols = ['Pop','Rock','Rap','Country']
confusion = metrics.confusion_matrix(y_test, pred, labels=['pop','rock','rap','country'])
table1 = pd.DataFrame(confusion)
table1.columns = cols
table1.index = cols
table1.to_csv('table1.csv')

### Only BoW + grid search on hyper parameters

In [None]:
# GridSearchCV is not part of the Libraries cell, so it is imported here
from sklearn.model_selection import GridSearchCV

# penalty space
penalty = ['l1', 'l2']

# hyperparameter space: 10 values of C, log-spaced between 1 and 10**4
C = np.logspace(0, 4, 10)

# hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# 5-fold cross validation over every (C, penalty) combination; the liblinear
# solver is requested explicitly because it handles both penalties in the grid
clf = GridSearchCV(LogisticRegression(solver='liblinear'), hyperparameters, cv=5, verbose=0)

best_model = clf.fit(count_train, y_train)

# best combination found by the search
best_model.best_params_

In [None]:
# selecting features (only 'lyrics_clean' is used by this model)
X = df_select[['lyrics_clean','unique_words','word_count','neg', 'pos']]

# target variable
y = df_select['tag']

# splitting the data: 30 % is held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# bag-of-words counts of the lyrics; the vocabulary is fitted on the training
# split only and reused for the test split
# NOTE(review): DataFrameMapper is not among the imports in the Libraries cell
# (it is provided by the sklearn_pandas package) - confirm it is imported.
mapper = DataFrameMapper([('lyrics_clean', CountVectorizer())])
count_train = mapper.fit_transform(X_train)
count_test = mapper.transform(X_test)

# classifier and fitting; C and penalty are presumably the values selected by
# the grid search. The liblinear solver is requested explicitly because the
# L1 penalty needs a solver that supports it.
LogReg = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
LogReg.fit(count_train, y_train)

# prediction
pred = LogReg.predict(count_test)

print('Accuracy score of:' + str(metrics.accuracy_score(y_test,pred)))
print(metrics.confusion_matrix(y_test,pred, labels = ['pop','rock','rap','country']))
# classification_report is reached through the imported sklearn.metrics module
print(metrics.classification_report(y_test, pred))

In [None]:
# Confusion matrix of the current predictions as a labelled table
# (rows: true genre, columns: predicted genre), written to 'table2.csv'.
cols = ['Pop','Rock','Rap','Country']
confusion = metrics.confusion_matrix(y_test, pred, labels=['pop','rock','rap','country'])
table1 = pd.DataFrame(confusion)
table1.columns = cols
table1.index = cols
table1.to_csv('table2.csv')

### BoW + additional features and grid search on hyper parameters

In [None]:
# selecting features: the lyrics plus four numeric features
X = df_select[['lyrics_clean','unique_words','word_count','neg', 'pos']]

# target variable
y = df_select['tag']

# splitting the data: 30 % is held out for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

# numeric features are passed through unchanged (transformer None) and are
# combined with the bag-of-words counts of the lyrics
# NOTE(review): DataFrameMapper is not among the imports in the Libraries cell
# (it is provided by the sklearn_pandas package) - confirm it is imported.
mapper = DataFrameMapper([
    (['unique_words','word_count','neg', 'pos'], None),
    ('lyrics_clean', CountVectorizer())
])
count_train = mapper.fit_transform(X_train)
count_test = mapper.transform(X_test)

# classifier and fitting; the liblinear solver is requested explicitly
# because the L1 penalty needs a solver that supports it
LogReg = LogisticRegression(C=1.0, penalty='l1', solver='liblinear')
LogReg.fit(count_train, y_train)

# prediction
pred = LogReg.predict(count_test)

print('Accuracy score of:' + str(metrics.accuracy_score(y_test,pred)))
print(metrics.confusion_matrix(y_test,pred, labels = ['pop','rock','rap','country']))
# classification_report is reached through the imported sklearn.metrics module
print(metrics.classification_report(y_test, pred))

In [None]:
# GridSearchCV is not part of the Libraries cell, so it is imported here
from sklearn.model_selection import GridSearchCV

# penalty space
penalty = ['l1', 'l2']

# hyperparameter space: 10 values of C, log-spaced between 1 and 10**4
C = np.logspace(0, 4, 10)

# hyperparameter options
hyperparameters = dict(C=C, penalty=penalty)

# 5-fold cross validation over every (C, penalty) combination; the liblinear
# solver is requested explicitly because it handles both penalties in the grid
clf = GridSearchCV(LogisticRegression(solver='liblinear'), hyperparameters, cv=5, verbose=0)

best_model = clf.fit(count_train, y_train)

# best combination found by the search
best_model.best_params_

In [None]:
# Confusion matrix of the current predictions as a labelled table
# (rows: true genre, columns: predicted genre), written to 'table3.csv'.
cols = ['Pop','Rock','Rap','Country']
confusion = metrics.confusion_matrix(y_test, pred, labels=['pop','rock','rap','country'])
table1 = pd.DataFrame(confusion)
table1.columns = cols
table1.index = cols
table1.to_csv('table3.csv')