# Making predictions
Predicting artist gender and genre from song lyrics.
A follow-up to [*Trucks and Beer*](http://www.johnwmillr.com/trucks-and-beer/), my textual analysis of 12k+ country song lyrics.

---

## Cleaning + Formatting Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
import re
from collections import Counter

fs = 16 # fontsize (not referenced in the cells shown here; presumably used for plot text later — confirm)

---
## Load the data

In [32]:
# Load the raw artist records from disk. The context manager closes the file
# handle as soon as parsing finishes instead of leaving it open until the
# garbage collector happens to reclaim it.
with open('data/lyrics/rap_rock_country.json') as infile:
    artists = json.load(infile)

In [37]:
# Gender
# One gender label per artist, in the same order as `artists`.
labels_gender = np.array([a['gender'] for a in artists])

# Boolean masks over the artists, computed once with vectorized comparisons
# and reused for the counts below. A label that is neither 'female' nor
# 'male' is False in both masks, so the two counts need not sum to
# len(artists).
mask_female = labels_gender == 'female'
mask_male   = labels_gender == 'male'
print("You have lyrics from {} artists.\nFemale: {}, male: {}"
      .format(len(artists), mask_female.sum(), mask_male.sum()))

def song_is_lyrics(title):
    """Decide whether a title looks like a real song rather than album filler.

    The title is searched case-insensitively, anywhere within the string, for
    markers such as "tracklist", "liner notes", "credits", "remix",
    "interview" or "skit". Matching is by substring, so a marker embedded in
    a longer word (for example "skit" inside "Skittles") also counts as a hit.

    Args:
        title: Song title to inspect (a string).

    Returns:
        bool: True when no marker is found, False when any marker is present.
    """
    non_lyrics_markers = r"(tracklist)|(track list)|(album art(work)?)|(liner notes)|(booklet)|(credits)|(remix)|(interview)|(skit)|(a collection)"
    hit = re.search(non_lyrics_markers, title, re.IGNORECASE)
    return hit is None

# Flatten the per-artist catalogues into one list of usable songs, and record
# each genre the first time it is encountered.
all_songs = []
genres = []
for artist, gender in zip(artists, labels_gender):
    for song in artist['songs']:
        # Every song is tagged with its artist's gender, whether kept or not
        song['gender'] = gender

        # Drop entries with 10 or fewer space-separated tokens.
        # NOTE: split(' ') breaks on single spaces only, so words separated
        # solely by newlines are counted as one token.
        if len(song['lyrics'].split(' ')) <= 10:
            continue
        # Drop tracklists, liner notes, skits and similar non-lyrics pages
        if not song_is_lyrics(song['title']):
            continue

        all_songs.append(song)
        if song['genre'] not in genres:
            genres.append(song['genre'])

song_count = len(all_songs)
print("Database contains {} songs.".format(song_count))

# Concatenate the lyrics of every kept song into one space-separated string
all_lyrics = " ".join(entry['lyrics'] for entry in all_songs)

You have lyrics from 303 artists.
Female: 79, male: 222
Database contains 55013 songs.


In [38]:
# Save the cleaned song lyrics in JSON format
with open('data/lyrics/all_songs.json', 'w') as outfile:
    json.dump(all_songs, outfile)

### Read in the cleaned JSON data as a DataFrame

In [2]:
# Load the saved song records from the JSON file into a DataFrame
songs = pd.read_json(path_or_buf='./data/lyrics/all_songs.json')