In [1]:
import pandas as pd
import string
from typing import List

In [2]:
songs = pd.read_pickle('atu_cleaned/songs.pickle')
songs = songs.sort_values('name')

In [3]:
VALID_CHARS = [' '] + list(string.ascii_lowercase) + list('0123456789')
VOWELS = list('aeiou')

# https://stackoverflow.com/a/38760564
units = ["zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
        "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
        "sixteen", "seventeen", "eighteen", "nineteen"]
units.reverse()
UNITS = {units[i] : 19-i for i in range(len(units))}
tens = ["", "", "twenty", "thirty", "forty", "fifty", "sixty", "seventy", "eighty", "ninety"]
TENS = {tens[i] : i for i in range(2,len(tens))}

In [4]:
def clean_song_name(song_name: str) -> str:
    """Return the song name after it is cleaned"""
    
    # lower case and remove invalid chars
    song_name = song_name.lower() 
    song_name = song_name.replace(' & ', ' n ')
    song_name = song_name.replace("/", ' ')
    song_name = ''.join([i for i in song_name if i in VALID_CHARS])
    
    # abbreviations
    abbreviations = {'chapter' : 'ch',
                     'cross' : 'x',
                     'avenue' : 'ave',
                     'example': 'ex',
                     'birthday': 'bday',
                     'doctor': 'dr',
                     'halloween': 'halow'}
    for k, v in abbreviations.items():
        song_name = song_name.replace(k,v)
    
    song_name = song_name.split()
    
    # word replacement abbreviations
    abbreviations = {'and' : 'n'}
    for k, v in abbreviations.items():
        song_name = replace(k, str(v), song_name)
    
    # map string numbers
    for k, v in TENS.items():
        song_name = replace(k, str(v), song_name)
    for k, v in UNITS.items():
        song_name = replace(k, str(v), song_name)
    song_name = replace('eightyfive', '85', song_name)
    song_name = replace('fortysix', '46', song_name)

    song_name = '<' + ' '.join(song_name) + '>'
    song_name = song_name.split()
    
    # roman numerals -> digits
    song_name = replace('i>', '1', song_name)
    song_name = replace('ii>', '2', song_name)
    song_name = replace('iii>', '3', song_name)
    
    song_name = ' '.join(song_name)
    song_name = song_name.strip('<>')
    
    return song_name
    
def replace(old: str, new: str, lst: List[str]) -> List[str]:
    """Replace instances of old with new in the given list."""
    return [new if x==old else x for x in lst]

In [5]:
# keep track of the frequency of words in english
with open('data/common_english_words.txt') as f:
    words = [w.rstrip() for w in f.readlines()]
freq = {words[i] : i for i in range(len(words))}
def word_freq(word: str) -> int:
    if word in freq:
        rank = freq[word]
    else:
        rank = len(freq)
    if word[-1] == 's':
        return min(rank, word_freq(word[:-1]))
    return rank

# keep track of the frequency of words in song names
song_names = [x for s in songs['name'] for x in clean_song_name(s).split()]
word_freq_in_names = pd.Series(song_names).value_counts().to_dict()

In [6]:
def generate_code(name: str, blacklist: List[str]) -> str:
    """Given a song name, try to generate a song code."""
    
    words = name.split()
    words_by_freq = sorted(words, key=lambda x: word_freq(x), reverse=True)
    words_by_name_freq = sorted(words, key=lambda x: word_freq_in_names[x])
    
    # 5 characters or less (without spaces)
    # Check if 4-5 chars before removing vowels then after
    word = name.replace(' ', '')
    if (len(word) < 6):
        return word
    else:
        code = word[0] + "".join([c for c in word[1:] if c not in VOWELS])
        if ((len(word) < 8) & (len(code) in [4,5])):
            return code
        
    # Exactly 4 or 5 numbers
    code = ''.join([i for i in name if i in '0123456789'])
    if (len(code) in [4,5]):
        return code
        
    # First five letter acronym for songs with more than 5 words
    if (len(words) > 5):
        return "".join([w[0] for w in words if word_freq(w) > 10][:5])
           
    # Contains word unique in all song names (4-5 chars)
    for word in words_by_name_freq:
        if word in blacklist: 
            continue
        if ((word_freq_in_names[word] == 1) & (len(word) in [4,5])):
            return word
    
    # Contains very unique word (4-5 chars)
    for word in words_by_freq:
        if word in blacklist: 
            continue
        if (word_freq(word) > 2000) & (len(word) in [4,5]):
            return word

    # Contains word unique in all song names (4-5 chars no vowels)
    for word in words_by_name_freq:
        if word in blacklist: 
            continue
        code = word[0] + "".join([c for c in word[1:] if c not in VOWELS])
        if ((word_freq_in_names[word] == 1) & (len(code) in [4,5])):
            return code
    
    # Contains very unique word (4-5 chars no vowels)
    for word in words_by_freq:
        if word in blacklist: 
            continue
        code = word[0] + "".join([c for c in word[1:] if c not in VOWELS])
        if ((word_freq(word) > 100) & (len(code) in [4,5])):
            return code
        
    # Acronym for songs with 4-5 words
    if (len(words) in [3,4,5]):
        return "".join([w[0] for w in words][:5])
    
    # First 5 letters of longest word
    return sorted(words, key=lambda x: len(x), reverse=True)[0].replace(" ","")[:5]

In [7]:
song_codes = {}
blacklist = []
for index, row in songs.iterrows():
    song_title = row['name']
    is_original = bool(row['original'])
    name = clean_song_name(song_title)    
    code = generate_code(name, blacklist)
    blacklist.append(code)
    song_codes[song_title] = code

In [8]:
print('%s' % len(song_codes))
print('%0.2f%%' % ((len(song_codes) / len(songs))*100) )
code_freq = pd.Series(song_codes.values()).value_counts().to_dict()
print(sum([1 for x in code_freq.values() if x == 1]))
print(sum([1 for x in code_freq.values() if x == 1]) / len(song_codes) * 100)
{k : [n for n in songs['name'] if song_codes[n] == k] for k,v in code_freq.items() if v > 1}

975
100.00%
903
92.61538461538461


{'rght': ["It's All Right", 'Make It Right', 'Right Off'],
 'blck': ['Back In Black', 'Black Water', 'In The Black'],
 'nght': ['All Night Long', 'Night Nurse', 'Still of the Night'],
 'anthm': ['Anthem', 'Bullshit Anthem', 'The National Anthem'],
 'thrs': ["There's No Crying In Mexico", 'Theresa'],
 'frdm': ["Freedom of '76", "Freedom! '90"],
 'sungl': ['Cheap Sunglasses', 'Sweet Sunglasses'],
 'stran': ['Strangletage', 'The Stranger'],
 'amrcn': ['Great American', "We're An American Band"],
 'nghts': ['Ace of Long Nights', 'Hollywood Nights'],
 'sweet': ['Home Sweet Home', 'Sweetness'],
 'clsr': ['Closer', 'Come Closer'],
 'wght': ['The Weight', 'The Weight Around'],
 'alwys': ['Always Up', "She's Always a Woman"],
 'alrig': ['Alright', "Feelin' Alright"],
 'trpl': ['The Triple Wide', 'The Triple Wide/Robot Rock'],
 'phils': ["Phil's 7 Nation Money Farm", "Phil's Farm"],
 'strut': ['"Mrs Robinson\'s Strut"', 'Gut Strut'],
 'lvng': ['Living In America', 'Living On a Farm'],
 'brnn': [

In [9]:
{v:k for (k,v) in song_codes.items()}

{'jake': '"Brendan & Jake switch rigs"',
 'strut': 'Gut Strut',
 '19': '#19',
 '5': '#5',
 'dont': "(Don't Fear) The Reaper",
 'ntrt': '(Night Time Is) The Right TIme',
 'sdb': "(Sittin' On) The Dock of the Bay",
 'jstc': '...And Justice for All',
 'snshn': '...And We Became Sunshine',
 '1000': '1000 Places to See Before You Die',
 '10th': '10th Grade',
 '13dys': '13 Days',
 '1348': '1348',
 '1901': '1901 Jump',
 '1999': '1999',
 '2564': '25 or 6 to 4',
 'self': '2nd Self',
 '2x2': '2x2',
 '316': '316',
 'theme': "40's Theme",
 '5wlyl': '50 Ways to Leave Your Lover',
 '515': '5:15',
 '930': '9:30',
 'fifth': 'A Fifth of Beethoven',
 'agogo': 'A Go Go',
 'sleep': 'A Half Sleep',
 'sprm': 'A Love Supreme',
 'mild': 'A Mild Sedative',
 'mdly': 'AC/DC Medley',
 'abcb': 'Abacab',
 'nghts': 'Hollywood Nights',
 'kiss': 'Addicted to Kiss',
 'afrc': 'Africa',
 'after': 'After Midnight',
 'odds': 'Against All Odds',
 'ahab': 'Ahab',
 'anfih': "Ain't No Fun (If the Homies Can't Have None)",
 'ai

In [10]:
pd.Series([len(x) for x in song_codes.values()]).value_counts()

5    482
4    407
3     82
1      2
2      2
dtype: int64