In [4]:
import numpy as np
import pandas as pd


def genres_separate(genre_ids):
    """
    Count how often each individual genre id occurs.

    A single entry may list several genre ids joined by '|'
    (e.g. "465|444"); the entry is split and every id is counted on its own.

    Input: iterable of genre fields, e.g. the "genre_ids" column of songs.csv.
           Entries that are not strings (such as NaN/None for a missing
           genre) are skipped.
    Output: a dictionary where key=distinct genre id (as a string),
            value=number of times that genre id appears across all entries.
    """
    genre_dictionary = {}
    for genre_id in genre_ids:
        # isinstance rather than an exact type comparison, so that str
        # subclasses (e.g. numpy.str_) are counted instead of being dropped.
        if not isinstance(genre_id, str):
            continue
        for genre in genre_id.split('|'):
            genre_dictionary[genre] = genre_dictionary.get(genre, 0) + 1

    return genre_dictionary
        

# Smoke test for genres_separate: read songs.csv, take its "genre_ids"
# column and count the genre ids found in it.
songs = pd.read_csv('songs.csv') 
genre_ids = songs["genre_ids"]

# Last expression of the cell, so the returned dictionary is shown as the cell output.
genres_separate(genre_ids)


{'465': 589220,
 '444': 16097,
 '726': 36766,
 '864': 8393,
 '857': 7693,
 '850': 8382,
 '843': 8413,
 '458': 17857,
 '352': 5706,
 '1995': 4974,
 '2157': 1003,
 '359': 48144,
 '1609': 177258,
 '139': 56405,
 '873': 20513,
 '1955': 21426,
 '1011': 34620,
 '2022': 176531,
 '2122': 149608,
 '786': 59438,
 '947': 30232,
 '242': 14476,
 '1259': 103904,
 '921': 74983,
 '2107': 4967,
 '958': 182836,
 '880': 15430,
 '481': 1318,
 '125': 18733,
 '109': 20659,
 '798': 7104,
 '451': 13391,
 '1152': 65463,
 '940': 45604,
 '1082': 1684,
 '545': 4609,
 '437': 17441,
 '829': 13155,
 '430': 7507,
 '1969': 1835,
 '388': 27608,
 '94': 4714,
 '1572': 27311,
 '275': 25808,
 '409': 6568,
 '893': 7778,
 '1616': 26983,
 '712': 503,
 '2130': 11586,
 '2086': 11393,
 '374': 8849,
 '1568': 1211,
 '1138': 11050,
 '474': 1851,
 '1180': 11120,
 '1068': 959,
 '423': 12302,
 '184': 714,
 '744': 46,
 '691': 20248,
 '2189': 2674,
 '367': 2154,
 '719': 406,
 '2072': 7344,
 '1977': 1908,
 '402': 716,
 '1287': 1445,
 '20

In [8]:
def song_play_times(song_ids):
    """
    Tally how many times each value occurs in ``song_ids``.

    input: iterable of song ids (e.g. the "song_id" column of train.csv).
    output: a dictionary with key=song_id, value=number of times it appears,
            i.e. the number of times the song is played.

    The counting itself is generic, so the same function can be given a
    sequence of language codes instead, yielding key=language,
    value=number of songs in that language.
    """
    play_counts = {}

    for sid in song_ids:
        try:
            # Already seen: bump the existing tally.
            play_counts[sid] += 1
        except KeyError:
            # First occurrence of this id.
            play_counts[sid] = 1

    return play_counts

# Smoke test for song_play_times: read train.csv, take its "song_id"
# column and count how often each id occurs. The call is the last
# expression of the cell, so the returned dictionary is shown as the cell output.
train_data = pd.read_csv('train.csv') 
song_ids = train_data["song_id"]
song_play_times(song_ids)

{'BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=': 215,
 'bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=': 1,
 'JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=': 4,
 '2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=': 1,
 '3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=': 412,
 '3Hg5kugV1S0wzEVLAEfqjIV5UHzb7bCrdBRQlGygLvU=': 1108,
 'VkILU0H1h3NMmk9MQrXouNudGk5n8Ls5cqRRuBxeTh4=': 3869,
 'bPIvRTzfHxH5LgHrStll+tYwSQNVV8PySgA3M1PfTgc=': 287,
 '/bU6IRSK+YNlNbaTkxo7bhsb2EDLPrnksdX3ggcZNhI=': 31,
 'EbI7xoNxI+3QSsiHxL13zBdgHIJOwa3srHd7cDcnJ0g=': 738,
 't0aT90DlS1TGncgnKoL0SvfAWEr3Dl72QBVcokmKfLc=': 2,
 '8FGjC9W+7F8WjheGZPAwX9RH3+nWSO7DzjM6EB6naOI=': 875,
 'u6/Pb7X4u7KU4gXrBgGqt8RlRrNNFLn03tLAHyxRxwA=': 5740,
 'TYhx9eqWklddkLQlApQ5MS9jJCO4H3JHdpISZ3kZSRE=': 3251,
 'IgMar/mVrJQ+ODFPytDf7jwQMmR09+slyZUdVylRFLc=': 6830,
 '6HofPS0v2MVFsL10yCN7dXUL+gUOnvsD35vx3HmRbdE=': 3084,
 'a4TbK5V15pj3YZUOGa9h2U3t0OsE+3aiFw41mNlcgcw=': 2753,
 't95ClWf/B7Hi46sJeL70WJ75u7gLwaAp8o7LHuTtxl0=': 1194,
 '4ZISq5iNKgBGGW2OvKBBzBsYXRo

In [10]:
def song_language(languages):
    """
    Count how many songs are in each language.

    input: iterable of language codes, one per song (each language is
           represented by a different number), e.g. the "language" column
           of songs.csv.
    output: a dictionary with key=language, value=number of songs in this
            specific language. Missing values (float NaN) are all counted
            together under a single NaN key.
    """
    song_language_dict = {}
    nan_key = None  # the one NaN object used as the key for every missing value

    for language in languages:
        # NaN never compares equal to itself, so distinct NaN objects would
        # each become a separate key with a count of 1. Route every NaN to
        # the first NaN object seen so missing values are aggregated.
        if isinstance(language, float) and language != language:
            if nan_key is None:
                nan_key = language
            language = nan_key
        song_language_dict[language] = song_language_dict.get(language, 0) + 1

    return song_language_dict

# Smoke test for song_language: read songs.csv, take its "language" column
# and count the songs per language. The call is the last expression of the
# cell, so the returned dictionary is shown as the cell output.
# NOTE(review): songs.csv is also read into `songs` in an earlier cell; this
# reads the same file again under a separate name.
languages_songs = pd.read_csv('songs.csv') 
languages = languages_songs["language"]
song_language(languages)

{3.0: 106295,
 31.0: 39201,
 52.0: 1336694,
 17.0: 92518,
 10.0: 15482,
 -1.0: 639467,
 24.0: 41744,
 59.0: 8098,
 45.0: 14435,
 38.0: 2385,
 nan: 1}