In [46]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import re
import string
import csv

def preprocess_melon(pwd):
    """Load the raw Melon CSV and return a cleaned DataFrame.

    Steps, in order:
      1. Read the cp949-encoded CSV, treating only empty fields as missing,
         and discard the first column.
      2. Drop duplicate rows.
      3. Fill missing ``flac`` values with ``"No_Flac"`` and strip every
         non-digit character from ``like`` and ``reply``.
      4. Drop the ``lyricist``, ``composer`` and ``arranger`` columns.
      5. Drop rows that still contain a null, then rows whose ``like`` has
         no digits left.
      6. Cast ``like`` and ``reply`` to ``int64``.
      7. Keep only genres that occur more than 10 times and are not in the
         exclusion list, then merge related genres under combined labels.
      8. Derive ``year`` (characters 0-3 of ``date``) and ``season`` (from
         characters 5-6 of ``date``), then drop ``date``.

    Parameters
    ----------
    pwd : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. Row labels are those left over from filtering;
        the index is not reset.
    """
    # Genres removed even when they are frequent enough to pass the count filter.
    excluded_genres = ["Korean Traditional", "Pop", "Vocal/Choral", "가톨릭음악",
                       "국내CCM", "기타", "동요", "워십", "창작동요"]

    # Original genre -> combined label; genres not listed keep their own name.
    merged_genres = {
        "Animation": "Animation / Game",
        "Game": "Animation / Game",
        "Crossover": "Crossover / Musical",
        "Musical": "Crossover / Musical",
        "Drama": "Drama / Korean Movie",
        "Korean Movie": "Drama / Korean Movie",
        "Blues": "Blues / Jazz / New Age",
        "Jazz": "Blues / Jazz / New Age",
        "New Age": "Blues / Jazz / New Age",
        "Electronica": "Electronica / Rock",
        "Rock": "Electronica / Rock",
        "Electronica,Rock": "Electronica / Rock",
    }

    # Two-character month -> season; any other value falls through to "winter".
    season_by_month = {
        "03": "spring", "04": "spring", "05": "spring",
        "06": "summer", "07": "summer", "08": "summer",
        "09": "fall", "10": "fall", "11": "fall",
    }

    # import data
    melon = pd.read_csv(pwd, encoding='cp949', keep_default_na=False, na_values="").iloc[:, 1:]

    # drop duplicates (returns a new frame, so later column writes are safe)
    melon = melon.drop_duplicates()

    # convert data
    # Assign the filled column back instead of a chained in-place fillna,
    # which does not reliably update the parent frame.
    melon["flac"] = melon["flac"].fillna("No_Flac")
    for column in ("like", "reply"):
        # regex=True is required: pandas >= 2.0 treats the pattern literally
        # by default, which would leave the non-digit characters in place.
        melon[column] = melon[column].str.replace("[^0-9]", "", regex=True)

    # drop unnecessary features
    melon = melon.drop(columns=["lyricist", "composer", "arranger"])

    # delete rows with null (boolean masks are inverted with ~, not unary minus)
    has_null = melon.isnull().any(axis=1)
    melon = melon.loc[~has_null, :]

    # delete rows with no like
    melon = melon.loc[melon["like"] != "", :]

    # convert type of features
    melon = melon.astype({"like": "int64", "reply": "int64"})

    # delete unnecessary genres
    genre_counts = melon["genre"].value_counts()
    frequent_genres = genre_counts[genre_counts > 10].index
    kept_genres = [genre for genre in frequent_genres if genre not in excluded_genres]
    melon = melon.loc[melon["genre"].isin(kept_genres), :].copy()

    # compress genre
    melon["genre"] = melon["genre"].map(lambda genre: merged_genres.get(genre, genre))

    # make features "year", "mon", "season"
    melon["year"] = melon["date"].str[:4]
    melon["mon"] = melon["date"].str[5:7]
    melon["season"] = melon["mon"].map(season_by_month).fillna("winter")

    # drop unnecessary features
    return melon.drop(columns=["date", "mon"])

def match_genie(pwd):
    """Read the Genie CSV and return it trimmed and de-duplicated.

    The file is read as cp949 with only empty fields treated as missing.
    The first two columns are discarded by position, and exact duplicate
    rows are removed. Row labels of the surviving rows are left as they
    were in the file.

    Parameters
    ----------
    pwd : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The remaining columns with duplicate rows dropped.
    """
    raw_table = pd.read_csv(pwd, encoding='cp949', keep_default_na=False, na_values="")

    # Positional slice: keep everything from the third column onward.
    trimmed = raw_table.iloc[:, 2:]

    return trimmed.drop_duplicates()



# Driver: runs only when this code executes as the main module.
if __name__ == "__main__":
    # Relative paths to the raw Melon and Genie CSV files.
    pwd_melon = "./bind_data_melon/melon.csv"
    pwd_genie = "./bind_data_genie/my.csv"
    
    # Melon preprocessing, currently disabled: cleans the raw file and writes
    # the result to preprocessed_melon.csv without the index.
#     melon = preprocess_melon(pwd_melon)
#     melon.to_csv("./bind_data_melon/preprocessed_melon.csv", index=False)
    
    # Disabled check: per-column flag showing which columns contain nulls.
#     melon.isnull().any(axis=0)
    
    # Load the Genie table (first two columns dropped, duplicate rows removed).
    # NOTE(review): despite its name, match_genie only loads and de-duplicates;
    # it does not join against the Melon data.
    genie = match_genie(pwd_genie)

In [47]:
# Preview the first rows of the de-duplicated Genie table.
genie.head()

Unnamed: 0,Title,Artist,Album,Cnt
0,매일 듣는 노래 (A Daily Song),황치열,Be ordinary,81
1,빌려줄게,신용재 (포맨),EMPATHY,71
2,들리나요,The One (더원),들리나요,63
3,숨쉬는 모든 날,범키,수상한 파트너 OST Part.6 (SBS 수목드라마),62
4,봄날의 소나기 (Paper Umbrella),예성 (YESUNG),Spring Falling - The 2nd Mini Album,56
