In [59]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import re
import string
import csv

def preprocess_melon(pwd):
    """Load the raw Melon CSV and return a cleaned DataFrame.

    Steps, in order: drop the first CSV column, drop duplicate rows, fill
    missing ``flac`` values, strip non-digits from ``like``/``reply``, drop the
    ``lyricist``/``composer``/``arranger`` columns, drop rows that still hold a
    null or an empty ``like``, cast ``like``/``reply`` to int64, keep only
    sufficiently frequent genres that are not on the exclusion list, merge
    related genres, and replace ``date`` with ``year`` and ``season``.

    Parameters
    ----------
    pwd : str or path-like
        Location of the CSV file; it is read with the ``cp949`` encoding.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. The index is not reset, so it keeps the row labels
        of the rows that survived filtering.
    """
    # Only the empty string counts as missing (keep_default_na=False), so
    # literal values such as "NA" are kept as text.
    # The first column is dropped by position -- presumably a saved index;
    # TODO confirm against the raw file.
    melon = pd.read_csv(pwd, encoding='cp949', keep_default_na=False, na_values="").iloc[:, 1:]

    melon = melon.drop_duplicates()

    # Assign back instead of a chained `inplace=True` call: the chained form
    # does not update the frame when pandas copy-on-write is active.
    melon["flac"] = melon["flac"].fillna('No_Flac')

    # regex=True is required: since pandas 2.0 the default is a literal
    # replace, which would leave separators/units in place and break the cast.
    for counter in ("like", "reply"):
        melon[counter] = melon[counter].str.replace("[^0-9]", "", regex=True)

    melon = melon.drop(["lyricist", "composer", "arranger"], axis=1)

    # Boolean masks are inverted with `~`; unary minus is rejected for booleans.
    has_null = melon.isnull().any(axis=1)
    melon = melon.loc[~has_null, :]

    # Rows whose like count contained no digit at all are now "" -- drop them.
    # .copy() so the column assignments below act on an independent frame.
    melon = melon.loc[melon["like"] != "", :].copy()

    melon = melon.astype({"like": "int64", "reply": "int64"})

    # Keep genres that occur more than 10 times and are not explicitly excluded.
    genre_counts = melon["genre"].value_counts()
    frequent_genres = genre_counts[genre_counts > 10].index
    excluded_genres = ["Korean Traditional", "Pop", "Vocal/Choral", "가톨릭음악", "국내CCM", "기타", "동요", "워십", "창작동요"]
    kept_genres = [genre for genre in frequent_genres if genre not in excluded_genres]
    melon = melon.loc[melon["genre"].isin(kept_genres), :].copy()

    # Merge related genres under a combined label; other genres pass through.
    merged_label = {
        "Animation": "Animation / Game",
        "Game": "Animation / Game",
        "Crossover": "Crossover / Musical",
        "Musical": "Crossover / Musical",
        "Drama": "Drama / Korean Movie",
        "Korean Movie": "Drama / Korean Movie",
        "Blues": "Blues / Jazz / New Age",
        "Jazz": "Blues / Jazz / New Age",
        "New Age": "Blues / Jazz / New Age",
        "Electronica": "Electronica / Rock",
        "Rock": "Electronica / Rock",
        "Electronica,Rock": "Electronica / Rock",
    }
    melon["genre"] = melon["genre"].map(lambda genre: merged_label.get(genre, genre))

    # Year is characters 0-3 and month characters 5-6 of the date string
    # (assumes a "YYYY?MM..." layout -- TODO confirm). Any month outside
    # 03-11 falls back to "winter".
    season_by_month = {
        "03": "spring", "04": "spring", "05": "spring",
        "06": "summer", "07": "summer", "08": "summer",
        "09": "fall", "10": "fall", "11": "fall",
    }
    melon["year"] = [stamp[:4] for stamp in melon["date"]]
    melon["season"] = [season_by_month.get(stamp[5:7], "winter") for stamp in melon["date"]]

    melon = melon.drop(["date"], axis=1)

    return melon

def match_genie(pwd, melon):
    """Attach the Genie ``cnt`` value to each Melon row, matched on title and artist.

    Parameters
    ----------
    pwd : str or path-like
        Location of the Genie CSV; it is read with the ``cp949`` encoding and
        its first two columns are dropped by position.
    melon : pandas.DataFrame
        Frame with at least ``title`` and ``artist`` string columns. It is not
        modified.

    Returns
    -------
    pandas.DataFrame
        ``melon`` left-joined with Genie's ``cnt`` column. Rows without a Genie
        match get ``cnt == 0``; ``cnt`` is int64.
    """
    genie = pd.read_csv(pwd, encoding='cp949', keep_default_na=False, na_values="").iloc[:, 2:]
    genie = genie.drop_duplicates()

    # Build the "<title>_<artist>" join key on copies so the caller's frame
    # does not gain a helper column as a side effect.
    melon_keyed = melon.assign(
        title_artist=["_".join(pair) for pair in zip(melon["title"], melon["artist"])]
    )
    genie_keyed = genie.assign(
        title_artist=["_".join(pair) for pair in zip(genie["title"], genie["artist"])]
    )

    # Select the count column by name rather than by position, so the merge
    # does not depend on `cnt` being the last column of the Genie file.
    genie_counts = genie_keyed[["cnt", "title_artist"]]

    # NOTE(review): a left merge repeats a Melon row once per matching Genie
    # row -- confirm that title/artist pairs are unique in the Genie data.
    df = pd.merge(melon_keyed, genie_counts, how="left", on="title_artist")

    # Assign back instead of a chained `inplace=True` call: the chained form
    # does not update the frame when pandas copy-on-write is active, which
    # would leave NaN behind and make the integer cast fail.
    df["cnt"] = df["cnt"].fillna(0).astype("int64")

    return df.drop(columns="title_artist")

def preprocess_lyric():
    """Placeholder for lyric text preprocessing.

    Not implemented yet: takes no arguments, does no work, returns None.
    """
    return


if __name__ == "__main__":
    # Input CSV locations, relative to the working directory.
    pwd_melon = "./bind_data_melon/melon.csv"
    pwd_genie = "./bind_data_genie/my.csv"

    # Set to True to also write the intermediate and final frames to disk.
    save_outputs = False

    # Run the pipeline so that `melon` and `df` exist after this cell on a
    # fresh kernel; later cells read `df`.
    melon = preprocess_melon(pwd_melon)
    if save_outputs:
        melon.to_csv("./bind_data_melon/preprocessed_melon.csv", index=False)

    # match genie with melon
    df = match_genie(pwd_genie, melon)
    if save_outputs:
        df.to_csv('bind_data_melon/melon_df.csv', encoding='euc-kr')

    # text preprocessing: not wired in yet (preprocess_lyric is still a stub)
    
    

In [60]:
# Display the first rows of `df` as this cell's output.
# NOTE(review): `df` must already exist in the kernel when this cell runs --
# confirm the cell above has been executed with its match_genie step enabled.
df.head()

Unnamed: 0,title,artist,album,genre,flac,like,reply,lyric,year,season,cnt
0,더 (PROD. 브라더수),유승우,ROMANCE,Ballad,Flac 16bit,975,6,더 가까워지지 말자 그냥 그 정도까지만 다치고 싶지 않아 너도 그렇잖아 똑같은 맘이...,2017,fall,0
1,아주 칭찬해,타린,아주 칭찬해,Folk,Flac 16/24bit,134,1,바람이 불어와서 상쾌한 날 기분 좋아 가벼운 걸음걸음 청아한 하늘을 바라본다 눈 감...,2017,fall,0
2,SWEATY (Feat. Crush),SAAY,SWEATY (Feat. Crush),R&B / Soul,Flac 16bit,479,6,천천히 더 가까워졌지 널 봤을 때 난 마치 어린 소녀 같았지 서두를 것 없이 난 이...,2017,fall,0
3,OK (Prod. by GRAY),BewhY (비와이),슬기로운 감빵생활 OST Part.1,Drama / Korean Movie,Flac 16/24bit,952,13,넌 너가 너 내가 나인 것 같니 자유가 결여되어 사는 삶이 You 나의 어제를 지워...,2017,fall,0
4,WU & THE 1LLY (Feat. Inspectah Deck and Masta ...,"Dok2, The Quiett, 김효은",WU & THE 1LLY,Rap / Hip-hop,Flac 16bit,383,11,Song gets rougher while I'm rhyming Pretty wom...,2017,fall,0
