In [116]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import re
import string
import csv
import time

from konlpy.tag import Okt
from gensim.models import Word2Vec
from gensim import corpora

def preprocess_melon(pwd):
    """Load the raw Melon CSV export and return a cleaned DataFrame.

    Steps, in order:
      1. Read the cp949-encoded CSV (only empty fields count as missing)
         and discard its first column.
      2. Drop exact duplicate rows.
      3. Fill missing ``flac`` with ``'No_Flac'`` and strip every non-digit
         character from ``like`` and ``reply``.
      4. Drop the ``lyricist``, ``composer`` and ``arranger`` columns.
      5. Drop rows that still contain a missing value, then rows whose
         ``like`` has no digits left.
      6. Cast ``like`` and ``reply`` to ``int64``.
      7. Keep genres that occur more than 10 times and are not in the
         exclusion list, then merge related genres under a combined label.
      8. Derive ``year`` (first four characters of ``date``) and ``season``
         (from characters 5-6 of ``date``), then drop ``date``.

    Args:
        pwd: Path of the CSV file to read.

    Returns:
        The cleaned DataFrame. Row labels are those of the rows that
        survived the filters; the index is not reset.

    Raises:
        ValueError: If a surviving row's ``reply`` contains no digits, since
            the empty string cannot be cast to ``int64``.
    """
    excluded_genres = {"Korean Traditional", "Pop", "Vocal/Choral", "가톨릭음악",
                       "국내CCM", "기타", "동요", "워십", "창작동요"}
    merged_genres = {
        "Animation": "Animation / Game",
        "Game": "Animation / Game",
        "Crossover": "Crossover / Musical",
        "Musical": "Crossover / Musical",
        "Drama": "Drama / Korean Movie",
        "Korean Movie": "Drama / Korean Movie",
        "Blues": "Blues / Jazz / New Age",
        "Jazz": "Blues / Jazz / New Age",
        "New Age": "Blues / Jazz / New Age",
        "Electronica": "Electronica / Rock",
        "Rock": "Electronica / Rock",
        "Electronica,Rock": "Electronica / Rock",
    }
    season_by_month = {
        "03": "spring", "04": "spring", "05": "spring",
        "06": "summer", "07": "summer", "08": "summer",
        "09": "fall", "10": "fall", "11": "fall",
    }

    # import data
    # "like"/"reply" are read as text so the digit clean-up below also works
    # when every value in the file happens to be purely numeric.
    melon = pd.read_csv(pwd, encoding='cp949', keep_default_na=False, na_values="",
                        dtype={"like": str, "reply": str}).iloc[:, 1:]

    # drop duplicates
    melon = melon.drop_duplicates()

    # convert data
    # Plain column assignment instead of chained ``inplace=True`` so the
    # update is not lost when pandas hands back a copy.
    melon["flac"] = melon["flac"].fillna('No_Flac')
    for column in ("like", "reply"):
        # regex=True is explicit: the pattern is a character class, and the
        # default of ``Series.str.replace`` differs between pandas versions.
        melon[column] = melon[column].str.replace(r"[^0-9]", "", regex=True)

    # drop unnecessary features
    melon = melon.drop(columns=["lyricist", "composer", "arranger"])

    # delete rows with null, then rows with no like
    melon = melon.dropna()
    melon = melon[melon["like"] != ""].copy()

    # convert type of features
    for column in ("like", "reply"):
        melon[column] = melon[column].astype("int64")

    # delete unnecessary genres
    genre_counts = melon["genre"].value_counts()
    genre_list = [genre for genre in genre_counts[genre_counts > 10].index
                  if genre not in excluded_genres]
    melon = melon[melon["genre"].isin(genre_list)].copy()

    # compress genre
    melon["genre"] = [merged_genres.get(genre, genre) for genre in melon["genre"]]

    # make features "year" and "season"; any month outside March-November
    # (including malformed values) falls through to "winter"
    melon["year"] = [date[:4] for date in melon["date"]]
    melon["season"] = [season_by_month.get(date[5:7], "winter") for date in melon["date"]]

    # drop unnecessary features
    melon = melon.drop(columns=["date"])

    return melon

def match_genie(pwd, melon):
    """Attach the Genie ``cnt`` column to the Melon songs.

    The Genie CSV is read (cp949, only empty fields count as missing), its
    first two columns are discarded and exact duplicate rows are dropped.
    Songs are matched on a temporary ``"<title>_<artist>"`` key with a left
    join, so every Melon row is kept; songs without a Genie match get
    ``cnt == 0``.

    Args:
        pwd: Path of the Genie CSV file. After the first two columns are
            discarded it must provide ``title``, ``artist`` and ``cnt``.
        melon: DataFrame with string ``title`` and ``artist`` columns. It is
            not modified.

    Returns:
        A new DataFrame holding the columns of ``melon`` plus an ``int64``
        ``cnt`` column.
    """
    # import data
    genie = pd.read_csv(pwd, encoding='cp949', keep_default_na=False, na_values="").iloc[:, 2:]

    # drop duplicates
    genie = genie.drop_duplicates()

    # Build the join key on new frames so the caller's ``melon`` does not
    # gain a helper column as a side effect.
    melon_keyed = melon.assign(
        title_artist=["_".join(pair) for pair in zip(melon["title"], melon["artist"])]
    )
    genie = genie.assign(
        title_artist=["_".join(pair) for pair in zip(genie["title"], genie["artist"])]
    )
    # Select by name rather than by position so extra Genie columns cannot
    # displace ``cnt``.
    genie = genie[["cnt", "title_artist"]]

    # merge
    df = pd.merge(melon_keyed, genie, how="left", on="title_artist")
    # Assign the filled column back instead of a chained ``inplace`` fill,
    # which may operate on a temporary copy.
    df["cnt"] = df["cnt"].fillna(0).astype("int64")

    return df.drop(columns="title_artist")

def pos_tag(df):
    """Replace each entry of ``df.lyric`` with the output of ``Okt().pos``.

    A single ``Okt`` tagger is created and applied to every lyric in turn.
    The frame is updated in place and also returned; the elapsed time,
    in minutes, is printed.
    """
    started = time.time()

    tagger = Okt()
    df.lyric = list(map(tagger.pos, df.lyric))

    finished = time.time()
    print("pos_tag() time:", (finished-started)/60)
    return df

def pos_filter(df):
    """Keep only the nouns, verbs and adjectives of every tagged lyric.

    Each entry of ``df.lyric`` is iterated as a sequence of tokens whose
    element 0 is the word and element 1 is the tag. Words tagged ``'Noun'``,
    ``'Verb'`` or ``'Adjective'`` are kept, in their original order. Tokens
    that cannot be indexed that way are skipped.

    The frame is updated in place and also returned; the elapsed time,
    in minutes, is printed.
    """
    tic = time.time()

    kept_tags = {'Noun', 'Verb', 'Adjective'}

    lyric = []
    for items in df.lyric:
        words = []
        for item in items:
            try:
                tag = item[1]
            except (IndexError, TypeError):
                # Malformed token (too short or not indexable): skip it, but
                # let anything else -- e.g. KeyboardInterrupt -- propagate.
                continue
            if tag in kept_tags:
                words.append(item[0])
        lyric.append(words)
    df["lyric"] = lyric

    toc = time.time()
    print("pos_filter() time:", (toc-tic)/60)
    return df

def filter_stopwords_and_minority(df, stopwords_path='bind_data_melon/stopwords_list_809.txt',
                                  min_words=50):
    """Remove stopwords from every lyric and drop lyrics that end up too short.

    Args:
        df: DataFrame whose ``lyric`` column holds one list of words per row.
        stopwords_path: UTF-8 text file with comma-separated stopwords.
            Whitespace around each entry is ignored.
        min_words: Minimum number of words a lyric must still contain after
            stopword removal for its row to be kept.

    Returns:
        A new DataFrame containing only the kept rows, with their original
        index labels and with ``lyric`` replaced by the filtered word lists.
        ``df`` itself is not modified. The elapsed time, in minutes, is
        printed.
    """
    tic = time.time()

    with open(stopwords_path, 'r', encoding='utf-8') as f:
        # A set gives constant-time membership tests; stripping keeps an
        # entry followed by a newline (e.g. the last one) usable.
        stop_words = {word.strip() for word in f.read().split(",")}
    stop_words.discard("")

    kept_positions = []
    kept_lyrics = []
    for position, items in enumerate(df.lyric):
        words = [item for item in items if item not in stop_words]
        if len(words) >= min_words:
            kept_positions.append(position)
            kept_lyrics.append(words)

    # Work on a copy of the selected rows, and give the new lyrics the
    # labels of those rows: column assignment aligns on index labels, so a
    # freshly numbered Series would be misplaced for any non-default index.
    df = df.iloc[kept_positions, :].copy()
    df["lyric"] = Series(kept_lyrics, index=df.index, dtype=object)

    toc = time.time()
    print("filter_stopwords_and_minority() time:", (toc-tic)/60)
    return df

def word_embedding(df):
    """Train a Word2Vec model on ``df.lyric`` and persist it with a dictionary.

    The model is built with ``size=50, window=10, min_count=5, workers=4``
    and saved to ``bind_data_melon/word2vec``. A ``corpora.Dictionary`` built
    from the same lyrics is saved to ``bind_data_melon/corpus.dict``.

    Returns:
        The trained Word2Vec model.
    """
    sentences = df.lyric

    # NOTE(review): the ``size`` keyword is version-dependent in gensim
    # (newer releases appear to call it ``vector_size``) -- confirm against
    # the installed version.
    embedding = Word2Vec(sentences, size=50, window=10, min_count=5, workers=4)
    embedding.save('bind_data_melon/word2vec')

    vocabulary = corpora.Dictionary(sentences)
    vocabulary.save('bind_data_melon/corpus.dict')

    # To reload the saved artefacts later:
    #   model = Word2Vec.load('bind_data_melon/word2vec')
    #   corpus = corpora.Dictionary.load('bind_data_melon/corpus.dict')
    return embedding

if __name__ == "__main__":
    # Locations of the raw Melon and Genie exports.
    pwd_melon = "./bind_data_melon/melon.csv"
    pwd_genie = "./bind_data_genie/my.csv"

    # The whole pipeline runs here so that ``df`` and ``model`` exist on a
    # fresh run; with every step commented out, the split below failed with
    # a NameError unless ``df`` was left over from an earlier session.

    # preprocess data
    melon = preprocess_melon(pwd_melon)
    # Optional checkpoint:
    # melon.to_csv("./bind_data_melon/preprocessed_melon.csv", index=False)

    # match genie with melon
    df = match_genie(pwd_genie, melon)
    # Optional checkpoint:
    # df.to_csv('bind_data_melon/melon_df.csv', encoding='euc-kr')

    # text preprocessing
    df = pos_tag(df)
    df = pos_filter(df)
    df = filter_stopwords_and_minority(df)
    # Optional checkpoint:
    # df.to_csv('bind_data_melon/melon2_df.csv', encoding='euc-kr')

    model = word_embedding(df)

    # Split on the Genie count: songs with a non-zero ``cnt`` versus songs
    # with ``cnt == 0``.
    playlistSet = df.loc[df.cnt != 0, :]
    recommendSet = df.loc[df.cnt == 0, :]



pos tagging time: 14.120912957191468
pos filtering time: 0.07388568719228109
time to remove stopwords and lyrics containing under 50 words: 1.735190705458323


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


In [117]:
# Preview the first rows of the processed frame.
df.head()

Unnamed: 0,title,artist,album,genre,flac,like,reply,lyric,year,season,cnt
0,더 (PROD. 브라더수),유승우,ROMANCE,Ballad,Flac 16bit,975,6,"[더, 가까워지지, 말, 정도, 다치고, 않아, 그렇잖아, 똑같은, 맘, 이쯤, 멈...",2017,fall,0
1,아주 칭찬해,타린,아주 칭찬해,Folk,Flac 16/24bit,134,1,"[바람, 불어와서, 상쾌한, 기분, 가벼운, 걸음, 걸음, 청아한, 하늘, 바라본다...",2017,fall,0
3,OK (Prod. by GRAY),BewhY (비와이),슬기로운 감빵생활 OST Part.1,Drama / Korean Movie,Flac 16/24bit,952,13,"[나인, 같니, 자유, 결여, 사는, 삶, 어제, 지워, 갔네, 행복한, 척, 해야...",2017,fall,0
4,WU & THE 1LLY (Feat. Inspectah Deck and Masta ...,"Dok2, The Quiett, 김효은",WU & THE 1LLY,Rap / Hip-hop,Flac 16bit,383,11,"[야망, 가져와, 달려가, 눈, 감어, 아련한, 과거, 땜, 쌓여가, 돈, 낙엽, ...",2017,fall,0
5,취.준.생. (나의 잘못이 아니라 말해주세요),박기영,취.준.생. (나의 잘못이 아니라 말해주세요),Blues / Jazz / New Age,Flac 16bit,118,4,"[요즘, 고민, 많아, 밤, 늦게까지, 잠, 못, 드는, 많아요, 마음, 놓고, 감...",2017,fall,0


In [134]:
# Query the embedding with the first lyric the model accepts as a whole and
# keep the result; ``similar_words`` stays None if no lyric qualifies.
similar_words = None
for item in df.lyric:
    try:
        similar_words = model.wv.most_similar(item)
    except (KeyError, ValueError):
        # presumably raised for words missing from the model's vocabulary or
        # for an empty word list -- TODO confirm against the installed gensim.
        # Only these are skipped, so Ctrl-C and real bugs still surface.
        continue
    break