### basic packages

In [8]:
import os
import sys
sys.path.append('..')
import nltk
import gensim
import sklearn
import seaborn as sns
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import re
import pickle
import os
import json
from sklearn.model_selection import train_test_split

### word_processing packages

In [2]:
from konlpy.tag import Komoran
from sklearn.metrics import pairwise_distances
from sklearn.metrics.pairwise import rbf_kernel
from nltk.tokenize import word_tokenize
from nltk.corpus   import stopwords
from gensim.models import Word2Vec
from sklearn.feature_extraction.text import CountVectorizer

### Read Data

In [3]:
df1 = pd.read_csv('./data/corona_6_9_utf.csv')

In [4]:
df2 = pd.read_csv('./data/corona_6_12_utf.csv')

In [5]:
df = pd.concat([df1,df2])

In [6]:
# Remove duplicate comments: keep only the first row for each distinct `contents` value.
# Note: inplace=True mutates `df` directly and the row index is not reset afterwards.
df.drop_duplicates(['contents'],keep='first',inplace = True)

In [7]:
df.shape

(505743, 7)

In [44]:
df

Unnamed: 0.1,Unnamed: 0,X1,...1,contents,modTime,sympathyCount,antipathyCount
0,1,1.0,2,엄지척 2개를 드립니다.,2020-03-02 03:33:38,640,1
1,2,3.0,4,대기업이 대기업답게 국가적 위기에 통큰 행보 감사합니다.,2020-03-02 03:34:36,253,0
2,3,8.0,14,"문재인씨, 뭐 느낀것 없냐? 마스크 하나 제대로 못하는 주제에....삶은 소대가리라...",2020-03-02 04:00:17,17,6
3,4,12.0,18,정부보다 낫네요! 감사합니다.,2020-03-02 04:02:34,1,0
4,5,18.0,24,다른 대기업들도 동참 해주세요~ 대구 경북 국민들 구해주세요~,2020-03-02 04:07:10,1,0
...,...,...,...,...,...,...,...
264684,264685,355893.0,41,지나가는 개가 웃겠다 20대 국회에 출석율 꼴지인 당신이 할말은 아니지 그렇게 오만...,2020-05-29 02:24:56,1,1
264685,264686,355894.0,42,니가 왜 5 18 유공자냐,2020-05-29 02:26:04,1,0
264686,264687,355895.0,43,운동권 출신의 거지근성이 대한민국을 말아먹는구나,2020-05-29 02:26:28,2,1
264687,264688,355896.0,44,당신은 516때 뭘 하셧는가요? 유공자? 무공자? 진실하지 못한자가 뭘 남탓하는지...,2020-05-29 02:47:55,0,1


### 명사 추출기(Komoran)

In [46]:
# Build the stop-word list: the `stopwords` column of the spreadsheet plus a few
# hand-picked extra tokens appended below.
# NOTE(review): this rebinds the name `stopwords`, shadowing the
# `nltk.corpus.stopwords` object imported in the word-processing import cell.
c_stopwords = pd.read_excel('./stopwords/stopwords.xlsx')
my_stopwords = c_stopwords['stopwords'].tolist()
stopwords = my_stopwords + ['괜히','또또','사방','려면','다해','왜또','부터','지금','가가','가가호호','힌다','지라','그냥','이나','면서','진짜','정말','이제','때문','신들','안중','통령']

In [47]:
# Create the module-level Komoran tagger with a user dictionary file.
# `get_nouns` and `tokenizer` below both read this global `komoran` object.
non_path = './stopwords/user_dict.txt'
komoran = Komoran(userdic = non_path)

In [51]:
komoran.nouns('매번 고개숙이고 겸손한 대통령님..항상 응원합니다')

['고개', '겸손', '대통령님', '응원']

In [None]:
# Noun-extraction helper
def get_nouns(corpus,stopwords=stopwords):
    """Extract filtered nouns from every document in ``corpus``.

    Each document is cast to ``str``, stripped of punctuation, reduced to
    Hangul and ASCII letters, whitespace-collapsed, and passed to the
    module-level ``komoran`` tagger. Nouns of length 1 and nouns contained in
    ``stopwords`` are dropped.

    Parameters
    ----------
    corpus : iterable
        Documents to process (any objects; each is converted with ``str``).
    stopwords : iterable of str
        Tokens to exclude from the result.

    Returns
    -------
    list of list of str
        One token list per input document, in input order. A document whose
        processing fails yields an empty list, so the result always stays a
        homogeneous list of token lists.
    """
    # Set membership is O(1); the list lookup was repeated for every noun.
    stopword_set = set(stopwords)
    nouns_list = list()
    failed = 0
    for text in corpus:
        try:
            text = str(text)
            text = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!\\‘|\(\)\[\]\<\>`\'…》]', '', text)
            text = re.sub("[^가-힣A-Za-z]"," ", text)
            text = re.sub(r"\s{2,}", " ", text)
            nouns = komoran.nouns(text)
            nouns = [word for word in nouns if len(word) > 1 and word not in stopword_set]
        except Exception:
            # Best effort: keep the row so positions stay aligned with the input.
            # An empty token list is used instead of the string 'error', which
            # consumers such as Word2Vec or `.str.join` would iterate
            # character by character.
            failed += 1
            nouns = []
        nouns_list.append(nouns)
    if failed:
        import warnings
        warnings.warn('get_nouns: {} document(s) could not be tokenised and were left empty'.format(failed))
    return nouns_list

In [None]:
news_corpus_v2 = get_nouns(df['contents'],stopwords=stopwords)

In [None]:
total_df = pd.DataFrame()
total_df['Article_token_v2'] = news_corpus_v2

### word2vec 학습

In [None]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenised comments and save it to disk.
# NOTE(review): `iter=` and `size=` are the gensim 3.x keyword names — confirm the
# installed gensim version before re-running this cell.
# NOTE(review): no `seed` is passed and workers=4, so retraining is presumably not
# bit-for-bit reproducible — confirm whether that matters downstream.
model = Word2Vec(total_df['Article_token_v2']  , min_count=3, window=5, iter=100, size=100, workers=4, sg=1)
# replace=True overwrites the stored vectors with their normalised form before saving.
model.init_sims(replace=True)
model.save("./Word2vec/word2vec_final.model")

In [22]:
# Reload the saved model so the cells below can run without retraining.
model = Word2Vec.load('./Word2vec/word2vec_final.model')
# Embedding matrix of the loaded model.
# NOTE(review): `word_model` is not referenced by any later cell visible here.
word_model = model.wv.syn0

  


### keyword 지정

In [23]:
# Seed keyword lists for the two labels. `make_tdm` labels a comment 'bad' when its
# bad_keyword score exceeds its good_keyword score, otherwise 'good'.
# Every keyword must exist in the Word2Vec vocabulary: `matrix_distance` resolves each
# one with `index2word.index(...)`, which raises ValueError for an unknown word.
good_keyword = ['토착왜구당','황고환','황교활','일베','쪽빠리','네일베', '미통닭', '틀딱', '박그네','미똥당','일베충','친일','토왜당','친일파','수꼴','보수꼴통','핑크당']
bad_keyword = ['공산당','빨갱이','달창','문재앙','문천지','대깨문','좌파','지령', '북괴','짱깨','조선족','문가', '문씨', '문쩝쩝',  '빨갱이정권', '공산당정권', '이낙엽', '문페렴', '문폐렴','문빨갱이','문죄앙','문어벙','어벙이', '간경화', '페미정부', '구라미터', '조작미터',  '박원숭이', '반일선동', '문좌인', '문족', 
'개재앙', '문통이', '문재앙폐렴', '민좆당', '문여적']
# Column order used for the keyword matrix: good keywords first, then bad keywords.
total_keyword = good_keyword+bad_keyword

In [24]:
len(bad_keyword)

36

### 단어간 거리 구하기

In [25]:
def matrix_distance(model,lists):
    """Build a (vocabulary x keyword) RBF-kernel score table.

    Parameters
    ----------
    model : gensim Word2Vec model
        Must expose ``model.wv.index2word`` and ``model.wv.syn0``.
    lists : list of str
        Keywords to use as columns; each must be in the model vocabulary.

    Returns
    -------
    pandas.DataFrame
        Rows are vocabulary words (sorted by label), columns are ``lists`` in
        the given order.

    Raises
    ------
    ValueError
        If any keyword is missing from the vocabulary.
    """
    vocab = model.wv.index2word
    # One pass over the vocabulary instead of an O(V) list.index per keyword.
    # setdefault keeps the first position, matching list.index semantics.
    position = {}
    for idx, word in enumerate(vocab):
        position.setdefault(word, idx)

    missing = [word for word in lists if word not in position]
    if missing:
        raise ValueError('keywords not in the Word2Vec vocabulary: {}'.format(missing))
    index_lists = [position[word] for word in lists]

    # Pairwise euclidean distances between all word vectors (V x V).
    word2vec_mat = model.wv.syn0
    distance = pairwise_distances(word2vec_mat , metric='euclidean')

    # NOTE(review): as in the original, the RBF kernel is applied to the ROWS of the
    # distance matrix (each word is represented by its distance profile), not to the
    # raw embeddings. Confirm this is intended rather than exp(-gamma * distance**2).
    # Only the keyword columns are needed, so the kernel is computed against the
    # keyword rows alone. This equals rbf_kernel(distance)[:, index_lists] but avoids
    # materialising a second V x V matrix.
    kernal_matrix = rbf_kernel(distance, distance[index_lists], gamma=0.01)
    return pd.DataFrame(kernal_matrix,columns = lists,index = vocab).sort_index()

In [None]:
word2vec_matrix = matrix_distance(model,total_keyword)

In [42]:
with open('./Word2vec/word2vec_matrix.pickle','wb') as f:
    pickle.dump(word2vec_matrix,f)

In [43]:
word2vec_matrix.shape

(22781, 53)

### word2vec, tdm 내적 시키기 위해 데이터들을 5만개씩 나눔

In [None]:
# Cut `df` into ten positional slices: nine of 50,000 rows each, and a final slice
# holding every row from position 450,000 onward.
(df_tdm1, df_tdm2, df_tdm3, df_tdm4, df_tdm5,
 df_tdm6, df_tdm7, df_tdm8, df_tdm9, df_tdm10) = [
    df.iloc[lower:upper, :]
    for lower, upper in zip(
        range(0, 500000, 50000),
        list(range(50000, 500000, 50000)) + [None],  # None => open-ended last slice
    )
]


In [None]:
def tokenizer(text, pos=["NNP","NNG","NNB","NP","NR"], stopword=stopwords):
    """Tokenise ``text`` with the module-level ``komoran`` tagger.

    Parameters
    ----------
    text : str
        Text to tag.
    pos : list of str
        POS tags to keep.
    stopword : iterable of str
        Tokens to drop. Defaults to the notebook-level stop-word list.

    Returns
    -------
    list of str
        Words longer than one character whose tag is in ``pos`` and which are
        not in ``stopword``.
    """
    # Fix: filter with the `stopword` argument. The original ignored it and always
    # read the global `stopwords`, so a caller-supplied list had no effect.
    return [
        word
        for word, tag in komoran.pos(text)
        if len(word) > 1 and tag in pos and word not in stopword
    ]

### word2vec, tdm 내적시켜 labeling

In [21]:
def make_tdm(df,word2vec_matrix,good_keyword,bad_keyword,stopwords):
    """Score and label every comment in ``df`` as 'good' or 'bad'.

    A term-document matrix (TDM) is built from the nouns of ``df['contents']``
    and multiplied with the keyword columns of ``word2vec_matrix``, restricted
    to the words both share. For each comment, the five largest good-keyword
    scores and the five largest bad-keyword scores are summed and divided by 5.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``contents`` column.
    word2vec_matrix : pandas.DataFrame
        Indexed by word, with one column per keyword in
        ``good_keyword + bad_keyword``.
    good_keyword, bad_keyword : list of str
        Keyword column names for the two classes.
    stopwords : iterable of str
        Passed to ``get_nouns``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``good_score``, ``bad_score`` and ``Judgement``
        ('bad' when bad_score > good_score, else 'good'). The input frame is
        not modified, so passing an ``iloc`` slice no longer triggers
        SettingWithCopyWarning.
    """
    # Keywords of interest: good first, then bad.
    total_keyword = good_keyword+bad_keyword

    # The original also built a local Komoran from a hard-coded '/gdrive/...' path
    # here. It was never used (`get_nouns` and `tokenizer` read the module-level
    # `komoran`), so it has been removed.

    # Noun extraction, then one space-joined string per document for the vectoriser.
    inf_nouns = get_nouns(df['contents'],stopwords=stopwords)
    inf_tdm = pd.DataFrame()
    inf_tdm['nouns'] = inf_nouns
    inf_tdm['nouns_to_str']=inf_tdm['nouns'].str.join(' ')

    # Term-document matrix, kept sparse. Terms seen in fewer than 3 documents are dropped.
    vectorizer = CountVectorizer(tokenizer=tokenizer,min_df=3)
    tdm = vectorizer.fit_transform(inf_tdm['nouns_to_str'])
    # get_feature_names() was removed in newer scikit-learn; prefer the replacement
    # when it is available.
    if hasattr(vectorizer, 'get_feature_names_out'):
        tdm_features = list(vectorizer.get_feature_names_out())
    else:
        tdm_features = vectorizer.get_feature_names()

    # Words present in both the TDM and the word2vec matrix. Keyword columns are
    # selected by name so the scores cannot be mislabelled by a different column order.
    aligned = word2vec_matrix.reindex(tdm_features)[total_keyword]
    shared_cols = np.flatnonzero(aligned.notna().all(axis=1).values)
    shared_weights = aligned.iloc[shared_cols]

    # Dot product of TDM and word2vec matrix. Slicing the sparse matrix avoids
    # building a dense (documents x vocabulary) array.
    scores = tdm[:, shared_cols].dot(shared_weights.values)
    score_df = pd.DataFrame(np.asarray(scores),columns=total_keyword)

    def _top5_mean(frame):
        # Sum of the five largest values per row, divided by 5.
        return np.sort(frame.values, axis=1)[:, -5:].sum(axis=1) / 5

    good_score = _top5_mean(score_df[good_keyword])
    bad_score = _top5_mean(score_df[bad_keyword])

    # Classify each comment from the two scores. Assignment is positional.
    result = df.copy()
    result['good_score'] = good_score
    result['bad_score'] = bad_score
    result['Judgement'] = np.where(result['bad_score'] > result['good_score'],'bad','good')

    return result

In [None]:
df_list = [df_tdm1,df_tdm2,df_tdm3,df_tdm4,df_tdm5,df_tdm6,df_tdm7,df_tdm8,df_tdm9,df_tdm10]

In [None]:
# Label each chunk and write it to its own numbered Excel file (tdm1.xlsx ... tdmN.xlsx).
# enumerate() replaces the hand-maintained counter inside a while/for pair. The loop
# variable is no longer called `df`, so the full DataFrame is not overwritten by the
# last chunk.
# NOTE(review): the output directory is an absolute '/gdrive/...' path, while the merge
# section reads from './data/' — confirm which location is intended.
for i, chunk in enumerate(df_list, start=1):
    print(i)
    tdm = make_tdm(chunk,word2vec_matrix,good_keyword=good_keyword,bad_keyword=bad_keyword,stopwords=stopwords)
    tdm.to_excel('/gdrive/My Drive/Word2vec/result/tdm{}.xlsx'.format(i))
    print('save excel')

### tdm data merge 

In [None]:
filepath = './data/'

In [None]:
filename = os.listdir(filepath)

In [None]:
# Read every file listed in `filename` and stack them into one DataFrame.
# Frames are collected first and concatenated once; calling pd.concat inside the loop
# re-copies all accumulated rows on every iteration.
# NOTE(review): every entry of `filepath` is passed to pd.read_excel — confirm the
# directory contains only the exported tdm*.xlsx files.
frames = []
for name in filename:
    full_name = os.path.join(filepath,name)
    print(full_name)
    frames.append(pd.read_excel(full_name))
# The original prepended each new frame, so the list is reversed to keep that row order.
inf_df = pd.concat(frames[::-1]) if frames else pd.DataFrame()

In [None]:
# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, which left a gap between checking and creating.
os.makedirs('./result', exist_ok=True)
# utf-8-sig writes a BOM at the start of the file.
inf_df.to_csv('./result/total_tdm.csv',encoding='utf-8-sig')

In [None]:
df = pd.read_csv('./result/total_tdm.csv')

In [None]:
# Encode the label as 1 (bad) / 0 (good).
# Accepting an already-encoded 1 makes the cell safe to re-run. The original compared
# only against 'bad', so a second execution turned every label into 0.
df['Judgement'] = np.where(df['Judgement'].isin(['bad', 1]),1,0)

In [None]:
analysis_data = df['contents']

In [None]:
target = df['Judgement']

In [None]:
# Stratified 90/10 split. A fixed random_state makes the split reproducible, so
# train.csv and test.csv stay the same across kernel restarts.
tr_x,ts_x,tr_y,ts_y = train_test_split(analysis_data,target, test_size=0.1, stratify=target,shuffle=True, random_state=42)

In [None]:
# Re-attach the labels to the text column for each split (column-wise concat).
train = pd.concat([tr_x,tr_y],axis=1)

test = pd.concat([ts_x,ts_y],axis=1)

# Write both splits as CSV with utf-8-sig encoding; the row index is written as well.
test.to_csv('./result/test.csv',encoding='utf-8-sig')

train.to_csv('./result/train.csv',encoding='utf-8-sig')