In [None]:
import pandas as pd
from ast import literal_eval
from konlpy.tag import Kkma
from konlpy.tag import Okt
from konlpy.tag import Mecab

In [None]:
from torch.utils.data import Dataset, DataLoader
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim

import gluonnlp as nlp
import json

import numpy as np
import pandas as pd

import torch.nn.functional as F
import torch.optim as optim
from tqdm.notebook import tqdm

from transformers import AdamW
from transformers.optimization import get_cosine_schedule_with_warmup
from kobert_tokenizer import KoBERTTokenizer
from transformers import BertModel

In [None]:
# Load the two input tables; in both, the first CSV column becomes the index.
# `topic` is later indexed by MBTI code as a column (topic[mbti]); `mbti` is
# looked up by MBTI code as a row label and read through its 'contents' column.
topic = pd.read_csv('./final_mbti_topic.csv',index_col=0)
mbti = pd.read_csv('./mbti_final_data.csv',index_col=0)

In [None]:
# The sixteen MBTI type codes; used as keys for per-type results and as
# row/column labels into the `mbti` and `topic` tables.
mbti_list = ['ISTJ','ISFJ','INFJ','INTJ',
                        'ISTP','ISFP','INFP','INTP',
                        'ESTP','ESFP','ENFP','ENTP',
                        'ESTJ','ESFJ','ENFJ','ENTJ']

In [None]:
def split_text(text):
    """Split ``text`` on '.' and return every piece with surrounding whitespace removed.

    Empty pieces (for example the one after a trailing '.') are kept as ''.
    """
    return [piece.strip() for piece in text.split('.')]

In [None]:
# Kkma morphological analyzer instance.
# NOTE(review): `kkma` is not referenced by any of the visible cells — confirm it is still needed.
kkma = Kkma()

In [None]:
# Model / data-loading settings.
# `max_len` is passed to BERTDataset as the maximum sequence length and
# `batch_size` to the DataLoader inside predict().
# NOTE(review): warmup_ratio, num_epochs, max_grad_norm, log_interval and
# learning_rate are not read by any visible cell (presumably left over from
# the training notebook) — confirm before removing.
max_len = 64
batch_size = 64
warmup_ratio = 0.1
num_epochs = 5
max_grad_norm = 1
log_interval = 200
learning_rate =  5e-5

In [None]:
# KoBERT tokenizer, backbone and vocabulary shared by the dataset and the classifier.
tokenizer = KoBERTTokenizer.from_pretrained('skt/kobert-base-v1')
bertmodel = BertModel.from_pretrained('skt/kobert-base-v1', return_dict=False)
vocab = nlp.vocab.BERTVocab.from_sentencepiece(tokenizer.vocab_file, padding_token='[PAD]')
# Use the first GPU when one is available, otherwise fall back to the CPU so
# that the `.to(device)` calls in predict() do not fail on CPU-only machines.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Tokenizing callable handed to BERTDataset.
tok = tokenizer.tokenize

In [None]:
class BERTClassifier(nn.Module):
    """Linear classification head on top of a BERT encoder's pooled output.

    Args:
        bert: encoder module; called with ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` and expected to return a 2-tuple whose second
            element is the pooled representation.
        hidden_size: width of the pooled representation fed to the linear layer.
        num_classes: number of output logits.
        dr_rate: dropout probability applied to the pooled output; a falsy
            value (``None`` or ``0``) disables dropout.
        params: accepted for interface compatibility; not used.
    """

    def __init__(self,
                 bert,
                 hidden_size = 768,
                 num_classes=2,
                 dr_rate=None,
                 params=None):
        super(BERTClassifier, self).__init__()
        self.bert = bert
        self.dr_rate = dr_rate

        self.classifier = nn.Linear(hidden_size , num_classes)
        # The dropout layer only exists when a dropout rate was requested.
        if dr_rate:
            self.dropout = nn.Dropout(p=dr_rate)

    def gen_attention_mask(self, token_ids, valid_length):
        """Return a float mask shaped like ``token_ids`` with ones on the first
        ``valid_length[i]`` positions of row ``i`` and zeros elsewhere."""
        attention_mask = torch.zeros_like(token_ids)
        for i, v in enumerate(valid_length):
            attention_mask[i][:v] = 1
        return attention_mask.float()

    def forward(self, token_ids, valid_length, segment_ids):
        """Return the classifier logits for a batch.

        Args:
            token_ids: integer tensor of token ids, one row per sample.
            valid_length: per-row count of non-padding tokens.
            segment_ids: token-type ids aligned with ``token_ids``.
        """
        attention_mask = self.gen_attention_mask(token_ids, valid_length)

        _, pooler = self.bert(input_ids = token_ids, token_type_ids = segment_ids.long(), attention_mask = attention_mask.float().to(token_ids.device))
        # Without dropout the pooled output goes straight to the classifier;
        # previously `out` was left unassigned in that case and forward() raised.
        if self.dr_rate:
            out = self.dropout(pooler)
        else:
            out = pooler
        return self.classifier(out)

In [None]:
class BERTDataset(Dataset):
    """Dataset that pre-transforms every sentence of ``dataset`` at construction.

    Each element of ``dataset`` is indexed with ``sent_idx`` for the sentence
    and ``label_idx`` for the label. Sentences are run through
    ``nlp.data.BERTSentenceTransform``; labels are converted with ``np.int32``.
    """

    def __init__(self, dataset, sent_idx, label_idx, bert_tokenizer, vocab, max_len,
                 pad, pair):
        to_features = nlp.data.BERTSentenceTransform(
            bert_tokenizer, max_seq_length=max_len, vocab=vocab, pad=pad, pair=pair)

        # First pass: transformed sentences (the transform takes a one-element list).
        encoded = []
        for row in dataset:
            encoded.append(to_features([row[sent_idx]]))
        self.sentences = encoded

        # Second pass: integer labels.
        targets = []
        for row in dataset:
            targets.append(np.int32(row[label_idx]))
        self.labels = targets

    def __getitem__(self, i):
        """Return the transformed sentence tuple with the label appended."""
        features = self.sentences[i]
        label = self.labels[i]
        return features + (label, )

    def __len__(self):
        """Number of samples."""
        return len(self.labels)

In [None]:
# Load the saved classifier object and place its tensors on `device`
# (map_location remaps storages that were saved from another device).
# SECURITY: torch.load unpickles the file and can execute arbitrary code —
# only load checkpoints from a trusted source.
test_model = torch.load('./model.pt', map_location=device)

In [None]:
def predict(text):
    """Classify one piece of text with ``test_model`` and return the arg-max class index.

    Args:
        text: the sentence (or word) to classify.

    Returns:
        ``np.argmax`` over the logits of the last row of the last batch
        (there is exactly one row, since a single sample is fed in).
    """
    # The label '0' is only a placeholder so that BERTDataset can be reused.
    data = [text, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    # One sample per call: load it in-process instead of spawning worker processes.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    test_model.eval()

    # Inference only — do not build the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = test_model(token_ids, valid_length, segment_ids)
            logits = out[-1].detach().cpu().numpy()

    return np.argmax(logits)

In [None]:
def keyword_match(keyword,text):
    """Return the positions of the items of ``text`` that contain ``keyword``."""
    return [position for position, item in enumerate(text) if keyword in item]

In [None]:
def mbti_out_model(mbti,keyword,num_list,clean):
    """Tally predict() results for the selected entries of ``clean``.

    Returns ``{mbti: {keyword: [count_0, count_1]}}`` where the counter at
    index ``predict(clean[number])`` is incremented for every ``number`` in
    ``num_list``.
    """
    out_dict = {f'{mbti}': {keyword: [0, 0]}}
    for number in num_list:
        predicted = predict(clean[number])
        out_dict[mbti][keyword][predicted] += 1
    return out_dict

In [None]:
def predict(text, threshold=0.7):
    """Classify one piece of text as class 0 or class 1 using ``test_model``.

    Args:
        text: the sentence (or word) to classify.
        threshold: class 0 is returned only when its softmax probability is
            strictly greater than this value (default 0.7).

    Returns:
        0 when the class-0 probability exceeds ``threshold``, otherwise 1.
    """
    # The label '0' is only a placeholder so that BERTDataset can be reused.
    data = [text, '0']
    dataset_another = [data]

    another_test = BERTDataset(dataset_another, 0, 1, tok, vocab, max_len, True, False)
    # One sample per call: load it in-process instead of spawning worker processes.
    test_dataloader = torch.utils.data.DataLoader(another_test, batch_size=batch_size, num_workers=0)

    test_model.eval()

    # Inference only — do not build the autograd graph.
    with torch.no_grad():
        for token_ids, valid_length, segment_ids, label in test_dataloader:
            token_ids = token_ids.long().to(device)
            segment_ids = segment_ids.long().to(device)

            out = F.softmax(test_model(token_ids, valid_length, segment_ids),dim=1)
            probabilities = out.detach().cpu().numpy()
            if probabilities[0][0]>threshold:
                return 0
            else:
                return 1


In [None]:
def read_stopwords(path='./TopicStopwords.txt', encoding=None):
    """Read a stop-word file with one entry per line.

    Args:
        path: location of the text file.
        encoding: text encoding passed to ``open``; ``None`` keeps the
            platform default.

    Returns:
        A list with every line of the file, stripped of surrounding
        whitespace. Blank lines are kept as ''.
    """
    # `with` guarantees the handle is closed even if reading raises.
    with open(f"{path}", "r", encoding=encoding) as file:
        return [line.strip() for line in file]
stopwords = read_stopwords()

In [None]:
mbti

In [None]:
# Mecab tagger used by ext_text().
mecab = Mecab()
# Parse one 'contents' cell from its string form into a Python object.
# NOTE(review): `t` is not read by any visible cell — looks like scratch inspection.
t = literal_eval(mbti['contents'][9])

In [None]:
def ext_text(sentence):
    """Return the Mecab tokens of ``sentence`` that are tagged NNG, NNP, VA or XR,
    are longer than one character, and are not in the global ``stopwords``."""
    wanted_tags = ['NNG', 'NNP', 'VA', 'XR']
    return [
        surface
        for surface, tag in mecab.pos(sentence)
        if tag in wanted_tags and len(surface) >1 and surface not in stopwords
    ]

In [None]:
def top_n(count_dict, reverse, n=3):
    """Return the first ``n`` items of ``count_dict`` after sorting by value.

    ``reverse=True`` sorts from largest to smallest value. The result is a
    dict that keeps the sorted order.
    """
    ranked = sorted(count_dict.items(), key=lambda pair: pair[1], reverse=reverse)
    return {key: value for key, value in ranked[:n]}

In [None]:
from collections import Counter
mecab = Mecab()
def ext_negative(mbti_, max_keywords=13000, n=50):
    """Return the most frequent keywords of one MBTI type that predict() labels 0.

    Args:
        mbti_: row label into the global ``mbti`` table.
        max_keywords: only the first ``max_keywords`` extracted keyword
            occurrences are considered for classification (default 13000).
        n: number of top keywords to return (default 50).

    Returns:
        Dict mapping keyword -> occurrence count (counted over ALL extracted
        keywords), limited to the ``n`` highest counts, in descending order.
    """
    text = literal_eval(mbti.loc[mbti_]['contents'])

    keyword=[]
    for k in text:
        keyword.extend(ext_text(k))
    count_keyword = Counter(keyword)

    # Classify each distinct keyword once instead of once per occurrence.
    # dict.fromkeys keeps first-appearance order, so the resulting dict (and
    # therefore tie-breaking in top_n) is ordered exactly as before.
    # Assumes predict() gives the same answer for the same input.
    candidates = list(dict.fromkeys(keyword[:max_keywords]))
    count_negative = {}
    for candidate in tqdm(candidates):
        if predict(candidate)==0:
            count_negative[candidate] = count_keyword[candidate]

    top_negative = top_n(count_negative,True,n=n)
    return top_negative
    

In [None]:
# Compute the negative-keyword table for every MBTI type and persist it.
# Each entry starts as an empty dict and is replaced by ext_negative()'s result.
negative_mbti_topic = {string : {} for string in mbti_list}
for mt in tqdm(mbti_list):
    mbti_dict = ext_negative(mt)
    negative_mbti_topic[mt] = mbti_dict
# ensure_ascii=False writes the keywords as-is instead of \u escapes.
with open('./negative_topic.json','w') as f:
    json.dump(negative_mbti_topic,
              f,ensure_ascii=False,indent=4)

In [None]:
negative_mbti_topic

In [None]:
def split_text(text):
    """Split ``text`` on '.' and return the non-empty, whitespace-stripped pieces.

    Pieces are stripped BEFORE the emptiness check, so whitespace-only pieces
    (for example the one after a trailing '. ') are dropped rather than
    returned as ''.
    """
    stripped = (piece.strip() for piece in text.split('.'))
    return [piece for piece in stripped if piece]

def make_sentence(mbti_):
    """Return every sentence of one MBTI type as a flat list.

    The 'contents' cell of row ``mbti_`` in the global ``mbti`` table is
    parsed with ``literal_eval`` and each of its items is split with
    ``split_text``.
    """
    documents = literal_eval(mbti.loc[mbti_]['contents'])
    return [piece for document in tqdm(documents) for piece in split_text(document)]


In [None]:
import math
def sentence_distance_weight(top_n_k,mbti_topic,
                             sentence,n_number,w_dict,c_dict):
    """Accumulate per-topic weights from the position of negative keywords.

    For every index in ``n_number`` the sentence is tokenized with
    ``ext_text``. For each negative keyword of ``top_n_k`` and each topic of
    ``mbti_topic`` that both occur in the tokens (token count between 5 and 12
    inclusive), ``w_dict[topic]`` grows by ``w/100`` and ``c_dict[topic]`` by 1:

    * keyword directly next to the topic: w = -1.5, sign-flipped once for each
      of '않' / '필요' whose first occurrence comes after the topic;
    * otherwise: w = 0.1.

    Both dicts are mutated in place; ``w_dict`` is returned.
    """
    for sentence_idx in tqdm(n_number):
        tokens = ext_text(sentence[sentence_idx])
        for negative in top_n_k:
            if negative not in tokens:
                continue
            for subject in mbti_topic:
                if subject not in tokens:
                    continue
                if 5>len(tokens) or len(tokens)>12:
                    continue
                negative_at = tokens.index(negative)
                subject_at = tokens.index(subject)
                # Index differences are integers, so "strictly between -2 and 2,
                # excluding 0" means the two tokens are adjacent.
                if abs(negative_at-subject_at) == 1:
                    if subject in '창의':
                        print(len(tokens))
                        print(f'negative={negative},topic={subject}')
                        print(tokens)
                        print(sentence[sentence_idx])
                        print(negative_at,subject_at)
                    weight = -1.5
                    if '않' in tokens and tokens.index("않")>subject_at:
                        weight *= -1
                    if '필요' in tokens and tokens.index("필요")>subject_at:
                        weight *= -1
                else:
                    weight = 0.1
                w_dict[subject]+=weight/100
                c_dict[subject]+=1
    return w_dict

In [None]:
def topic_weight(mbti,negative_mbti_topic):
    """Compute topic weights for one MBTI type from its negative keywords.

    ``negative_mbti_topic[mbti]`` supplies the negative keywords; the topics
    are the non-null labels of column ``mbti`` in the global ``topic`` table.
    Returns the weight dict produced by ``sentence_distance_weight``.
    """
    # De-duplicate sentences (set iteration order is not fixed between runs).
    sentences = list(set(make_sentence(mbti)))
    negatives = negative_mbti_topic[mbti]

    # Keyword -> indices of the sentences containing it (only non-empty hits).
    hits_by_keyword = {}
    for keyword in negatives:
        hits = keyword_match(keyword,sentences)
        if hits:
            hits_by_keyword[keyword] = hits

    # Union of all hit indices, ascending.
    n_number = sorted({idx for hits in hits_by_keyword.values() for idx in hits})

    mbti_topic = list(topic[mbti].dropna().keys())
    w_dict = dict.fromkeys(mbti_topic, 0)
    c_dict = dict.fromkeys(mbti_topic, 0)
    return sentence_distance_weight(negatives,mbti_topic,
                                    sentences,
                                    n_number,w_dict,c_dict)

In [None]:
import math
def sentence_distance_weight(n_t,mbti_topic,
                             sentence,n_number,w_dict,c_dict,
                             debug_topic='감정'):
    """Accumulate per-topic weights from the position of negative keywords.

    Args:
        n_t: a single negative keyword (str) or a collection of them. A
            collection used to be compared against the tokens as one object,
            which never matched and left every weight at 0.
        mbti_topic: iterable of topic words; each must be a key of ``w_dict``
            and ``c_dict``.
        sentence: indexable collection of sentences.
        n_number: indices into ``sentence`` to examine.
        w_dict: topic -> accumulated weight, mutated in place.
        c_dict: topic -> number of contributions, mutated in place.
        debug_topic: matches for exactly this topic are printed when the
            keyword is adjacent to it; pass ``None`` to silence.

    Returns:
        ``w_dict``.

    Only sentences whose ``ext_text`` token list has 5 to 12 entries
    contribute. For each (keyword, topic) pair present in the tokens,
    ``w/100`` is added to ``w_dict[topic]``: w = -1.5 when the keyword is
    directly next to the topic (sign-flipped once for each of '않' / '필요'
    whose first occurrence comes after the topic), otherwise w = 1.
    """
    # A bare string is one keyword; anything else is a collection of keywords.
    negatives = [n_t] if isinstance(n_t, str) else list(n_t)

    for n in tqdm(n_number):
        s=ext_text(sentence[n])
        # The length window does not depend on keyword or topic: check it once.
        if 5>len(s) or len(s)>12:
            continue
        for negative in negatives:
            if negative not in s:
                continue
            negative_at = s.index(negative)
            for m_t in mbti_topic:
                if m_t not in s:
                    continue
                topic_at = s.index(m_t)
                # Integer index difference of +/-1 == adjacent tokens.
                if abs(negative_at-topic_at) == 1:
                    if m_t == debug_topic:
                        print(len(s))
                        print(f'negative={negative},topic={m_t}')
                        print(s)
                        print(sentence[n])
                        print(negative_at,topic_at)
                    w_v = -1.5
                    # Each of these tokens appearing after the topic flips the sign.
                    for flipper in ('않', '필요'):
                        if flipper in s and s.index(flipper)>topic_at:
                            w_v *= -1
                else:
                    w_v = 1
                w_dict[m_t]+=w_v/100
                c_dict[m_t]+=1
    return w_dict

In [None]:
def topic_weight(mbti,negative_mbti_topic):
    """Compute topic weights for one MBTI type from a collection of negative keywords.

    ``negative_mbti_topic`` is iterated to find the sentences containing each
    keyword and is then handed unchanged to ``sentence_distance_weight``. The
    topics are the non-null labels of column ``mbti`` in the global ``topic``
    table.
    """
    # De-duplicate sentences (set iteration order is not fixed between runs).
    sentences = list(set(make_sentence(mbti)))

    # Keyword -> indices of the sentences containing it (only non-empty hits).
    matched = {}
    for keyword in negative_mbti_topic:
        positions = keyword_match(keyword,sentences)
        if positions:
            matched[keyword] = positions
    # Union of all hit indices, ascending.
    n_number = sorted({pos for positions in matched.values() for pos in positions})

    mbti_topic = list(topic[mbti].dropna().keys())
    w_dict = dict.fromkeys(mbti_topic, 0)
    c_dict = dict.fromkeys(mbti_topic, 0)
    return sentence_distance_weight(negative_mbti_topic,mbti_topic,
                                    sentences,
                                    n_number,w_dict,c_dict)

In [None]:
# Reload the stop words and compute the topic weights for the first MBTI type.
# NOTE(review): this path is spelled 'topicStopwords.txt' while read_stopwords'
# default is 'TopicStopwords.txt' — on a case-sensitive filesystem only one of
# the two can exist; confirm the real file name.
stopwords = read_stopwords('./topicStopwords.txt')
# NOTE(review): `k` is only assigned in a cell further down the notebook, so
# this line fails on a fresh Restart & Run All.
w = topic_weight(mbti_list[0],k)

In [None]:
z=top_n(w,False,160)

In [None]:
z

In [None]:
# Write the current negative-keyword table to a second file, keeping
# non-ASCII characters unescaped.
with open('./test2_topic.json','w') as f:
    json.dump(negative_mbti_topic,
              f,ensure_ascii=False,indent=4)

In [None]:
# Read the saved negative-keyword table back and print it on one line.
with open('./negative_topic.json', 'r') as f:
    json_data = json.load(f)
print(json.dumps(json_data,ensure_ascii=False))

In [None]:
import json
# Load the movie topic table and print it on one line.
# NOTE(review): this rebinds `json_data`, replacing whatever was loaded into it
# before; later cells index `json_data` by MBTI code, which presumably expects
# the negative-topic data instead — confirm the intended run order.
with open('./EndData/movie_topic.json', 'r') as f:
    json_data = json.load(f)
print(json.dumps(json_data,ensure_ascii=False))

In [4]:
json_data

{'시그널_80987077': {'김은희': '0.11179001',
  '훌륭': '0.044097003',
  '감독': '0.034989327',
  '부분': '0.02732882',
  '취향': 0.032388373,
  '아쉬움': '0.021212103',
  '명품': 0.033709268,
  '평가': 0.036274666,
  '김원석': '0.01820356',
  '스릴': 0.031745851,
  '매력': 0.03336074,
  '캐스팅': '0.014408951',
  '극본': '0.013127106',
  '짜임': '0.01302009',
  '남자': '0.012593454',
  '개연': '0.011187734',
  '저격': '0.009767651',
  '연출': '0.008430909',
  '인물': '0.023899896',
  '이재한': '0.17317721',
  '형사': '0.16158855',
  '박해영': '0.061357014',
  '경위': '0.03374224',
  '각본': '0.032785393',
  '수현': '0.026945515',
  '대박': '0.024475675',
  '차수': '0.021392163',
  '사랑': 0.087158632,
  '영화관': 0.024526955,
  '능력': '0.013289112',
  '행복': 0.069647401,
  '느낌': '0.068425864',
  '음악': '0.0116603905',
  '집중': '0.0115233585',
  '대사': 0.059616956,
  '진웅': '0.010756522',
  '만점': 0.01910707,
  '메시지': '0.008728352',
  '미생': '0.008488498',
  '세상': '0.045707166',
  '장면': 0.079018643,
  '정의': '0.030474007',
  '간절': 0.046827529,
  '어색': '0.0238299

In [None]:
# Keywords shared by every MBTI type: start from the first type's keys and
# intersect with each type's keys in turn.
k = set(json_data[mbti_list[0]].keys())
for kw in mbti_list:
    try:
        temp = set(json_data[kw].keys())
    except (KeyError, AttributeError):
        # Entry missing or not a mapping: report the type and leave `k` as is.
        # (Only these two errors are expected; anything else should surface.)
        print(kw)
        continue
    k = k&temp


In [None]:
kk=['거부']

In [None]:
k

In [None]:
def ext_text(sentence):
    """Return the Mecab tokens of ``sentence`` whose tag is one of
    VA+ETM, NNG, NNP, VA, XR, VX, VV or VCN+EC.

    No length or stop-word filtering is applied here.
    """
    kept_tags = ('VA+ETM', 'NNG', 'NNP', 'VA', 'XR', 'VX', 'VV', 'VCN+EC')
    selected = []
    for surface, tag in mecab.pos(sentence):
        if tag in kept_tags:
            selected.append(surface)
    return selected

In [None]:
mbti_list[10]

In [None]:
stopwords = read_stopwords('./topicStopwords.txt')

In [None]:
# Drop stop words from the fourth MBTI type's entry.
stopwords = read_stopwords('./TopicStopwords.txt')
# Iterating the entry yields its keys (when it is a dict), so `nouns` is a
# list of keywords.
nouns = [x for x in json_data[mbti_list[3]] if x not in stopwords]
# NOTE(review): this replaces the entry in `json_data` with a list, so code
# that later calls .keys() on it will no longer work for this type.
json_data[mbti_list[3]] = nouns

In [None]:
json_data[mbti_list[3]]

In [None]:
ext_text('이들은 직무에서 요구하는 이상을 생각하며 일을 하고 위기 상황에서도 침착하고 충동적으로 일을 처리 하지 않는 사람들이다')

In [None]:
literal_eval(pd.read_csv('./mbti_namuwiki.csv').iloc[0]['content'])

In [None]:
mecab.pos('하지만 감각적인 성격이라고 해서 반드시 추상적인 사고를 거부하는 것은 아닙니다')

In [None]:
고집
무시
의존
비교
오만
무례
방해
어렵
스트레스
후회
예민
민감
의심
포기
신경
피곤
불편
짜증
걱정
답답
귀찮
거부
외로움
상처
망상
공격
어리석
호불호
기안
무뚝뚝
허세
슈가
지루
불만
부정
게으름
독신주의
서툴
일관성