In [10]:
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re

In [2]:
def preprocessing(text):
    """Clean a raw text blob down to Hangul, digits, spaces and ``! ? . ,``.

    Steps, in order: drop or replace invisible / exotic-space characters, mask
    e-mail addresses, strip hashtags, @-mentions, URLs, news copyright notices
    and image-source labels, normalise punctuation, delete every character
    outside the allowed set, and collapse runs of whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with no leading or trailing whitespace.
    """
    # Remove characters that tend to cause trouble (zero-width space, BOM) and
    # spell the ellipsis out as three dots.
    bad_chars = {"\u200b": "", "…": " ... ", "\ufeff": ""}
    for bad_char, replacement in bad_chars.items():
        text = text.replace(bad_char, replacement)

    # Replace exotic space characters with a plain space.
    error_chars = {"\u3000": " ", "\u2009": " ", "\u2002": " ", "\xa0": " "}
    for error_char, replacement in error_chars.items():
        text = text.replace(error_char, replacement)

    # Mask e-mail addresses. The hyphen is escaped so that "+-_" is not read as
    # a character range (which also matched , / : ; < = > ? @ [ \ ] ^ and ate
    # punctuation in front of the address). The brackets of the placeholder are
    # removed by the character filter below, so it survives as "이메일".
    text = re.sub(r"[a-zA-Z0-9+\-_.]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", "[이메일]", text).strip()

    # Remove "#word" tokens.
    text = re.sub(r"#\S+", "", text).strip()

    # Remove "@word" tokens.
    text = re.sub(r"@\w+", "", text).strip()

    # Remove URLs.
    text = re.sub(r"(http|https)?:\/\/\S+\b|www\.(\w+\.)+\S*", "", text).strip()
    text = re.sub(r"pic\.(\w+\.)+\S*", "", text).strip()

    # Remove news copyright notices.
    re_patterns = [
        r"\<저작권자(\(c\)|ⓒ|©|\(Copyright\)|(\(c\))|(\(C\))).+?\>",
        r"저작권자\(c\)|ⓒ|©|(Copyright)|(\(c\))|(\(C\))"
    ]
    for re_pattern in re_patterns:
        text = re.sub(re_pattern, "", text).strip()

    # Remove labels attached to images embedded in news text. The quantifiers
    # are non-greedy so that only the label itself is removed, not everything
    # up to the last ") " on the line.
    text = re.sub(
        r"\(출처 ?= ?.+?\) |\(사진 ?= ?.+?\) |\(자료 ?= ?.+?\)| \(자료사진\) |사진=.+?기자 ",
        "",
        text,
    ).strip()

    # Repeated-character normalisation (repeat_normalize) is currently disabled.

    # Replace punctuation / symbols that tend to cause trouble.
    punct_mapping = {
        "‘": "'", "₹": "e", "´": "'", "°": "", "€": "e", "™": "tm",
        "√": " sqrt ", "×": "x", "²": "2", "—": "-", "–": "-", "’": "'",
        "_": "-", "`": "'", '“': '"', '”': '"', "£": "e", '∞': 'infinity',
        'θ': 'theta', '÷': '/', 'α': 'alpha', '•': '.', 'à': 'a', '−': '-',
        'β': 'beta', '∅': '', '³': '3', 'π': 'pi',
    }
    for p, replacement in punct_mapping.items():
        text = text.replace(p, replacement)

    # Keep only the allowed characters; everything else (Latin letters,
    # brackets, newlines, tabs, ...) is deleted outright.
    reg = re.compile(r'[^\‘ \”\“\’가-힣0-9\!\?\.\,]')
    text = reg.sub('', text)

    # Collapse runs of whitespace. No newline can be left at this point because
    # the filter above already removed them, so no separate newline pass is needed.
    text = re.sub(r"\s+", " ", text).strip()

    return text

In [4]:
# Read the raw hobby corpus; the cell that builds hobby_dict iterates it as
# a mapping whose values are joined with ' '.join(...).
with open('hobby_data_50.json', encoding='utf-8') as f:
    data = json.loads(f.read())

In [18]:
# For every hobby: clean the joined text, split it on ". ", and keep the unique
# pieces that are between 10 and 200 characters long.
hobby_dict = {}
for hobby in data:
    cleaned = preprocessing(' '.join(data[hobby]))
    # An explicit length filter replaces the previous sorted-slice built on
    # list.index(10) / list.index(200): that raised ValueError whenever no piece
    # was exactly 10 characters long, and kept every over-long piece whenever
    # none was exactly 200 characters long.
    kept = [piece for piece in cleaned.split('. ') if 10 <= len(piece) <= 200]
    # dict.fromkeys de-duplicates without the arbitrary ordering of set(), so
    # the JSON written from hobby_dict is reproducible; shortest pieces first.
    hobby_dict[hobby] = sorted(dict.fromkeys(kept), key=len)

In [21]:
# Persist the cleaned sentences as tab-indented JSON, keeping Hangul unescaped.
with open('hobby_preprocessed.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(hobby_dict, ensure_ascii=False, indent='\t'))

In [2]:
# Reload the cleaned sentences written to hobby_preprocessed.json.
with open('hobby_preprocessed.json', encoding='utf-8') as f:
    data_pre = json.loads(f.read())

In [11]:
# Bind data_pre to the contents of new_dataset.json (this replaces whatever
# data_pre held before the cell ran).
with open('new_dataset.json', encoding='utf-8') as f:
    data_pre = json.loads(f.read())

In [12]:
# Print the number of entries per hobby, followed by the grand total.
total = 0
for hobby, entries in data_pre.items():
    count = len(entries)
    print(f"{hobby}: {count}")
    total += count
print(f"Total: {total}")

등산: 11535
무에타이: 4353
클라이밍: 1691
수상스키: 3834
스키: 14154
배드민턴: 7516
요가: 8581
헬스: 5166
다꾸: 6023
유튜버되기: 4613
풍경사진찍기: 4258
하늘사진찍기: 2335
카톡이모티콘만들기: 7022
블로그쓰기: 7930
홈카페: 7849
연극보기: 7338
전시회구경: 2849
노래녹음하기: 4594
바이닐수집: 7903
과일청담그기: 3391
캠핑: 1772
둘레길걷기: 2499
레고조립: 1789
서핑: 2703
홈베이킹: 1724
k-pop 댄스: 819
디제잉배우기: 368
목공예: 1442
가죽공예: 1266
도예: 2113
수영: 2161
테니스: 1999
Total: 143590


In [79]:
# Seed phrases for each of the eight MBTI letters. Every value is a one-element
# list holding a space-separated keyword string; get_score prepends that list
# to the sentences so the phrase becomes the query document.
keywords = dict(
    E=["바깥 외향 활발"],
    I=["실내 조용 혼자"],
    N=["생각 상상 "],
    S=["기분 느낌"],
    F=["감성 공감 감정"],
    T=["이성 이해"],
    J=["계획 오래"],
    P=["즉흥 잠깐"],
)

In [29]:
def get_score(sentences, hobby, category):
    """Score how strongly a hobby's sentences match one keyword category.

    The category's keyword phrase and the sentences are vectorised together
    with TF-IDF; the score is the mean cosine similarity between the phrase
    and the sentences, averaged over the sentences whose similarity is
    non-zero.

    Args:
        sentences: Sentences to score. When None, ``data_pre[hobby]`` is used.
        hobby: Key into ``data_pre``; only consulted when ``sentences`` is None.
        category: Key into ``keywords`` (assumed to map to a single-document list).

    Returns:
        The mean non-zero similarity, or 0 when there are no sentences or no
        sentence shares a term with the keyword phrase.
    """
    # Previously the argument was overwritten unconditionally; honour it and
    # fall back to the global corpus only when nothing was passed.
    if sentences is None:
        sentences = data_pre[hobby]
    sentences = list(sentences)
    if not sentences:
        # cosine_similarity rejects an empty second operand; nothing to score.
        return 0

    documents = keywords[category] + sentences
    tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    # Row 0 is the keyword phrase, the remaining rows are the sentences.
    similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:]).tolist()[0]

    matched = [value for value in similarities if value != 0.0]
    if not matched:
        # Explicit guard instead of a bare except around the division.
        return 0
    # Summing in descending order keeps the floating-point result independent
    # of the order in which the sentences were supplied.
    return sum(sorted(matched, reverse=True)) / len(matched)

In [80]:
# Score every hobby against every keyword category:
# tfidf_scores[hobby][category] -> value returned by get_score.
tfidf_scores = {}
for hobby, sentences in data_pre.items():
    tfidf_scores[hobby] = {
        category: get_score(sentences, hobby, category)
        for category in keywords
    }

def _share(first, second):
    """Return first / (first + second), or 0 when the two scores sum to zero."""
    denominator = first + second
    # Explicit zero check replaces the bare try/except around the division,
    # which also hid unrelated errors such as a missing score key.
    return first / denominator if denominator else 0


data_result = {}    # hobby -> [E, N, F, J] shares
mbti_count = {}     # four-letter type -> number of hobbies assigned to it
mbti_count_2 = {}   # single letter -> number of hobbies carrying it
for hobby, scores in tfidf_scores.items():
    E = _share(scores['E'], scores['I'])
    N = _share(scores['N'], scores['S'])
    F = _share(scores['F'], scores['T'])
    J = _share(scores['J'], scores['P'])
    norm_scores = [E, N, F, J]

    # A share of at least 0.5 selects the first letter of each pair.
    u_type = (
        ('E' if E >= 0.5 else 'I')
        + ('N' if N >= 0.5 else 'S')
        + ('F' if F >= 0.5 else 'T')
        + ('J' if J >= 0.5 else 'P')
    )

    mbti_count[u_type] = mbti_count.get(u_type, 0) + 1
    for c in u_type:
        mbti_count_2[c] = mbti_count_2.get(c, 0) + 1

    data_result[hobby] = norm_scores

In [81]:
# Report how many distinct types were assigned, the sorted (type, count)
# pairs, and the per-letter tallies, one per line.
sorted_dict = sorted(mbti_count.items())
print(f"mbti count: {len(sorted_dict)}", sorted_dict, mbti_count_2, sep="\n")

mbti count: 14
[('ENFJ', 3), ('ENFP', 2), ('ENTP', 2), ('ESFJ', 3), ('ESFP', 1), ('ESTJ', 1), ('ESTP', 5), ('INFJ', 4), ('INTJ', 2), ('INTP', 1), ('ISFJ', 2), ('ISFP', 2), ('ISTJ', 2), ('ISTP', 2)]
{'I': 15, 'S': 18, 'F': 17, 'J': 17, 'P': 15, 'E': 17, 'N': 14, 'T': 15}


In [None]:
# ENTJ and INFP are missing: no hobby was assigned to either type (14 of 16 types appear above).

In [134]:
# Store the tallies in data_result under the 'mbtis' and 'category' keys,
# next to the per-hobby entries.
data_result.update(mbtis=sorted_dict, category=mbti_count_2)

In [9]:
# Write data_result as tab-indented JSON, keeping Hangul unescaped.
with open('data_result_float.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(data_result, ensure_ascii=False, indent='\t'))

In [2]:
# Load results from data_result.json; the cells below read a 'score' entry
# from each hobby's value.
with open('data_result.json', encoding='utf-8') as f:
    data_result = json.loads(f.read())

In [3]:
# Echo the loaded results for a visual check.
data_result

{'등산': {'type': 'ISFJ', 'score': [46, 0, 100, 50]},
 '무에타이': {'type': 'INFP', 'score': [0, 100, 100, 48]},
 '클라이밍': {'type': 'ISFJ', 'score': [46, 37, 100, 51]},
 '수상스키': {'type': 'INFJ', 'score': [46, 68, 100, 54]},
 '스키': {'type': 'ESFP', 'score': [55, 0, 60, 49]},
 '배드민턴': {'type': 'ESTP', 'score': [57, 0, 38, 46]},
 '요가': {'type': 'ISTP', 'score': [46, 0, 44, 47]},
 '헬스': {'type': 'ISFJ', 'score': [42, 0, 54, 50]},
 '다꾸': {'type': 'ISTJ', 'score': [0, 49, 30, 59]},
 '유튜버되기': {'type': 'ESTP', 'score': [59, 35, 44, 41]},
 '풍경사진찍기': {'type': 'INTP', 'score': [47, 59, 31, 46]},
 '하늘사진찍기': {'type': 'ESFJ', 'score': [52, 0, 100, 62]},
 '카톡이모티콘만들기': {'type': 'INTP', 'score': [0, 77, 45, 47]},
 '블로그쓰기': {'type': 'INTJ', 'score': [49, 56, 43, 56]},
 '홈카페': {'type': 'ESFP', 'score': [55, 0, 100, 40]},
 '연극보기': {'type': 'INTJ', 'score': [0, 53, 42, 51]},
 '전시회구경': {'type': 'ENTP', 'score': [50, 50, 35, 43]},
 '노래녹음하기': {'type': 'ESFJ', 'score': [58, 0, 100, 56]},
 '바이닐수집': {'type': 'INTP', 's

In [7]:
# Hobbies to export; the cells below only copy entries whose key is in this list.
hobby_list = [
    "등산",
    "무에타이",
    "클라이밍",
    "수상스키",
    "스키",
    "배드민턴",
    "요가",
    "헬스",
    "다꾸",
    "유튜버되기",
    "풍경사진찍기",
    "하늘사진찍기",
    "카톡이모티콘만들기",
    "블로그쓰기",
    "홈카페",
    "연극보기",
    "전시회구경",
    "노래녹음하기",
    "바이닐수집",
    "과일청담그기",
]

In [27]:
# Start from an empty mapping; a later cell fills it with hobby -> score list.
dataset = dict()

In [28]:
# Copy the 'score' list of every hobby named in hobby_list into dataset.
# The enumerate() index was never used, so iterate over the keys directly.
for hobby in data_result:
    if hobby in hobby_list:
        dataset[hobby] = data_result[hobby]['score']

In [29]:
# Echo the filtered hobby -> score mapping for a visual check.
dataset

{'등산': [46, 0, 100, 50],
 '무에타이': [0, 100, 100, 48],
 '클라이밍': [46, 37, 100, 51],
 '수상스키': [46, 68, 100, 54],
 '스키': [55, 0, 60, 49],
 '배드민턴': [57, 0, 38, 46],
 '요가': [46, 0, 44, 47],
 '헬스': [42, 0, 54, 50],
 '다꾸': [0, 49, 30, 59],
 '유튜버되기': [59, 35, 44, 41],
 '풍경사진찍기': [47, 59, 31, 46],
 '하늘사진찍기': [52, 0, 100, 62],
 '카톡이모티콘만들기': [0, 77, 45, 47],
 '블로그쓰기': [49, 56, 43, 56],
 '홈카페': [55, 0, 100, 40],
 '연극보기': [0, 53, 42, 51],
 '전시회구경': [50, 50, 35, 43],
 '노래녹음하기': [58, 0, 100, 56],
 '바이닐수집': [0, 50, 36, 49],
 '과일청담그기': [51, 0, 100, 46]}

In [33]:
# Write the filtered scores as compact JSON, keeping Hangul unescaped.
with open('dataset.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(dataset, ensure_ascii=False))

In [5]:
# Start from an empty mapping; a later cell fills it with index -> hobby name.
hobby_enum = dict()

In [8]:
# Map each selected hobby to its position among the keys of data_result
# (the index counts every key, including those not in hobby_list).
for idx, hobby in enumerate(data_result):
    if hobby not in hobby_list:
        continue
    hobby_enum[idx] = hobby

In [10]:
# Bundle the index -> hobby mapping with a four-element list of 101s.
# NOTE(review): the meaning of the value 101 is not visible here — confirm.
base_info = {}
base_info["hobby_enum"] = hobby_enum
base_info["list"] = [101] * 4

In [12]:
# Write base_info as tab-indented JSON, keeping Hangul unescaped.
with open('base_info.json', 'w', encoding='utf-8') as f:
    f.write(json.dumps(base_info, indent='\t', ensure_ascii=False))