In [1]:
# !pip install tensorflow tensorflow_hub tensorflow_text
# !pip install sentence_transformers
# !pip install torch
# !pip install 'top2vec[sentence_transformers]'
# !pip install transformers

In [2]:
# from google.colab import drive
# drive.mount('/content/drive/')

# import os
# os.chdir('./drive/MyDrive/Storage/Github/hyuckjinkim/data-scientist-competitions/Dacon/18_뉴스기사레이블복구/')

In [3]:
import pandas as pd
import numpy as np
from tqdm import tqdm
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
import re

def generate_combinations_with_spaces(input_list):
    """Return every variant of the joined pieces with optional leading spaces.

    For a list of ``n`` string pieces, each piece may or may not be preceded
    by a single space, which yields ``2 ** n`` joined strings.

    Args:
        input_list: Sequence of string pieces, joined in order.

    Returns:
        List of ``2 ** n`` strings. Variant number ``mask`` puts a space in
        front of piece ``pos`` exactly when bit ``pos`` of ``mask`` is set, so
        the first entry is the plain concatenation.
    """
    piece_count = len(input_list)
    return [
        ''.join(
            (' ' if mask & (1 << pos) else '') + piece
            for pos, piece in enumerate(input_list)
        )
        for mask in range(1 << piece_count)
    ]

def replace_patterns(text,repl,input_list):
    """Replace every spaced variant of the joined pieces in ``text``.

    The variants come from ``generate_combinations_with_spaces(input_list)``
    and are applied one after another, each on the result of the previous
    substitution. Every variant is passed to ``re.sub`` unescaped, so it is
    interpreted as a regular expression.

    Args:
        text: String to process.
        repl: Replacement handed to ``re.sub``.
        input_list: Pieces whose spaced combinations are replaced.

    Returns:
        The string after all substitutions.
    """
    result = text
    spaced_variants = generate_combinations_with_spaces(input_list)
    for variant in spaced_variants:
        result = re.sub(variant, repl, result)
    return result

In [5]:
from bs4 import BeautifulSoup
def remove_html_tags_using_bs(text):
    """Strip HTML markup from ``text``.

    The string is parsed by BeautifulSoup with the ``'html.parser'`` backend
    and reduced to its text content; any ``<...>`` span still present
    afterwards is removed with a regular expression.

    Args:
        text: String that may contain HTML.

    Returns:
        The text with tags removed.
    """
    extracted = BeautifulSoup(text, 'html.parser').get_text()
    # Remove anything that still starts with '<' and ends with '>'.
    return re.sub(r'<[^>]*>', '', extracted)

In [6]:
def preprocess_text(text):
    """Clean one raw news string into lowercase ASCII text.

    Steps, in order: collapse repeated spaces, strip HTML markup, delete
    leftover entity fragments (``#39;s``, ``u00a0``, ``ufeff``, ``quot;``),
    drop the `` // short_description`` marker and stray `` // `` separators,
    remove the backslash in front of a dollar sign, turn ``#151;`` into
    ``-``, remove URLs, hashtags and mentions, drop every non-ASCII
    character, normalise whitespace and lowercase.

    Args:
        text: Raw string to clean.

    Returns:
        The cleaned string.
    """
    # (1) Collapse every run of two or more spaces into a single space.
    text = re.sub(' {2,}', ' ', text)

    # (2) Strip HTML tags.
    text = remove_html_tags_using_bs(text)

    # (3) Remove entity fragments left behind, with or without spaces between
    #     their pieces.
    fragment_groups = [
        [r'#39',r';s'],      # apostrophe entity: "#39;s"
        [r'u00a0',r'ufeff'], # non-breaking space followed by zero-width no-break space
        [r'u00a0'],          # the two may also appear on their own
        [r'ufeff'],
    ]
    for fragment_group in fragment_groups:
        text = replace_patterns(text,repl='',input_list=fragment_group)

    # (4) Special cases.
    text = re.sub('quot;','',text)  # double-quote entity
    # Drop the description marker BEFORE the generic ' // ' removal below;
    # otherwise ' // ' is consumed first and 'short_description' stays glued
    # to the preceding word. Lowercasing happens later, hence IGNORECASE.
    text = re.sub(r' // short_description','',text,flags=re.IGNORECASE)
    text = re.sub(' // ','',text)   # stray '//' separators left after steps 1-3
    # Backslash in front of a dollar sign: the regex needs an escaped
    # backslash AND an escaped '$' (the former pattern '\$' only matched '$'
    # itself and replaced it with '$', i.e. did nothing).
    text = re.sub(r'\\\$','$',text)
    text = re.sub('#151;','-',text) # em dash entity

    # Remove URLs.
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove hashtags.
    text = re.sub(r'#\w+', '', text)

    # Remove mentions.
    text = re.sub(r'@\w+', '', text)

    # Drop every non-ASCII character (this is what removes emoji).
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Collapse all whitespace runs to one space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()

    # # Remove digits (disabled).
    # text = re.sub(r'\d+', '', text)

    # Lowercase.
    text = text.lower()

    # Second pass for a description marker that only took this exact form
    # after whitespace normalisation and lowercasing.
    text = re.sub(r' // short_description','',text)

    return text

In [7]:
# Load the articles and build the model input "<title>. <contents>",
# cleaned row by row with preprocess_text (progress bar via tqdm.pandas()).
df = pd.read_csv('./data/news.csv')
title_and_contents = df['title'] + '. ' + df['contents']
df['preprocessed_text'] = title_and_contents.progress_apply(preprocess_text)
# Title-only alternative: df['preprocessed_text'] = df.title.apply(preprocess_text)
df.head()

100%|██████████| 60000/60000 [00:06<00:00, 8937.25it/s]


Unnamed: 0,id,title,contents,preprocessed_text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action in race row. madri...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue for divided city. in bosnia, ..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,only lovers left alive's tilda swinton talks a...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes to ebay stores. macrome...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans to phone it in on cellular repa...


In [8]:
# df[df.preprocessed_text.str.contains('#')].preprocessed_text.values

In [9]:
# import random
# samples = random.sample(df.preprocessed_text.tolist(),5)
# samples

In [None]:
from top2vec import Top2Vec
# from transformers import AutoTokenizer
# https://github.com/ddangelov/Top2Vec/blob/master/top2vec/Top2Vec.py

In [None]:
%%time
# 2m

embedding_models = ['universal-sentence-encoder','universal-sentence-encoder-large','all-MiniLM-L6-v2']
speeds = ['learn','deep-learn']

embedding_model = embedding_models[2]
speed = speeds[0]

tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

model = Top2Vec(
    df.preprocessed_text.tolist(),
    workers=4,
    embedding_model=embedding_model,
    tokenizer=tokenizer,
    speed=speed,
)

model_path = './models/top2vec_title&contents_model:{}_speed:{}.t2v'.format(embedding_model,speed)
model.save(model_path)
# model = Top2Vec.load(model_path)

In [None]:
# Category names; their list position is used as the numeric label.
keywords = [
    'business',
    'entertainment',
    'politics',
    'sports',
    'tech',
    'world',
]
# Keyword used for ad-hoc exploration: the first entry.
keyword = keywords[0]

In [None]:
# words, word_scores = model.similar_words(keywords=[keyword], keywords_neg=[], num_words=20)
# words

# model.search_topics(keywords=[keyword],num_topics=0,)

In [None]:
# Work on a copy of df and add one 'score_<keyword>' column per category keyword.
model_df = df.copy()
for keyword in tqdm(keywords):
    # Request as many documents as df has rows, so every row can get a score.
    found_docs, found_scores, found_ids = model.search_documents_by_keywords(
        keywords=[keyword],
        num_docs=len(df),
    )
    # Index the scores by the returned document ids; the column assignment
    # below then aligns them with model_df's index.
    # NOTE(review): presumably the ids are the row positions of df - confirm.
    keyword_scores = pd.Series(found_scores, index=found_ids, name='score').sort_index()
    model_df[f'score_{keyword}'] = keyword_scores

In [None]:
# Build the score columns from `keywords`, in `keywords` order. Position i of
# the argmax then always maps to keywords[i]; the former substring match on
# 'score_' relied on column order and would also pick up unrelated columns
# whose name merely contains 'score_'.
score_cols = [f'score_{kw}' for kw in keywords]
# Label each row with the keyword whose score is highest.
model_df['category'] = model_df[score_cols].apply(lambda row: keywords[np.argmax(row)],axis=1)

In [None]:
# Show how many documents were assigned to each category.
model_df['category'].value_counts()

In [None]:
# Fill the sample submission with numeric labels and write it out.
submit = pd.read_csv('./data/sample_submission.csv')
# Capitalized keyword -> its position in `keywords`, used as the label.
mapping_dict = dict(zip(map(str.capitalize, keywords), range(len(keywords))))
# Labels are assigned by position, so submit must have as many rows as model_df.
# NOTE(review): this presumes both files list the articles in the same order - confirm.
submit['category'] = list(map(mapping_dict.__getitem__, model_df.category.str.capitalize()))
submit.to_csv('./out/submit_1.csv', index=False)

In [None]:
# Sanity check: category-name counts next to the numeric-label counts.
display(
    model_df.category.str.capitalize().value_counts(),
    submit.category.value_counts(),
)

In [None]:
# Preview the first rows of the submission frame.
submit.head()