## Import

In [1]:
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting transformers<5.0.0,>=4.6.0 (from sentence_transformers)
  Downloading transformers-4.33.2-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m52.8 MB/s[0m eta [36m0:00:00[0m
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface-hub>=0.4.0 (from sentence_transformers)
  Downloading huggingface_hub-0.17.2-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB

In [2]:
import re
import pandas as pd
import numpy as np
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

## Random Seed

In [3]:
SEED = 0

np.random.seed(SEED)
random.seed(SEED)

## Load Data

In [4]:
from google.colab import drive
drive.mount('/content/drive/')

import os
os.chdir('./drive/MyDrive/Storage/Github/hyuckjinkim/data-scientist-competitions/Dacon/18_뉴스기사레이블복구/')

Mounted at /content/drive/


In [5]:
df = pd.read_csv('./data/news.csv')
df.head()

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...


In [6]:
# 제목 + 내용
df['text'] = df['title'] + ' : ' + df['contents']
df.head()

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row : MADR...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city : In Bosnia,..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores : Macrom...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


## Pre-processing

In [7]:
def preprocess_text(text):
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)

    # 멘션 제거
    text = re.sub(r'@\w+', '', text)

    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')

    # 공백 및 특수문자 제거
    text = re.sub(r'\s+', ' ', text).strip()

    # 숫자 제거
    text = re.sub(r'\d+', '', text)

    return text.lower()

In [8]:
df['processed_text'] = df['text'].apply(preprocess_text)

## Feature Extraction

In [19]:
%%time
# 2m

# Sentence BERT 모델 로드
model_names = ['paraphrase-distilroberta-base-v1','paraphrase-MiniLM-L6-v2','all-mpnet-base-v2']
model_name = model_names[1]
model = SentenceTransformer(model_name,device='cuda')

# 텍스트 feature 추출
sentence_embeddings = model.encode(df['text'].tolist())

# 추출한 feature를 데이터프레임에 저장
df_embeddings = pd.DataFrame(sentence_embeddings)

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

CPU times: user 56.1 s, sys: 703 ms, total: 56.8 s
Wall time: 54.1 s


## Clustering

In [20]:
# Sentence BERT 임베딩을 사용하여 군집화 수행
kmeans = KMeans(n_clusters=6, random_state=SEED)

df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)



## Post-processing

### Entertainment: 0 -> 1

In [21]:
# i=0
# for text in df[df['kmeans_cluster'] == i]['text'].head():
#     print(text,'\n')

In [22]:
df[df['kmeans_cluster'] == 0]['text'].head(3)

1     Bruce Lee statue for divided city : In Bosnia,...
10    Harry #39;s argy-bargy : PRINCE Charles has as...
16    Fischer's Fiancee: Marriage Plans Genuine (AP)...
Name: text, dtype: object

In [23]:
print(df['text'][1])
print(df['text'][10])
print(df['text'][16])

Bruce Lee statue for divided city : In Bosnia, where one man #39;s hero is often another man #39;s villain, some citizens have decided to honour one whom Serbs, Croats and Muslims can all look up to - the kung fu great Bruce Lee.
Harry #39;s argy-bargy : PRINCE Charles has asked Scotland Yard for an in-depth report on his son Harry #39;s trip to Argentina after reports of excessive drinking and a kidnap plot.
Fischer's Fiancee: Marriage Plans Genuine (AP) : AP - Former chess champion Bobby Fischer's announcement thathe is engaged to a Japanese woman could win him sympathy among Japanese officials and help him avoid deportation to the United States, his fiancee and one of his supporters said Tuesday.


### Sports: 1 -> 3

In [24]:
df[df['kmeans_cluster'] == 1]['text'].head(3)

2     Only Lovers Left Alive's Tilda Swinton Talks A...
25    Be on TOP : //www.huffingtonpost.com/entry/be-...
28    Cate Blanchett Set To Star As Lucille Ball In ...
Name: text, dtype: object

In [25]:
print(df['text'][0])
print(df['text'][13])
print(df['text'][22])

Spanish coach facing action in race row : MADRID (AFP) - Spanish national team coach Luis Aragones faces a formal investigation after Spain #39;s Football Federation decided to open disciplinary proceedings over racist comments about Thierry Henry of France and Arsenal.
GAME DAY PREVIEW Game time: 6:00 PM : CHARLOTTE, North Carolina (Ticker) -- The Detroit Shock face a critical road test Saturday when they take on the Charlotte Sting at Charlotte Coliseum.
College Basketball: Georgia Tech, UConn Win : ATLANTA (Sports Network) - BJ Elder poured in a game-high 27 points to lead fourth-ranked Georgia Tech to a convincing 99-68 win over Michigan in the ACC-Big Ten Challenge at Alexander Memorial Coliseum.


### Politics: 2 -> 2

In [None]:
df[df['kmeans_cluster'] == 2]['text'].head(3)

2    Only Lovers Left Alive's Tilda Swinton Talks A...
6    Time to Talk Baseball : It's time to talk abou...
7    Bump Stock Maker Resumes Sales One Month After...
Name: text, dtype: object

In [None]:
print(df['text'][2])
print(df['text'][6])
print(df['text'][7])

Only Lovers Left Alive's Tilda Swinton Talks About Almost Quitting Acting and Yasmine Hamdan Performs 'Hal' Live In NYC   (HuffPo Exclusive Videos) authors : Yasmine Hamdan performs 'Hal' which she also sings in the film during a scene when two world-weary vampires begin to heal and find a way to continue living as they remember the power and mystery of creation itself.
Time to Talk Baseball : It's time to talk about the serious risks and potential benefits of building an expensive ballpark in Washington.
Bump Stock Maker Resumes Sales One Month After Las Vegas Mass Shooting authors : Move along nothing to see here.


### Business: 3 -> 0

In [None]:
df[df['kmeans_cluster'] == 3]['text'].head(3)

11    Kerry rolls out tax-cut plan for middle class ...
20    Deere's Color Is Green : With big tractors, bi...
50    UN Predicts Boom In Robot Labor : The use of r...
Name: text, dtype: object

In [None]:
print(df['text'][11])
print(df['text'][20])
print(df['text'][50])

Kerry rolls out tax-cut plan for middle class : After two weeks of focusing on Iraq, Democratic presidential challenger John Kerry turned his emphasis to the economy Saturday, delivering what he called a plan for  quot;middle-class families.
Deere's Color Is Green : With big tractors, big sales, and big earnings, Deere's hoeing a profitable row.
UN Predicts Boom In Robot Labor : The use of robots around the home to mow lawns, vacuum floors and manage other chores is set to surge sevenfold by 2007 as more consumers snap up smart machines, the United Nations said.


### Tech: 4 -> 4

In [None]:
df[df['kmeans_cluster'] == 4]['text'].head(3)

3    Macromedia contributes to eBay Stores : Macrom...
4    Qualcomm plans to phone it in on cellular repa...
5    Thomson to Back Both Blu-ray and HD-DVD : Comp...
Name: text, dtype: object

In [None]:
print(df['text'][3])
print(df['text'][4])
print(df['text'][5])

Macromedia contributes to eBay Stores : Macromedia has announced a special version of its Contribute website editing application designed to simplify the creation and customisation of eBay Stores.
Qualcomm plans to phone it in on cellular repairs : Over-the-air fixes for cell phones comes to Qualcomm's CDMA.
Thomson to Back Both Blu-ray and HD-DVD : Company, one of the core backers of Blu-ray, will also support its rival format.


### World: 5 -> 5

In [None]:
df[df['kmeans_cluster'] == 5]['text'].head(3)

18    A Fair Way to Choose Candidates for Republican...
25    Be on TOP : //www.huffingtonpost.com/entry/be-...
33    Memo To EPA Chief Pruitt : //www.huffingtonpos...
Name: text, dtype: object

In [None]:
print(df['text'][18])
print(df['text'][25])
print(df['text'][33])

A Fair Way to Choose Candidates for Republican Debate : //www.huffingtonpost.com/entry/a-fair-way-to-choose-cand_b_7922194.html short_description
Be on TOP : //www.huffingtonpost.com/entry/be-on-top-amazon-best-sel_b_12508618.html short_description
Memo To EPA Chief Pruitt : //www.huffingtonpost.com/entry/memo-to-epa-chief-pruitt-lets-end-subsidies-for-fossil_us_59ee9567e4b0b8a51417bcc6 short_description


### Mapping

In [None]:
mapping_dict = {
    0: 1,
    1: 3,
    2: 2,
    3: 0,
    4: 4,
    5: 5
}

In [None]:
df['mapping'] = df['kmeans_cluster'].apply(lambda x: mapping_dict[x])

## Submission

In [None]:
sample = pd.read_csv('sample_submission.csv')

In [None]:
sample['category'] = df['mapping'].values
sample['category'].head()

0    3
1    1
2    2
3    4
4    4
Name: category, dtype: int64

In [None]:
sample.to_csv('baseline_submit.csv', index=False)