# 데이터 로드

In [15]:
import pandas as pd
import re
import numpy as np

data = pd.read_csv('data/news.csv')
data

Unnamed: 0,id,title,contents
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...
...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...


In [11]:
data['intersection'] = data.apply(lambda row:" ".join(set(row['title'].split()).intersection(set(row['contents'].split()))), axis=1)
data

Unnamed: 0,id,title,contents,intersection
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,coach Spanish
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...",Bruce
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Yasmine and Hamdan 'Hal'
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,to Macromedia eBay
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,to
...,...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...,
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...,of
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...,a Culpepper on
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...,


In [7]:
category = {'category': [0,1,2,3,4,5], 'info':['Business','Entertainment','Politics','Sports','Tech','World']}
category_data = pd.DataFrame(category)
print(category_data)

   category           info
0         0       Business
1         1  Entertainment
2         2       Politics
3         3         Sports
4         4           Tech
5         5          World


# 전처리

In [18]:
data['text'] = data['title'] + ':' + data['contents']
data.head()

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


In [29]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    
    # text 소문자화
    text = text.lower()
    
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)
    
    # 멘션 제거
    text = re.sub(r'@\w+', '', text)
    
    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # 날짜 및 기자 이름 등 불필요한 정보가 포함된 문자열 패턴 삭제
    text = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2}, \d{4}', '', text)
    
    # 숫자와 일부 특수문자('.', ',', '!')만 남기고 모두 삭제 
    # (문장 구조 정보 유지를 위해 일부 구두점은 유지하되, 그 외 불필요한 문자는 삭제)
    text = re.sub(r'[^\w\s.,!]', '',  text)

    # Stopwords removal - NLTK library 사용 
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if not word in stop_words]

    return ' '.join(filtered_text).lower()

data['text'] = data['text'].apply(preprocess_text)
data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\이동현\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\이동현\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race rowmadrid afp...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided cityin bosnia , one m..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alives tilda swinton talks almost ...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay storesmacromedia a...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairsovertheai...


# Baseline

In [38]:
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Sentence BERT 모델 로드
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 텍스트 feature 추출
sentence_embeddings = model.encode(data['text'].tolist())

# 추출한 feature를 데이터프레임에 저장
df_embeddings = pd.DataFrame(sentence_embeddings)

In [40]:
# Sentence BERT 임베딩을 사용하여 군집화 수행
kmeans = KMeans(n_clusters=6, random_state=42)

data['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)
data.head()

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,id,title,contents,text,kmeans_cluster
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race rowmadrid afp...,1
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided cityin bosnia , one m...",0
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alives tilda swinton talks almost ...,3
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay storesmacromedia a...,2
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairsovertheai...,2


In [41]:
category_data

Unnamed: 0,category,info
0,0,Business
1,1,Entertainment
2,2,Politics
3,3,Sports
4,4,Tech
5,5,World


In [57]:
data[data['kmeans_cluster'] == 0]['text'].head(5) # world

1     bruce lee statue divided cityin bosnia , one m...
10    harry argybargyprince charles asked scotland y...
29    israel kills 3 palestinians big gaza incursion...
34    folly sole superpower writ small authorsthink ...
37    deep impact space probe aims slam comet reuter...
Name: text, dtype: object

In [58]:
print(data['text'][1])
print(data['text'][10])
print(data['text'][29])
print(data['text'][34])
print(data['text'][37])

bruce lee statue divided cityin bosnia , one man hero often another man villain , citizens decided honour one serbs , croats muslims look kung fu great bruce lee .
harry argybargyprince charles asked scotland yard indepth report son harry trip argentina reports excessive drinking kidnap plot .
israel kills 3 palestinians big gaza incursion reutersreuters israeli forces killed threepalestinians , including two teenagers , wednesday afterstorming northern gaza strip third time asmany months quell palestinian rocket fire israel .
folly sole superpower writ small authorsthink little imperial folly heres backstory . years invading iraq disbanding saddam husseins military u.s. sunk 25 billion standing new iraqi army .
deep impact space probe aims slam comet reutersreuters astronomers plan slam anarmchairsized impactor comet tempel 1 see whatsinside possibly help future scientists determine tokeep space rocks colliding earth .


In [59]:
data[data['kmeans_cluster'] == 1]['text'].head(5) # sport

0     spanish coach facing action race rowmadrid afp...
13    game day preview game time 600 pmcharlotte , n...
21    blake leeper wants first american paralympian ...
22    college basketball georgia tech , uconn winatl...
24    kentucky fan gets national champs tattoo . let...
Name: text, dtype: object

In [60]:
print(data['text'][0])
print(data['text'][13])
print(data['text'][21])
print(data['text'][22])
print(data['text'][24])

spanish coach facing action race rowmadrid afp spanish national team coach luis aragones faces formal investigation spain football federation decided open disciplinary proceedings racist comments thierry henry france arsenal .
game day preview game time 600 pmcharlotte , north carolina ticker detroit shock face critical road test saturday take charlotte sting charlotte coliseum .
blake leeper wants first american paralympian olympics authorsblake leeper may training rio 2016 olympic games found time stop samsung smart lounge talk goals technology help people like compete high level .
college basketball georgia tech , uconn winatlanta sports network bj elder poured gamehigh 27 points lead fourthranked georgia tech convincing 9968 win michigan accbig ten challenge alexander memorial coliseum .
kentucky fan gets national champs tattoo . lets hope happens . authorsthats real confidence bro .


In [61]:
data[data['kmeans_cluster'] == 2]['text'].head(5) # Tech

3     macromedia contributes ebay storesmacromedia a...
4     qualcomm plans phone cellular repairsovertheai...
5     thomson back bluray hddvdcompany , one core ba...
23    ftc files first lawsuit spyware concernsthe fe...
31    sony psp draws crowds lines first day reutersr...
Name: text, dtype: object

In [62]:
print(data['text'][3])
print(data['text'][4])
print(data['text'][5])
print(data['text'][23])
print(data['text'][31])

macromedia contributes ebay storesmacromedia announced special version contribute website editing application designed simplify creation customisation ebay stores .
qualcomm plans phone cellular repairsovertheair fixes cell phones comes qualcomms cdma .
thomson back bluray hddvdcompany , one core backers bluray , also support rival format .
ftc files first lawsuit spyware concernsthe federal trade commission formally announced yesterday first assault spyware bits computer code surreptitiously install computers internet users
sony psp draws crowds lines first day reutersreuters game fans stood lines chillytokyo night among first world get theirhands sony corp.s playstation portable , consumerelectronics firms first handheld game machine .


In [63]:
data[data['kmeans_cluster'] == 3]['text'].head(5) # entertainment

2     lovers left alives tilda swinton talks almost ...
6     time talk baseballits time talk serious risks ...
8     obama marks anniversary 911 attacks moment sil...
9     republican congressman says trump apologize ob...
11    kerry rolls taxcut plan middle classafter two ...
Name: text, dtype: object

In [64]:
print(data['text'][2])
print(data['text'][6])
print(data['text'][8])
print(data['text'][9])
print(data['text'][11])

lovers left alives tilda swinton talks almost quitting acting yasmine hamdan performs hal live nyc huffpo exclusive videos authorsyasmine hamdan performs hal also sings film scene two worldweary vampires begin heal find way continue living remember power mystery creation .
time talk baseballits time talk serious risks potential benefits building expensive ballpark washington .
obama marks anniversary 911 attacks moment silence authorswe stand strong ever .
republican congressman says trump apologize obama uk authorsbest hold breath one .
kerry rolls taxcut plan middle classafter two weeks focusing iraq , democratic presidential challenger john kerry turned emphasis economy saturday , delivering called plan quotmiddleclass families .


In [68]:
data[data['kmeans_cluster'] == 4]['text'].head(5) # Business

7     bump stock maker resumes sales one month las v...
19    congress spikes handout private equity authors...
20    deeres color greenwith big tractors , big sale...
27    kmartsears merger price , qualityaverage custo...
51    oil falls 49 nigeria ceasefirelondon reuters o...
Name: text, dtype: object

In [66]:
print(data['text'][7])
print(data['text'][19])
print(data['text'][20])
print(data['text'][27])
print(data['text'][51])

bump stock maker resumes sales one month las vegas mass shooting authorsmove along nothing see .
congress spikes handout private equity authorsa wall street firms almost big .
deeres color greenwith big tractors , big sales , big earnings , deeres hoeing profitable row .
kmartsears merger price , qualityaverage customers know thing ministers high finance understand price . shoppers thursday billings sears store eager find proposed
oil falls 49 nigeria ceasefirelondon reuters oil prices dropped record highs 50 barrel wednesday u.s. government reported surprise increase crude stocks rebels nigerias oilrich delta region agreed ceasefire .


In [67]:
data[data['kmeans_cluster'] == 5]['text'].head(5) # politics

18    fair way choose candidates republican debate s...
25                                top short_description
33              memo epa chief pruitt short_description
68                     satire save us short_description
76                              watch short_description
Name: text, dtype: object

In [53]:
print(data['text'][18])
print(data['text'][25])
print(data['text'][33])

fair way choose candidates republican debate short_description
top short_description
memo epa chief pruitt short_description


In [70]:
data['kmeans_cluster'].value_counts()

kmeans_cluster
3    14927
1    12011
2    10261
4    10138
0     9940
5     2723
Name: count, dtype: int64

In [71]:
mapping_dict = {
    0: 5,
    1: 3,
    2: 4,
    3: 1,
    4: 0,
    5: 2
}

data['mapping'] = data['kmeans_cluster'].apply(lambda x : mapping_dict[x])

In [72]:
sample = pd.read_csv('data/sample_submission.csv')
sample['category'] = data['mapping'].values
sample

Unnamed: 0,id,category
0,NEWS_00000,3
1,NEWS_00001,5
2,NEWS_00002,1
3,NEWS_00003,4
4,NEWS_00004,4
...,...,...
59995,NEWS_59995,3
59996,NEWS_59996,0
59997,NEWS_59997,3
59998,NEWS_59998,0


In [73]:
sample.to_csv('submit_first.csv', index = False)

# 사전학습 모델

In [12]:
from lbl2vec import Lbl2Vec

Downloading (…)e9125/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)7e55de9125/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)55de9125/config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)125/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [36]:
model = Lbl2Vec(keywords_list=category_data['info'].tolist())

ValueError: keywords_list has to be an iterable list of lists with descriptive keywords of type str

In [2]:
import torch
print(torch.cuda.is_available())

True
