# 데이터 로드

In [1]:
import pandas as pd
import re
import numpy as np

data = pd.read_csv('data/news.csv')

In [None]:
data['intersection'] = data.apply(lambda row:" ".join(set(row['title'].split()).intersection(set(row['contents'].split()))), axis=1)

In [2]:
category = {'category': [0,1,2,3,4,5], 'info':['Business','Entertainment','Politics','Sports','Tech','World']}
category_data = pd.DataFrame(category)
print(category_data)

   category           info
0         0       Business
1         1  Entertainment
2         2       Politics
3         3         Sports
4         4           Tech
5         5          World


# 전처리

In [3]:
data['text'] = data['title'] + ':' + data['contents']
data.head()

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    
    # text 소문자화
    text = text.lower()
    
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)
    
    # 멘션 제거
    text = re.sub(r'@\w+', '', text)
    
    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # 날짜 및 기자 이름 등 불필요한 정보가 포함된 문자열 패턴 삭제
    text = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2}, \d{4}', '', text)
    
    # 숫자와 일부 특수문자('.', ',', '!')만 남기고 모두 삭제 
    # (문장 구조 정보 유지를 위해 일부 구두점은 유지하되, 그 외 불필요한 문자는 삭제)
    text = re.sub(r'[^\w\s.,!]', '',  text)

    # Stopwords removal - NLTK library 사용 
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if not word in stop_words]

    return ' '.join(filtered_text).lower()

data['text'] = data['text'].apply(preprocess_text)
data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\이동현\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\이동현\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race rowmadrid afp...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided cityin bosnia , one m..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alives tilda swinton talks almost ...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay storesmacromedia a...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairsovertheai...


# Baseline

In [None]:
import random
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Sentence BERT 모델 로드
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 텍스트 feature 추출
sentence_embeddings = model.encode(data['text'].tolist())

# 추출한 feature를 데이터프레임에 저장
df_embeddings = pd.DataFrame(sentence_embeddings)
df_embeddings

In [None]:
# Sentence BERT 임베딩을 사용하여 군집화 수행
kmeans = KMeans(n_clusters=6, random_state=42)

data['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)
data.head()

In [None]:
category_data

In [None]:
data[data['kmeans_cluster'] == 0]['text'].head(5) # world

In [None]:
print(data['text'][1])
print(data['text'][10])
print(data['text'][29])
print(data['text'][34])
print(data['text'][37])

In [None]:
data[data['kmeans_cluster'] == 1]['text'].head(5) # sport

In [None]:
print(data['text'][0])
print(data['text'][13])
print(data['text'][21])
print(data['text'][22])
print(data['text'][24])

In [None]:
data[data['kmeans_cluster'] == 2]['text'].head(5) # Tech

In [None]:
print(data['text'][3])
print(data['text'][4])
print(data['text'][5])
print(data['text'][23])
print(data['text'][31])

In [None]:
data[data['kmeans_cluster'] == 3]['text'].head(5) # entertainment

In [None]:
print(data['text'][2])
print(data['text'][6])
print(data['text'][8])
print(data['text'][9])
print(data['text'][11])

In [None]:
data[data['kmeans_cluster'] == 4]['text'].head(5) # Business

In [None]:
print(data['text'][7])
print(data['text'][19])
print(data['text'][20])
print(data['text'][27])
print(data['text'][51])

In [None]:
data[data['kmeans_cluster'] == 5]['text'].head(5) # politics

In [None]:
print(data['text'][18])
print(data['text'][25])
print(data['text'][33])

In [None]:
data['kmeans_cluster'].value_counts()

In [None]:
mapping_dict = {
    0: 5,
    1: 3,
    2: 4,
    3: 1,
    4: 0,
    5: 2
}

data['mapping'] = data['kmeans_cluster'].apply(lambda x : mapping_dict[x])

In [None]:
sample = pd.read_csv('data/sample_submission.csv')
sample['category'] = data['mapping'].values
sample

In [None]:
sample.to_csv('submit_first.csv', index = False)

# sentence-transformers/all-mpnet-base-v2

In [24]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def call_model(df, model_name):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentenceTransformer(model_name).to(device)
    
    bert_embeddings = model.encode(df['text'].tolist())
    sentence_embeddings = pd.DataFrame(bert_embeddings)
    kmeans = KMeans(n_clusters=6, random_state=42)
    df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)
    return df

In [25]:
all_data = call_model(data, "sentence-transformers/all-mpnet-base-v2")
print(all_data.head())

  super()._check_params_vs_input(X, default_n_init=10)


           id                                              title  \
0  NEWS_00000            Spanish coach facing action in race row   
1  NEWS_00001                  Bruce Lee statue for divided city   
2  NEWS_00002  Only Lovers Left Alive's Tilda Swinton Talks A...   
3  NEWS_00003              Macromedia contributes to eBay Stores   
4  NEWS_00004  Qualcomm plans to phone it in on cellular repairs   

                                            contents  \
0  MADRID (AFP) - Spanish national team coach Lui...   
1  In Bosnia, where one man #39;s hero is often a...   
2  Yasmine Hamdan performs 'Hal' which she also s...   
3  Macromedia has announced a special version of ...   
4  Over-the-air fixes for cell phones comes to Qu...   

                                                text  kmeans_cluster  
0  spanish coach facing action race rowmadrid afp...               0  
1  bruce lee statue divided cityin bosnia , one m...               3  
2  lovers left alives tilda swinton talks

In [26]:
def check_category(data, val, count):
    return print(data[data['kmeans_cluster']==val]['text'].head(count))

In [27]:
check_category(all_data, 0, 5) # sports

0     spanish coach facing action race rowmadrid afp...
6     time talk baseballits time talk serious risks ...
13    game day preview game time 600 pmcharlotte , n...
16    fischers fiancee marriage plans genuine apap f...
21    blake leeper wants first american paralympian ...
Name: text, dtype: object


In [28]:
check_category(all_data, 1, 5) # politics

8     obama marks anniversary 911 attacks moment sil...
9     republican congressman says trump apologize ob...
11    kerry rolls taxcut plan middle classafter two ...
12    read live updates south carolina democratic pr...
14    obama administration helps wall street crimina...
Name: text, dtype: object


In [29]:
check_category(all_data, 2, 5) # Business

7     bump stock maker resumes sales one month las v...
19    congress spikes handout private equity authors...
20    deeres color greenwith big tractors , big sale...
27    kmartsears merger price , qualityaverage custo...
37    deep impact space probe aims slam comet reuter...
Name: text, dtype: object


In [30]:
check_category(all_data, 3, 5)  # entertain

1     bruce lee statue divided cityin bosnia , one m...
2     lovers left alives tilda swinton talks almost ...
10    harry argybargyprince charles asked scotland y...
25                                top short_description
28    cate blanchett set star lucille ball new biopi...
Name: text, dtype: object


In [31]:
check_category(all_data, 4, 5) # world

29    israel kills 3 palestinians big gaza incursion...
34    folly sole superpower writ small authorsthink ...
51    oil falls 49 nigeria ceasefirelondon reuters o...
56    sadr aide denies entering iraqi police najaf s...
57    former nazi guard loses canadian court ruling ...
Name: text, dtype: object


In [32]:
check_category(all_data, 5, 5) # tech

3     macromedia contributes ebay storesmacromedia a...
4     qualcomm plans phone cellular repairsovertheai...
5     thomson back bluray hddvdcompany , one core ba...
23    ftc files first lawsuit spyware concernsthe fe...
31    sony psp draws crowds lines first day reutersr...
Name: text, dtype: object


In [33]:
mapping_dict = {
    0: 3,
    1: 2,
    2: 0,
    3: 1,
    4: 5,
    5: 4
}

all_data['mapping'] = all_data['kmeans_cluster'].apply(lambda x : mapping_dict[x])

In [34]:
all_data.head(5)

Unnamed: 0,id,title,contents,text,kmeans_cluster,mapping
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race rowmadrid afp...,0,3
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided cityin bosnia , one m...",3,1
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alives tilda swinton talks almost ...,3,1
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay storesmacromedia a...,5,4
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairsovertheai...,5,4


In [35]:
sample = pd.read_csv('data/sample_submission.csv')
sample['category'] = all_data['mapping'].values
sample.head()

Unnamed: 0,id,category
0,NEWS_00000,3
1,NEWS_00001,1
2,NEWS_00002,1
3,NEWS_00003,4
4,NEWS_00004,4


In [36]:
sample.to_csv('sumbit_third.csv', index = False)