# 데이터 로드

In [1]:
import pandas as pd
import re
import numpy as np

data = pd.read_csv('data/news.csv')

In [2]:
category = {'category': [0,1,2,3,4,5], 'info':['Business','Entertainment','Politics','Sports','Tech','World']}
category_data = pd.DataFrame(category)
print(category_data)

   category           info
0         0       Business
1         1  Entertainment
2         2       Politics
3         3         Sports
4         4           Tech
5         5          World


# 전처리

In [3]:
data['text'] = data['title'] + ':' + data['contents']
data.head()

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,Spanish coach facing action in race row:MADRID...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","Bruce Lee statue for divided city:In Bosnia, w..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,Only Lovers Left Alive's Tilda Swinton Talks A...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,Macromedia contributes to eBay Stores:Macromed...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,Qualcomm plans to phone it in on cellular repa...


In [4]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('stopwords')

def preprocess_text(text):
    
    # text 소문자화
    text = text.lower()
    
    # URL 제거
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # 해시태그 제거
    text = re.sub(r'#\w+', '', text)
    
    # 멘션 제거
    text = re.sub(r'@\w+', '', text)
    
    # 이모지 제거
    text = text.encode('ascii', 'ignore').decode('ascii')
    
    # 날짜 등 불필요한 정보가 포함된 문자열 패턴 삭제
    text = re.sub(r'\b(January|February|March|April|May|June|July|August|September|October|November|December)\b \d{1,2}, \d{4}', '', text)
    
    # (문장 구조 정보 유지를 위해 일부 구두점은 유지하되, 그 외 불필요한 문자는 삭제)
    text = re.sub(r'\s+', ' ', text).strip()
    text = re.sub(r'\d+', '', text)

    # Stopwords removal - NLTK library 사용 
    stop_words = set(stopwords.words('english'))
    word_tokens = word_tokenize(text) 
    filtered_text = [word for word in word_tokens if not word in stop_words]

    return ' '.join(filtered_text).lower()

data['text'] = data['text'].apply(preprocess_text)
data.head()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\이동현\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\이동현\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race row : madrid ...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided city : bosnia , one m..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alive 's tilda swinton talks almos...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay stores : macromedi...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairs : over-t...


# Baseline

In [16]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

# Sentence BERT 모델 로드
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# 텍스트 feature 추출
sentence_embeddings = model.encode(data['text'].tolist())

# 추출한 feature를 데이터프레임에 저장
df_embeddings = pd.DataFrame(sentence_embeddings)
df_embeddings

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.029478,0.092538,-0.063231,0.072295,0.028426,0.040515,0.028217,-0.017193,0.099561,0.012579,...,0.003577,0.017773,0.029132,-0.052808,-0.049190,0.015119,0.090982,0.009287,0.009245,-0.075322
1,0.010228,0.142878,-0.096870,0.036062,-0.002024,0.000765,-0.011156,-0.039506,-0.017979,0.021546,...,-0.058437,-0.015828,0.093026,0.030538,0.031982,0.054138,0.091621,-0.051607,-0.084787,0.022161
2,-0.036558,-0.057309,-0.059774,0.034143,-0.032851,0.057294,0.034626,-0.146016,0.061155,-0.067244,...,-0.001441,0.023937,-0.027462,0.060199,-0.005650,0.044188,0.053594,-0.005892,-0.046237,0.001704
3,-0.023528,0.012770,0.005389,-0.055611,0.102187,0.057971,-0.018113,0.065631,-0.078232,0.051220,...,-0.028415,0.000867,0.008708,-0.038017,0.096910,0.002110,0.069338,-0.071638,0.016557,0.047401
4,-0.085844,0.077641,0.079451,-0.042281,-0.141478,-0.019684,0.016619,0.015946,-0.026962,0.013273,...,-0.016531,-0.077603,0.072056,-0.020309,0.032724,0.056959,-0.012466,-0.028569,0.053045,0.048078
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,-0.043896,-0.083914,0.011465,-0.031866,0.042931,-0.030938,-0.003359,0.072316,0.022651,-0.032390,...,0.033196,-0.077506,-0.092380,-0.046390,-0.097507,0.005407,-0.026381,0.093459,0.019328,0.058970
59996,-0.078510,-0.031241,0.146126,0.033807,0.072683,-0.012684,-0.111765,0.056582,0.008110,-0.050332,...,0.064270,-0.020733,-0.017882,0.051984,-0.028565,-0.035596,-0.027935,-0.065030,-0.019693,0.085921
59997,-0.016317,-0.016149,-0.045557,-0.041666,0.012761,-0.012950,0.095677,0.049722,-0.039097,-0.023168,...,0.076929,0.012516,0.012921,-0.005027,-0.149752,0.030708,0.050657,0.088995,0.061050,0.066745
59998,-0.065774,-0.086164,-0.039205,0.038142,-0.028322,0.023129,-0.018781,0.063158,0.075045,-0.072849,...,-0.072066,0.007759,-0.101527,0.011502,0.005242,0.007203,0.071636,-0.068192,-0.072827,0.024034


In [None]:
# Sentence BERT 임베딩을 사용하여 군집화 수행
kmeans = KMeans(n_clusters=6, random_state=42)

data['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)
data.head()

In [None]:
category_data

In [None]:
data[data['kmeans_cluster'] == 0]['text'].head(5) # world

In [None]:
print(data['text'][1])
print(data['text'][10])
print(data['text'][29])
print(data['text'][34])
print(data['text'][37])

In [None]:
data[data['kmeans_cluster'] == 1]['text'].head(5) # sport

In [None]:
print(data['text'][0])
print(data['text'][13])
print(data['text'][21])
print(data['text'][22])
print(data['text'][24])

In [None]:
data[data['kmeans_cluster'] == 2]['text'].head(5) # Tech

In [None]:
print(data['text'][3])
print(data['text'][4])
print(data['text'][5])
print(data['text'][23])
print(data['text'][31])

In [None]:
data[data['kmeans_cluster'] == 3]['text'].head(5) # entertainment

In [None]:
print(data['text'][2])
print(data['text'][6])
print(data['text'][8])
print(data['text'][9])
print(data['text'][11])

In [None]:
data[data['kmeans_cluster'] == 4]['text'].head(5) # Business

In [None]:
print(data['text'][7])
print(data['text'][19])
print(data['text'][20])
print(data['text'][27])
print(data['text'][51])

In [None]:
data[data['kmeans_cluster'] == 5]['text'].head(5) # politics

In [None]:
print(data['text'][18])
print(data['text'][25])
print(data['text'][33])

In [None]:
data['kmeans_cluster'].value_counts()

In [None]:
mapping_dict = {
    0: 5,
    1: 3,
    2: 4,
    3: 1,
    4: 0,
    5: 2
}

data['mapping'] = data['kmeans_cluster'].apply(lambda x : mapping_dict[x])

In [None]:
sample = pd.read_csv('data/sample_submission.csv')
sample['category'] = data['mapping'].values
sample

In [None]:
sample.to_csv('submit_first.csv', index = False)

# all-mpnet-base-v2

In [291]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

def call_model(df, model_name):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentenceTransformer(model_name).to(device)
    
    bert_embeddings = model.encode(df['text'].tolist())
    sentence_embeddings = pd.DataFrame(bert_embeddings)
    kmeans = KMeans(n_clusters=6, random_state=42)
    df['kmeans_cluster'] = kmeans.fit_predict(sentence_embeddings)
    return df


In [25]:
all_data = call_model(data, "sentence-transformers/all-mpnet-base-v2")
print(all_data.head())

  super()._check_params_vs_input(X, default_n_init=10)


           id                                              title  \
0  NEWS_00000            Spanish coach facing action in race row   
1  NEWS_00001                  Bruce Lee statue for divided city   
2  NEWS_00002  Only Lovers Left Alive's Tilda Swinton Talks A...   
3  NEWS_00003              Macromedia contributes to eBay Stores   
4  NEWS_00004  Qualcomm plans to phone it in on cellular repairs   

                                            contents  \
0  MADRID (AFP) - Spanish national team coach Lui...   
1  In Bosnia, where one man #39;s hero is often a...   
2  Yasmine Hamdan performs 'Hal' which she also s...   
3  Macromedia has announced a special version of ...   
4  Over-the-air fixes for cell phones comes to Qu...   

                                                text  kmeans_cluster  
0  spanish coach facing action race rowmadrid afp...               0  
1  bruce lee statue divided cityin bosnia , one m...               3  
2  lovers left alives tilda swinton talks

In [190]:
def check_category(data, val, count):
    return print(data[data['kmeans_cluster']==val]['text'].head(count))

In [27]:
check_category(all_data, 0, 5) # sports

0     spanish coach facing action race rowmadrid afp...
6     time talk baseballits time talk serious risks ...
13    game day preview game time 600 pmcharlotte , n...
16    fischers fiancee marriage plans genuine apap f...
21    blake leeper wants first american paralympian ...
Name: text, dtype: object


In [28]:
check_category(all_data, 1, 5) # politics

8     obama marks anniversary 911 attacks moment sil...
9     republican congressman says trump apologize ob...
11    kerry rolls taxcut plan middle classafter two ...
12    read live updates south carolina democratic pr...
14    obama administration helps wall street crimina...
Name: text, dtype: object


In [29]:
check_category(all_data, 2, 5) # Business

7     bump stock maker resumes sales one month las v...
19    congress spikes handout private equity authors...
20    deeres color greenwith big tractors , big sale...
27    kmartsears merger price , qualityaverage custo...
37    deep impact space probe aims slam comet reuter...
Name: text, dtype: object


In [30]:
check_category(all_data, 3, 5)  # entertain

1     bruce lee statue divided cityin bosnia , one m...
2     lovers left alives tilda swinton talks almost ...
10    harry argybargyprince charles asked scotland y...
25                                top short_description
28    cate blanchett set star lucille ball new biopi...
Name: text, dtype: object


In [31]:
check_category(all_data, 4, 5) # world

29    israel kills 3 palestinians big gaza incursion...
34    folly sole superpower writ small authorsthink ...
51    oil falls 49 nigeria ceasefirelondon reuters o...
56    sadr aide denies entering iraqi police najaf s...
57    former nazi guard loses canadian court ruling ...
Name: text, dtype: object


In [32]:
check_category(all_data, 5, 5) # tech

3     macromedia contributes ebay storesmacromedia a...
4     qualcomm plans phone cellular repairsovertheai...
5     thomson back bluray hddvdcompany , one core ba...
23    ftc files first lawsuit spyware concernsthe fe...
31    sony psp draws crowds lines first day reutersr...
Name: text, dtype: object


In [33]:
mapping_dict = {
    0: 3,
    1: 2,
    2: 0,
    3: 1,
    4: 5,
    5: 4
}

all_data['mapping'] = all_data['kmeans_cluster'].apply(lambda x : mapping_dict[x])

In [34]:
all_data.head(5)

Unnamed: 0,id,title,contents,text,kmeans_cluster,mapping
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race rowmadrid afp...,0,3
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided cityin bosnia , one m...",3,1
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alives tilda swinton talks almost ...,3,1
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay storesmacromedia a...,5,4
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairsovertheai...,5,4


In [35]:
sample = pd.read_csv('data/sample_submission.csv')
sample['category'] = all_data['mapping'].values
sample.head()

Unnamed: 0,id,category
0,NEWS_00000,3
1,NEWS_00001,1
2,NEWS_00002,1
3,NEWS_00003,4
4,NEWS_00004,4


In [36]:
sample.to_csv('sumbit_third.csv', index = False)

# LDA + Sentence_embedding

In [229]:
data

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race row : madrid ...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided city : bosnia , one m..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alive 's tilda swinton talks almos...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay stores : macromedi...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairs : over-t...
...,...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...,"dolphins break , rip rams first win : ; ok. - ..."
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...,"steep drop , price oil rises : freefall oil pr..."
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...,pro football : culpepper puts show : say daunt...
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...,albertsons rebound : . grocer reports double-d...


In [230]:
from gensim import corpora, models
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import numpy as np

texts = data['text'].tolist()
tokenized_texts = [word_tokenize(text.lower()) for text in texts]
dictionary = corpora.Dictionary(tokenized_texts)
corpus = [dictionary.doc2bow(text) for text in tokenized_texts]
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 2),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 2),
  (32, 1),
  (33, 1)],
 [(2, 1),
  (3, 1),
  (4, 1),
  (5, 2),
  (12, 1),
  (34, 3),
  (35, 1),
  (36, 1),
  (37, 2),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 2),
  (48, 1),
  (49, 2),
  (50, 1),
  (51, 1),
  (52, 2),
  (53, 1),
  (54, 1),
  (55, 1)],
 [(0, 1),
  (1, 1),
  (3, 1),
  (4, 1),
  (56, 2),
  (57, 2),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 1),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 2),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 2),
  (80, 1),
  (81, 1),
  (82, 1),
  (8

In [231]:
lda_model = models.LdaModel(corpus, num_topics=6, id2word=dictionary)

In [232]:
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))

Topic: 0 
Words: 0.041*"." + 0.035*":" + 0.033*"," + 0.011*"new" + 0.010*"microsoft" + 0.010*"(" + 0.010*")" + 0.006*"software" + 0.006*"internet" + 0.006*"search"
Topic: 1 
Words: 0.040*"," + 0.036*":" + 0.034*"." + 0.031*"-" + 0.021*"(" + 0.021*")" + 0.013*"reuters" + 0.009*";" + 0.008*"oil" + 0.008*"ap"
Topic: 2 
Words: 0.028*"." + 0.027*":" + 0.020*"," + 0.010*"said" + 0.008*"minister" + 0.008*"president" + 0.008*"government" + 0.008*"'s" + 0.007*"court" + 0.007*";"
Topic: 3 
Words: 0.064*";" + 0.049*"," + 0.036*"." + 0.028*"&" + 0.024*":" + 0.024*"``" + 0.013*"lt" + 0.013*"gt" + 0.013*"$" + 0.011*"\"
Topic: 4 
Words: 0.066*":" + 0.054*"." + 0.049*"authors" + 0.028*"'s" + 0.016*"'" + 0.012*"trump" + 0.011*"short_description" + 0.011*"//" + 0.009*"n't" + 0.008*"?"
Topic: 5 
Words: 0.064*"," + 0.053*"." + 0.033*":" + 0.012*"-" + 0.009*";" + 0.008*"(" + 0.008*"'s" + 0.008*")" + 0.007*"new" + 0.007*"``"


In [233]:
most_probable_topic_labels = []
for doc_bow in corpus:
    topic_dist_for_doc = lda_model[doc_bow]
    most_probable_topic_label_for_doc = max(topic_dist_for_doc, key=lambda x: x[1])[0]
    most_probable_topic_labels.append(most_probable_topic_label_for_doc)

In [264]:
most_probable_topic_labels

min_val = min(most_probable_topic_labels)
max_val = max(most_probable_topic_labels)

In [267]:
normalized_data = [(x - min_val) / (max_val - min_val)/20 for x in most_probable_topic_labels]
normalized_data

[0.01,
 0.05,
 0.04,
 0.0,
 0.05,
 0.05,
 0.04,
 0.04,
 0.04,
 0.04,
 0.05,
 0.05,
 0.04,
 0.05,
 0.03,
 0.04,
 0.01,
 0.04,
 0.04,
 0.03,
 0.05,
 0.04,
 0.04,
 0.02,
 0.04,
 0.04,
 0.01,
 0.03,
 0.04,
 0.01,
 0.05,
 0.05,
 0.04,
 0.04,
 0.01,
 0.0,
 0.04,
 0.05,
 0.01,
 0.01,
 0.05,
 0.03,
 0.05,
 0.05,
 0.0,
 0.04,
 0.05,
 0.04,
 0.05,
 0.03,
 0.02,
 0.01,
 0.04,
 0.04,
 0.04,
 0.04,
 0.02,
 0.01,
 0.04,
 0.02,
 0.02,
 0.05,
 0.05,
 0.01,
 0.04,
 0.05,
 0.05,
 0.04,
 0.04,
 0.01,
 0.03,
 0.05,
 0.05,
 0.05,
 0.0,
 0.01,
 0.04,
 0.01,
 0.01,
 0.03,
 0.04,
 0.04,
 0.01,
 0.0,
 0.05,
 0.01,
 0.02,
 0.01,
 0.05,
 0.02,
 0.03,
 0.04,
 0.03,
 0.03,
 0.0,
 0.01,
 0.01,
 0.05,
 0.01,
 0.03,
 0.03,
 0.04,
 0.02,
 0.03,
 0.02,
 0.01,
 0.04,
 0.01,
 0.04,
 0.05,
 0.03,
 0.01,
 0.05,
 0.04,
 0.05,
 0.05,
 0.01,
 0.05,
 0.04,
 0.0,
 0.05,
 0.03,
 0.05,
 0.05,
 0.03,
 0.04,
 0.04,
 0.05,
 0.03,
 0.05,
 0.03,
 0.03,
 0.04,
 0.02,
 0.05,
 0.04,
 0.0,
 0.02,
 0.04,
 0.05,
 0.01,
 0.02,
 0.04,
 0.05,


In [268]:
most_probable_topic_labels_matrix = np.array(normalized_data).reshape(-1,1)

In [269]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = SentenceTransformer('all-MPNet-base-v2').to(device)
new_embeddings = model.encode(texts)
combined_embeddings = np.concatenate((new_embeddings , most_probable_topic_labels_matrix), axis=1)
combined_embeddings

array([[ 0.04544764,  0.00862091,  0.00897548, ..., -0.00701686,
        -0.02368966,  0.01      ],
       [ 0.0461752 ,  0.02777295, -0.01170711, ..., -0.00168982,
        -0.01523203,  0.05      ],
       [ 0.03158173, -0.04708987, -0.0128956 , ...,  0.06505194,
        -0.0079075 ,  0.04      ],
       ...,
       [-0.05354868,  0.0089177 ,  0.00150725, ...,  0.0483925 ,
        -0.03503702,  0.05      ],
       [-0.01351102,  0.0596181 , -0.00850887, ...,  0.02663661,
         0.00673103,  0.03      ],
       [ 0.01435386, -0.04575676, -0.02351659, ...,  0.04984248,
        -0.0051943 ,  0.05      ]])

In [270]:
pd.DataFrame(combined_embeddings)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,759,760,761,762,763,764,765,766,767,768
0,0.045448,0.008621,0.008975,0.039054,0.034634,0.020690,-0.027890,0.030100,-0.000780,-0.025788,...,-0.042200,-0.056218,0.046314,-0.013986,-0.034202,-0.050673,0.029158,-0.007017,-0.023690,0.01
1,0.046175,0.027773,-0.011707,0.065199,-0.016747,-0.000380,0.087836,-0.001527,0.048438,0.042053,...,-0.003931,0.003859,0.006886,-0.019959,-0.120030,0.008583,-0.013061,-0.001690,-0.015232,0.05
2,0.031582,-0.047090,-0.012896,0.007502,0.081088,0.042773,-0.026282,-0.019556,-0.062092,-0.003712,...,0.028198,0.047150,-0.020053,-0.008891,0.013817,0.038671,-0.009260,0.065052,-0.007907,0.04
3,0.051127,0.001993,-0.022106,-0.050080,0.028206,0.026564,0.058428,0.010197,-0.086738,-0.029771,...,0.021761,0.018043,0.026964,-0.040255,0.014643,-0.053347,-0.025159,-0.045340,-0.055678,0.00
4,-0.016526,0.041189,-0.039834,-0.047084,0.058258,0.017980,0.011497,-0.016223,-0.020592,0.021089,...,0.019420,0.060711,-0.003294,0.000740,0.022589,0.014541,-0.004746,-0.027510,-0.042008,0.05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59995,-0.073205,0.039385,-0.022117,-0.066947,0.019598,0.021803,0.013660,0.008548,-0.034330,-0.048934,...,0.001265,0.048010,0.004277,-0.039465,0.042730,-0.015914,0.022139,0.048495,-0.007493,0.05
59996,-0.040921,0.013346,0.011677,-0.000032,-0.017178,-0.035610,-0.044101,0.004061,-0.001040,-0.011732,...,0.025928,0.077180,0.037115,0.027061,-0.015913,0.017919,-0.047760,0.024044,-0.026003,0.01
59997,-0.053549,0.008918,0.001507,-0.018564,-0.047871,0.014830,-0.026932,0.064644,-0.036874,-0.050536,...,0.014480,-0.015786,-0.039254,-0.033040,0.090242,-0.016136,0.040464,0.048393,-0.035037,0.05
59998,-0.013511,0.059618,-0.008509,0.022231,0.037585,-0.027468,-0.047070,-0.019100,0.055828,-0.016444,...,0.000326,0.073062,0.045432,0.004879,-0.116536,0.016959,0.002189,0.026637,0.006731,0.03


In [271]:
kmeans = KMeans(n_clusters=6)
data['kmeans_cluster'] = kmeans.fit_predict(combined_embeddings)
data.head()

  super()._check_params_vs_input(X, default_n_init=10)


Unnamed: 0,id,title,contents,text,kmeans_cluster,mapping
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race row : madrid ...,0,5
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided city : bosnia , one m...",5,3
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alive 's tilda swinton talks almos...,2,1
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay stores : macromedi...,1,4
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairs : over-t...,1,3


In [250]:
def check_category(data, val, count):
    return print(data[data['kmeans_cluster']==val]['text'].head(count))

In [262]:
category_data

Unnamed: 0,category,info
0,0,Business
1,1,Entertainment
2,2,Politics
3,3,Sports
4,4,Tech
5,5,World


In [273]:
check_category(data,0,10) # sports

0     spanish coach facing action race row : madrid ...
6     time talk baseball : 's time talk serious risk...
13    game day preview game time : : pm : charlotte ...
16    fischer 's fiancee : marriage plans genuine ( ...
21    blake leeper wants first american paralympian ...
22    college basketball : georgia tech , uconn win ...
24    kentucky fan gets national champs tattoo . let...
26    doping case flawed , report finds : montreal s...
30    montgomerie beats woods s. korean skins ( ap )...
38    longhorns rip cowboys : cedric benson scores f...
Name: text, dtype: object


In [275]:
check_category(data,1,10) # tech

3     macromedia contributes ebay stores : macromedi...
4     qualcomm plans phone cellular repairs : over-t...
5     thomson back blu-ray hd-dvd : company , one co...
23    ftc files first lawsuit spyware concerns : fed...
31    sony psp draws crowds lines first day ( reuter...
41    photos macexpo : exhibitors ; including apple ...
50    un predicts boom robot labor : use robots arou...
52    amazon 's next kindle may better battery life ...
73    delphi , xm unveil handheld satellite radio re...
79    suit cities says microsoft overcharged : icros...
Name: text, dtype: object


In [276]:
check_category(data,2,10) # entertainment

2     lovers left alive 's tilda swinton talks almos...
10    harry ; argy-bargy : prince charles asked scot...
25                           top : // short_description
28    cate blanchett set star lucille ball new biopi...
40    v-i-c-t-o-r-y , missing tiles : missing key pi...
45    trouble broadcasting social world authors : to...
62    john waters ' women film society lincoln cente...
64    jon voight 'concerned ' daughter angelina joli...
68                satire save us : // short_description
76                         watch : // short_description
Name: text, dtype: object


In [283]:
print(data['text'][10])
print(data['text'][28])
print(data['text'][40])
print(data['text'][62])

harry ; argy-bargy : prince charles asked scotland yard in-depth report son harry ; trip argentina reports excessive drinking kidnap plot .
cate blanchett set star lucille ball new biopic authors : love news almost much love lucy .
v-i-c-t-o-r-y , missing tiles : missing key piece favorite board game ? web 's abundance board game sites might help .
john waters ' women film society lincoln center authors : one missed irony john waters retrospective walter reade theater across lincoln center 's plaza fashion week .


In [277]:
check_category(data,3,10) # politics

8     obama marks anniversary / attacks moment silen...
9     republican congressman says trump apologize ob...
11    kerry rolls tax-cut plan middle class : two we...
12    read live updates south carolina democratic pr...
14    obama administration helps wall street crimina...
15    's easy think spot gerrymandered map authors :...
17    parents school shooting victims decry 'moronic...
18    fair way choose candidates republican debate :...
32    sunday show hosts hit back trump administratio...
33         memo epa chief pruitt : // short_description
Name: text, dtype: object


In [279]:
check_category(data,4,10) # business

7      bump stock maker resumes sales one month las v...
19     congress spikes handout private equity authors...
20     deere 's color green : big tractors , big sale...
27     kmart-sears merger price , quality : average c...
51     oil falls \ $ nigeria cease-fire : london ( re...
70     abn amro profit rises , buoyed sale asia stake...
85     stocks open higher growth outlook : new york -...
98     producer prices . pct , energy drops ( reuters...
100    gm , daimlerchrysler develop hybrid engines : ...
102    lot managers want raise minimum wage authors :...
Name: text, dtype: object


In [280]:
print(data['text'][19])
print(data['text'][20])
print(data['text'][51])
print(data['text'][100])

congress spikes handout private equity authors : wall street firms almost big .
deere 's color green : big tractors , big sales , big earnings , deere 's hoeing profitable row .
oil falls \ $ nigeria cease-fire : london ( reuters ) - oil prices dropped record highs \ $ barrel wednesday u.s. government reported surprise increase crude stocks rebels nigeria 's oil-rich delta region agreed cease-fire .
gm , daimlerchrysler develop hybrid engines : general motors corp. daimlerchrysler ag teaming develop fuel-saving hybrid engines hopes cashing expanding market already dominated hybrid leaders toyota motor corp. honda motor co .


In [281]:
check_category(data,5,10) #world

1     bruce lee statue divided city : bosnia , one m...
29    israel kills palestinians big gaza incursion (...
34    folly sole superpower writ small authors : thi...
37    deep impact space probe aims slam comet ( reut...
49    bribery considered , halliburton notes suggest...
56    sadr ; aide denies entering iraqi police najaf...
57    former nazi guard loses canadian court ruling ...
59    afghanistan death toll : kandahar , afghanista...
60    portugal pm , cabinet submit resignations : li...
61    typhoon-like gusts hit japan ; injured : tokyo...
Name: text, dtype: object


In [282]:
print(data['text'][29])
print(data['text'][34])
print(data['text'][37])
print(data['text'][56])

israel kills palestinians big gaza incursion ( reuters ) : reuters - israeli forces killed three\palestinians , including two teenagers , wednesday after\storming northern gaza strip third time as\many months quell palestinian rocket fire israel .
folly sole superpower writ small authors : think little imperial folly -- 's backstory . years invading iraq disbanding saddam hussein 's military u.s. sunk $ billion standing new iraqi army .
deep impact space probe aims slam comet ( reuters ) : reuters - astronomers plan slam an\armchair-sized `` impactor '' comet tempel see what's\inside -- possibly help future scientists determine to\keep space rocks colliding earth .
sadr ; aide denies entering iraqi police najaf shrine : baghdad , aug. ( xinhuanet ) -- top aide shiite cleric moqtada al-sadr denied government announcement iraqi police entered imam ali shrine najaf friday , al-jazeera tv channel reported .


In [284]:
def submit_data(data, submit_name):
    mapping_dict = {
    0: 3,
    1: 4,
    2: 1,
    3: 2,
    4: 0,
    5: 5
}
    data['mapping'] = data['kmeans_cluster'].apply(lambda x : mapping_dict[x])
    sample = pd.read_csv('data/sample_submission.csv')
    sample['category'] = data['mapping'].values
    return sample.to_csv(submit_name, index = False)

submit_data(data, 'submit_sixth.csv')

# change cluster

In [6]:
data

Unnamed: 0,id,title,contents,text
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race row : madrid ...
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided city : bosnia , one m..."
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alive 's tilda swinton talks almos...
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay stores : macromedi...
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairs : over-t...
...,...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...,"dolphins break , rip rams first win : ; ok. - ..."
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...,"steep drop , price oil rises : freefall oil pr..."
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...,pro football : culpepper puts show : say daunt...
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...,albertsons rebound : . grocer reports double-d...


In [6]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from nltk.cluster import KMeansClusterer, cosine_distance

def call_model(df, model_name):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentenceTransformer(model_name).to(device)
    
    embeddings = model.encode(df['text'].tolist())
    kclusterer = KMeansClusterer(6, distance=cosine_distance, repeats=10)
    clusters = kclusterer.cluster(embeddings, assign_clusters=True)
    
    df['kmeans_cluster'] = clusters
    return df

all_data = call_model(data, "sentence-transformers/all-mpnet-base-v2")
print(all_data.head())

           id                                              title  \
0  NEWS_00000            Spanish coach facing action in race row   
1  NEWS_00001                  Bruce Lee statue for divided city   
2  NEWS_00002  Only Lovers Left Alive's Tilda Swinton Talks A...   
3  NEWS_00003              Macromedia contributes to eBay Stores   
4  NEWS_00004  Qualcomm plans to phone it in on cellular repairs   

                                            contents  \
0  MADRID (AFP) - Spanish national team coach Lui...   
1  In Bosnia, where one man #39;s hero is often a...   
2  Yasmine Hamdan performs 'Hal' which she also s...   
3  Macromedia has announced a special version of ...   
4  Over-the-air fixes for cell phones comes to Qu...   

                                                text  kmeans_cluster  
0  spanish coach facing action race row : madrid ...               0  
1  bruce lee statue divided city : bosnia , one m...               1  
2  lovers left alive 's tilda swinton tal

In [7]:
all_data

Unnamed: 0,id,title,contents,text,kmeans_cluster
0,NEWS_00000,Spanish coach facing action in race row,MADRID (AFP) - Spanish national team coach Lui...,spanish coach facing action race row : madrid ...,0
1,NEWS_00001,Bruce Lee statue for divided city,"In Bosnia, where one man #39;s hero is often a...","bruce lee statue divided city : bosnia , one m...",1
2,NEWS_00002,Only Lovers Left Alive's Tilda Swinton Talks A...,Yasmine Hamdan performs 'Hal' which she also s...,lovers left alive 's tilda swinton talks almos...,3
3,NEWS_00003,Macromedia contributes to eBay Stores,Macromedia has announced a special version of ...,macromedia contributes ebay stores : macromedi...,4
4,NEWS_00004,Qualcomm plans to phone it in on cellular repairs,Over-the-air fixes for cell phones comes to Qu...,qualcomm plans phone cellular repairs : over-t...,4
...,...,...,...,...,...
59995,NEWS_59995,"Dolphins Break Through, Rip Rams For First Win",But that #39;s OK. Because after a 31-14 rout ...,"dolphins break , rip rams first win : ; ok. - ...",0
59996,NEWS_59996,"After Steep Drop, Price of Oil Rises",The freefall in oil prices ended Monday on a s...,"steep drop , price oil rises : freefall oil pr...",2
59997,NEWS_59997,Pro football: Culpepper puts on a show,To say Daunte Culpepper was a little frustrate...,pro football : culpepper puts show : say daunt...,0
59998,NEWS_59998,Albertsons on the Rebound,The No. 2 grocer reports double-digit gains in...,albertsons rebound : . grocer reports double-d...,2


In [9]:
# Get the unique cluster values
clusters = all_data['kmeans_cluster'].unique()

# Print the top 'count' texts for each cluster
for cluster in clusters:
    print(f"Cluster {cluster}:")
    check_category(all_data, cluster, 5)
    print("\n")  # Add a newline for readability


Cluster 0:
spanish coach facing action race row : madrid ( afp ) - spanish national team coach luis aragones faces formal investigation spain ; football federation decided open disciplinary proceedings racist comments thierry henry france arsenal .
time talk baseball : 's time talk serious risks potential benefits building expensive ballpark washington .
game day preview game time : : pm : charlotte , north carolina ( ticker ) -- detroit shock face critical road test saturday take charlotte sting charlotte coliseum .
fischer 's fiancee : marriage plans genuine ( ap ) : ap - former chess champion bobby fischer 's announcement thathe engaged japanese woman could win sympathy among japanese officials help avoid deportation united states , fiancee one supporters said tuesday .
blake leeper wants first american paralympian olympics authors : blake leeper may training rio olympic games found time stop samsung smart lounge talk goals technology help people like compete high level .


Cluster 

In [10]:
category_data

Unnamed: 0,category,info
0,0,Business
1,1,Entertainment
2,2,Politics
3,3,Sports
4,4,Tech
5,5,World


In [11]:
def submit_data(data, submit_name):
    mapping_dict = {
    0: 3,
    1: 5,
    2: 0,
    3: 1,
    4: 4,
    5: 2
}
    data['mapping'] = data['kmeans_cluster'].apply(lambda x : mapping_dict[x])
    sample = pd.read_csv('data/sample_submission.csv')
    sample['category'] = data['mapping'].values
    return sample.to_csv(submit_name, index = False)

submit_data(data, 'submit_final.csv')

# 대회 이후, 상위권 코드 review

In [None]:
import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer
from nltk.cluster import KMeansClusterer, cosine_distance

def call_model(df, model_name):
    
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = SentenceTransformer(model_name).to(device)
    
    embeddings = model.encode(df['text'].tolist())
    kclusterer = KMeansClusterer(6, distance=cosine_distance, repeats=10)
    clusters = kclusterer.cluster(embeddings, assign_clusters=True)
    
    df['kmeans_cluster'] = clusters
    return df

all_data = call_model(data, "thenlper/gte-large")
print(all_data.head())

In [11]:
# Get the unique cluster values
clusters = all_data['kmeans_cluster'].unique()

def check_category(data, val, count):
    return print(data[data['kmeans_cluster']==val]['text'].head(count))

# Print the top 'count' texts for each cluster
for cluster in clusters:
    print(f"Cluster {cluster}:")
    check_category(all_data, cluster, 5)
    print("\n")  # Add a newline for readability


Cluster 2:
0     spanish coach facing action race row : madrid ...
6     time talk baseball : 's time talk serious risk...
13    game day preview game time : : pm : charlotte ...
16    fischer 's fiancee : marriage plans genuine ( ...
21    blake leeper wants first american paralympian ...
Name: text, dtype: object


Cluster 4:
1     bruce lee statue divided city : bosnia , one m...
29    israel kills palestinians big gaza incursion (...
34    folly sole superpower writ small authors : thi...
56    sadr ; aide denies entering iraqi police najaf...
57    former nazi guard loses canadian court ruling ...
Name: text, dtype: object


Cluster 0:
2     lovers left alive 's tilda swinton talks almos...
10    harry ; argy-bargy : prince charles asked scot...
28    cate blanchett set star lucille ball new biopi...
37    deep impact space probe aims slam comet ( reut...
40    v-i-c-t-o-r-y , missing tiles : missing key pi...
Name: text, dtype: object


Cluster 1:
3     macromedia contributes eba

In [12]:
category_data

Unnamed: 0,category,info
0,0,Business
1,1,Entertainment
2,2,Politics
3,3,Sports
4,4,Tech
5,5,World


In [13]:
def submit_data(data, submit_name):
    mapping_dict = {
    0: 1,
    1: 4,
    2: 3,
    3: 0,
    4: 5,
    5: 2
}
    data['mapping'] = data['kmeans_cluster'].apply(lambda x : mapping_dict[x])
    sample = pd.read_csv('data/sample_submission.csv')
    sample['category'] = data['mapping'].values
    return sample.to_csv(submit_name, index = False)

submit_data(data, 'submit_after.csv')