In [2]:
import pandas as pd
import re
from konlpy.tag import Okt,Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score
from lightgbm import LGBMClassifier


## Data
### train_data.csv
- index: 헤드라인 인덱스
- title: 뉴스 헤드라인
- topic_idx: 뉴스 주제 인덱스 값

### test_data.csv
- index: test 헤드라인 인덱스
- title: test 뉴스 헤드라인

### sample_submission.csv
- index: test 헤드라인 인덱스
- topic_idx: 예측해야하는 뉴스 토픽 인덱스 값

### topic_dict.csv
- topic: 실제 뉴스 토픽
- topic_idx: 뉴스 토픽 인덱스 값

### EDA

In [3]:
# Load the training data
train = pd.read_csv('train_data.csv')

In [4]:
# Inspect the last few rows of the training data
train.tail()

Unnamed: 0,index,title,topic_idx
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2
45653,45653,2020 한국인터넷기자상 시상식 내달 1일 개최…특별상 김성후,2


In [5]:
# Count missing values in each column
train.isnull().sum()

index        0
title        0
topic_idx    0
dtype: int64

In [6]:
# Check the label (topic_idx) distribution
train.topic_idx.value_counts()

4    7629
2    7362
5    6933
6    6751
1    6222
3    5933
0    4824
Name: topic_idx, dtype: int64

### 데이터 전처리

In [7]:
# Instantiate the Okt morphological analyzer (from konlpy)
okt=Okt()

In [12]:
# Remove particles (Josa), endings (Eomi) and punctuation from a headline
def func(text, tagger=None):
    """Return `text` as space-separated stemmed tokens without function tokens.

    The text is POS-tagged with stemming enabled, and every token tagged
    'Josa', 'Eomi' or 'Punctuation' is dropped.

    Args:
        text: The headline string to normalize.
        tagger: Object exposing ``pos(text, stem=True)`` that yields
            ``(token, tag)`` pairs. Defaults to the notebook-level ``okt``
            instance, so existing ``func(text)`` calls behave as before.

    Returns:
        The kept tokens joined by single spaces (an empty string when no
        token survives the filter).
    """
    if tagger is None:
        # Fall back to the shared analyzer created earlier in the notebook.
        tagger = okt

    excluded_tags = ('Josa', 'Eomi', 'Punctuation')
    # No per-token print here: this runs once per headline over the whole
    # dataset, and printing every token floods the notebook output.
    kept = [
        token
        for token, tag in tagger.pos(text, stem=True)
        if tag not in excluded_tags
    ]
    return " ".join(kept)

In [None]:
# Normalize every training headline with func (the function is passed directly; no lambda wrapper needed)
train['title'] = train['title'].apply(func)

In [9]:
# TF-IDF vectorization
def split(text):
    """Tokenize already-normalized text by splitting on whitespace."""
    return text.split()

# Learn the vocabulary / IDF weights and vectorize the training titles in a
# single pass. fit_transform gives the same matrix as fit() followed by
# transform(), but tokenizes the corpus only once.
# NOTE(review): the vectorizer is fitted on every training row before the
# train/validation split, so IDF statistics include the validation rows and
# the validation accuracy may be slightly optimistic.
tfidf_vect = TfidfVectorizer(tokenizer=split)
tfidf_matrix_train = tfidf_vect.fit_transform(train['title'])



In [10]:
# Split the data into train / validation sets.
def split_dataset(tfidf, df, test_size=0.3, random_state=42):
    """Stratified train/validation split of the feature matrix and labels.

    Args:
        tfidf: Feature matrix whose rows align with the rows of `df`.
        df: DataFrame providing the labels in its 'topic_idx' column.
        test_size: Fraction of rows held out for validation. The default
            of 0.3 keeps 70% of the rows for training.
        random_state: Seed passed to train_test_split for a reproducible split.

    Returns:
        Tuple ``(X_train, X_valid, y_train, y_valid)``.
    """
    labels = df['topic_idx']

    # stratify=labels keeps the class proportions the same in both parts.
    X_train, X_valid, y_train, y_valid = train_test_split(
        tfidf,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    return (X_train, X_valid, y_train, y_valid)

# NOTE: X_test / y_test here are the held-out part of train_data.csv (a validation split),
# not the competition test set loaded from test_data.csv later in the notebook.
X_train, X_test, y_train, y_test = split_dataset(tfidf_matrix_train,train)

### 모델 학습

In [11]:
# Train a LightGBM classifier; only the random seed is set explicitly
lgbm = LGBMClassifier(random_state = 42)
lgbm.fit(X_train,y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74984
[LightGBM] [Info] Number of data points in the train set: 31957, number of used features: 2872
[LightGBM] [Info] Start training from score -2.247404
[LightGBM] [Info] Start training from score -1.993067
[LightGBM] [Info] Start training from score -1.824812
[LightGBM] [Info] Start training from score -2.040560
[LightGBM] [Info] Start training from score -1.789166
[LightGBM] [Info] Start training from score -1.884794
[LightGBM] [Info] Start training from score -1.911312


### 모델 평가

In [12]:
# Predict on the held-out split and compute accuracy against its labels
pred = lgbm.predict(X_test)
accuracy = accuracy_score(y_test,pred)

print('정확도', accuracy)

정확도 0.810980506680295


### test 데이터 예측

In [13]:
# Load the competition test data
test = pd.read_csv('test_data.csv')

In [14]:
# Apply the same headline normalization that was used on the training titles
test['title'] = test['title'].apply(func)

In [15]:
# Vectorize the test titles with the vectorizer fitted on the training titles (transform only, no refit)
tfidf_matrix_test = tfidf_vect.transform(test['title'])

In [16]:
# Predict topics for the test set (this overwrites the earlier validation predictions stored in `pred`)
pred = lgbm.predict(tfidf_matrix_test)

### 제출 파일 생성

In [17]:
# Load the sample submission file to use as the output template
submission = pd.read_csv('sample_submission.csv')

In [18]:
# Fill in the predicted topic index and preview the result
# NOTE(review): assumes sample_submission.csv rows are in the same order as test_data.csv — TODO confirm
submission['topic_idx'] = pred
submission.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [23]:
# Save the submission file without writing the DataFrame index
submission.to_csv('baseline.csv', index = False)

### 제출 API 이용(SSLError 때문에 실패)

In [20]:
!pip install data/dacon_submit_api-0.0.4-py3-none-any.whl

Processing c:\users\hyungjink\documents\스터디\뉴스토픽분류\data\dacon_submit_api-0.0.4-py3-none-any.whl


ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\Hyungjink\\Documents\\스터디\\뉴스토픽분류\\data\\dacon_submit_api-0.0.4-py3-none-any.whl'



In [35]:
# A personal submission token can be issued from the account-management page.
import os

from dacon_submit_api import dacon_submit_api

# Never hard-code the token in the notebook: it is a credential and leaks
# through version control or any shared copy. Read it from the environment.
# NOTE(review): the token that was previously embedded in this cell should be
# treated as compromised and re-issued.
sub_token = os.environ.get('DACON_SUB_TOKEN')
if not sub_token:
    raise RuntimeError(
        'Set the DACON_SUB_TOKEN environment variable before submitting.'
    )

# NOTE(review): the three trailing positional arguments are presumably the
# competition id, team name and submission memo — confirm against the
# dacon_submit_api documentation.
result = dacon_submit_api.post_submission_file(
    'baseline.csv',
    sub_token,
    '235747',
    'akni',
    'test_submisison')

exception occurs. 예외가 발생했습니다 HTTPSConnectionPool(host='openapi.dacon.io', port=443): Max retries exceeded with url: /submission (Caused by SSLError(SSLCertVerificationError(1, '[SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: self signed certificate in certificate chain (_ssl.c:1131)')))


In [27]:
!pip install --upgrade certifi

Collecting certifi
  Obtaining dependency information for certifi from https://files.pythonhosted.org/packages/4c/dd/2234eab22353ffc7d94e8d13177aaa050113286e93e7b40eae01fbf7c3d9/certifi-2023.7.22-py3-none-any.whl.metadata
  Downloading certifi-2023.7.22-py3-none-any.whl.metadata (2.2 kB)
Downloading certifi-2023.7.22-py3-none-any.whl (158 kB)
   ---------------------------------------- 158.3/158.3 kB 4.8 MB/s eta 0:00:00
Installing collected packages: certifi
  Attempting uninstall: certifi
    Found existing installation: certifi 2022.6.15
    Uninstalling certifi-2022.6.15:
      Successfully uninstalled certifi-2022.6.15
Successfully installed certifi-2023.7.22
