# 베이스라인 코드 (LB : 0.774)

## 필요 라이브러리 설치 및 불러오기

In [None]:
!pip install konlpy

In [None]:
import pandas as pd
import re
from konlpy.tag import Okt,Mecab
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score,f1_score
from lightgbm import LGBMClassifier


## EDA

In [None]:
# Load the training data.
train = pd.read_csv('train_data.csv')

In [None]:
# Inspect the last rows of the training data.
train.tail()

Unnamed: 0,index,title,topic_idx
45649,45649,KB금융 미국 IB 스티펠과 제휴…선진국 시장 공략,1
45650,45650,1보 서울시교육청 신종코로나 확산에 개학 연기·휴업 검토,2
45651,45651,게시판 키움증권 2020 키움 영웅전 실전투자대회,1
45652,45652,답변하는 배기동 국립중앙박물관장,2
45653,45653,2020 한국인터넷기자상 시상식 내달 1일 개최…특별상 김성후,2


In [None]:
# Count missing values per column.
train.isnull().sum()

index        0
title        0
topic_idx    0
dtype: int64

In [None]:
# Check the label distribution (number of rows per topic_idx value).
train.topic_idx.value_counts()

4    7629
2    7362
5    6933
6    6751
1    6222
3    5933
0    4824
Name: topic_idx, dtype: int64

## 데이터 전처리


In [None]:
# Instantiate the Okt morphological analyzer (imported from konlpy.tag).
okt=Okt()

In [None]:
# 조사, 어미, 구두점 제거
def func(text):
    """Return ``text`` reduced to its stemmed content morphemes.

    The string is analysed with the module-level ``okt`` tagger using
    ``stem=True``. Morphemes tagged as a particle (``Josa``), an ending
    (``Eomi``) or punctuation (``Punctuation``) are discarded, and the
    remaining ones are joined with single spaces.
    """
    dropped_tags = ['Josa', 'Eomi', 'Punctuation']
    kept = [
        morpheme
        for morpheme, tag in okt.pos(text, stem=True)
        if tag not in dropped_tags
    ]
    return " ".join(kept)

# Clean every training title in place with func.
train['title'] = train['title'].apply(func)

In [None]:
# tf-idf를 이용한 벡터화
def split(text):
    """Tokenize already-cleaned text by splitting it on whitespace."""
    return text.split()

# Learn the TF-IDF vocabulary from the cleaned training titles and vectorize
# them in a single pass; `split` is used as the tokenizer.
# NOTE(review): the vectorizer is fitted on every training row before the
# hold-out split is made, so the validation rows also contribute to the
# vocabulary and IDF weights.
tfidf_vect = TfidfVectorizer(tokenizer=split)
tfidf_matrix_train = tfidf_vect.fit_transform(train['title'])

In [None]:
# train/valid 데이터 셋 나누기.
def split_dataset(tfidf, df, *, test_size=0.3, random_state=42):
    """Split features and labels into stratified train and hold-out sets.

    Args:
        tfidf: Feature matrix; it must have the same number of rows as ``df``.
        df: DataFrame that provides the ``topic_idx`` label column.
        test_size: Share of the rows held out for evaluation. The default
            of 0.3 keeps the original 70% train / 30% evaluation split.
        random_state: Seed for the shuffle, so the split is reproducible.

    Returns:
        A tuple ``(X_train, X_test, y_train, y_test)``.
    """
    labels = df['topic_idx']

    # Stratify on the labels so both parts keep the same class proportions.
    X_train, X_test, y_train, y_test = train_test_split(
        tfidf,
        labels,
        test_size=test_size,
        random_state=random_state,
        stratify=labels,
    )

    return (X_train, X_test, y_train, y_test)

# Build the stratified 70/30 split used for training and evaluation below.
X_train, X_test, y_train, y_test = split_dataset(tfidf_matrix_train,train)

## 모델 학습 

In [None]:
# Train a LightGBM classifier on the training split; only the seed is set,
# every other hyperparameter is left at its default.
# NOTE(review): the test-set predictions further down come from this same
# model, which is never refitted on the hold-out rows.
lgbm = LGBMClassifier(random_state = 42)
lgbm.fit(X_train,y_train)

LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.1, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=31, objective=None,
               random_state=42, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

## 모델 평가


In [None]:
# Score the model on the hold-out split. `pred` is rebound later to the
# test-set predictions.
pred = lgbm.predict(X_test)
accuracy = accuracy_score(y_test,pred)

# The printed label '정확도' means "accuracy".
print('정확도', accuracy)

정확도 0.8010513251076878


## test 데이터 예측


In [None]:
# Load the competition test set.
test = pd.read_csv('test_data.csv')

In [None]:
# Apply the same cleaning used for the training titles to the test titles.
test['title'] = test['title'].apply(func)

In [None]:
# Vectorize the test titles with the vectorizer fitted on the training
# titles (transform only, no refit).
tfidf_matrix_test = tfidf_vect.transform(test['title'])

In [None]:
# Predict a label for every test row; this rebinds `pred` from the
# validation step.
pred = lgbm.predict(tfidf_matrix_test)

## 제출 파일 생성


In [None]:
# Load the submission template.
submission = pd.read_csv('sample_submission.csv')

In [None]:
# Fill the template's topic_idx column with the test predictions and
# preview the first rows.
submission['topic_idx'] = pred
submission.head()

Unnamed: 0,index,topic_idx
0,45654,2
1,45655,3
2,45656,2
3,45657,2
4,45658,3


In [None]:
# Write the submission file without the DataFrame index column.
submission.to_csv('baseline.csv',index = False)

## 제출 API 이용(선택)
---
👉 [상세 설명 링크 ](https://dacon.io/competitions/official/235747/talkboard/403590)  👈

In [1]:
!wget 'https://bit.ly/3gMPScE'

# Unpack the archive fetched by the wget call above into ./data.
# NOTE(review): assumes wget saved the download under the short-link name
# '3gMPScE' in the working directory; confirm.
import zipfile
with zipfile.ZipFile('3gMPScE', 'r') as existing_zip:
    existing_zip.extractall('data')

--2021-06-28 07:55:42--  https://bit.ly/3gMPScE
Resolving bit.ly (bit.ly)... 67.199.248.10, 67.199.248.11
Connecting to bit.ly (bit.ly)|67.199.248.10|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://drive.google.com/uc?export=download&id=1TYnY2KBU9NuR973pB2oqfFCw2dHgpKX7 [following]
--2021-06-28 07:55:42--  https://drive.google.com/uc?export=download&id=1TYnY2KBU9NuR973pB2oqfFCw2dHgpKX7
Resolving drive.google.com (drive.google.com)... 172.217.203.102, 172.217.203.113, 172.217.203.139, ...
Connecting to drive.google.com (drive.google.com)|172.217.203.102|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://doc-14-9o-docs.googleusercontent.com/docs/securesc/ha0ro937gcuc7l7deffksulhg5h7mbp1/29ao32nkuk9huijmrojrdqtj1v7cb96a/1624866900000/00770885000471893693/*/1TYnY2KBU9NuR973pB2oqfFCw2dHgpKX7?e=download [following]
--2021-06-28 07:55:42--  https://doc-14-9o-docs.googleusercontent.com/docs/secur

In [4]:
!pip install data/dacon_submit_api-0.0.4-py3-none-any.whl

Processing ./data/dacon_submit_api-0.0.4-py3-none-any.whl
Installing collected packages: dacon-submit-api
Successfully installed dacon-submit-api-0.0.4


In [None]:
# A personal token can be issued from the account-management page.
# Replace the '개인 Token' placeholder below with it before running.

from dacon_submit_api import dacon_submit_api

# Upload the predictions that were written to 'baseline.csv', not the
# unmodified 'sample_submission.csv' template. '235747' matches the
# competition id in the explanation link above.
result = dacon_submit_api.post_submission_file(
    'baseline.csv',
    '개인 Token',
    '235747')