# In [None]:
# **1. 데이터 및 라이브러리 불러오기**

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import json
import os
import tqdm

from konlpy.tag import Okt
from konlpy.tag import Mecab
# Installed on Windows, so the dictionary location is passed explicitly.
# Raw string: the path contains backslashes that must not be read as escapes.
mecab = Mecab(r'C:\mecab\mecab-ko-dic')

import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, accuracy_score,f1_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

# Read the three competition CSV files: training data, test data, and the
# sample submission that is later filled in with predictions.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'open/train.csv',
        'open/test.csv',
        'open/sample_submission.csv',
    )
)

# **2. 데이터 EDA**

train.head(2)

test.head(2)

sample_submission.head(6)

# Inspect the shape of each dataset.
print(train.shape)
print(test.shape)
print(sample_submission.shape)

# Relative frequency of each label: the classes are severely imbalanced.
train.label.value_counts(sort=False)/len(train)

# Character-length distribution of each text column: a histogram, a box plot,
# and summary statistics. Values are cast to str first, so missing entries
# (if any) are measured by their string representation.
for column, title_suffix in (
    ('과제명', 'task_name'),
    ('요약문_연구목표', 'summary_object'),
    ('요약문_연구내용', 'summary_content'),
    ('요약문_기대효과', 'summary_effect'),
):
    length = train[column].astype(str).apply(len)

    # Start a fresh figure so the histogram is not drawn on top of the
    # previous column's box plot.
    plt.figure()
    plt.hist(length, bins=50, alpha=0.5, color='r', label='word')
    plt.title(f'histogram of length of {title_suffix}')

    plt.figure(figsize=(12, 5))
    plt.boxplot(length, labels=['counts'], showmeans=True)

    print(f'{column} 길이 최댓값: {np.max(length)}')
    print(f'{column} 길이 최솟값: {np.min(length)}')
    print(f'{column} 길이 평균값: {np.mean(length)}')
    print(f'{column} 길이 중간값: {np.median(length)}')

# **3. 데이터 전처리**

# This baseline only uses the task-name column ('과제명').
# Try bringing in the other columns as features to improve performance!
train = train.loc[:, ['과제명', 'label']]
test = test.loc[:, ['과제명']]

train.head(2)

test.head(2)

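# 1. re.sub removes every character that is not Hangul (whitespace is removed too).
# 2. The selected konlpy tagger ('okt' or 'mecab') splits the text into morphemes.
# 3. When remove_stopwords is set, tokens listed in stop_words are dropped.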

# Tagger instances keyed by analyzer name, so each one is built only once
# instead of once per processed text.
_ANALYZER_CACHE = {}


def _get_analyzer(name):
    """Return the cached konlpy tagger for 'okt' or 'mecab', creating it on first use."""
    if name not in _ANALYZER_CACHE:
        if name == 'okt':
            _ANALYZER_CACHE[name] = Okt()
        else:
            # Raw string: the Windows path contains backslashes.
            _ANALYZER_CACHE[name] = Mecab(r'C:\mecab\mecab-ko-dic')
    return _ANALYZER_CACHE[name]


def preprocessing(text, analyzer, remove_stopwords=False, stop_words=None):
    """Strip non-Hangul characters from *text* and split it into morphemes.

    Args:
        text: Input string. Every character outside the Hangul syllable and
            jamo ranges (including whitespace, digits and Latin letters) is
            removed before tokenising.
        analyzer: 'okt' (Okt with stemming enabled) or 'mecab'.
        remove_stopwords: When true, tokens found in *stop_words* are dropped.
        stop_words: Iterable of tokens to drop; None means no stopwords.

    Returns:
        A list of morpheme strings.

    Raises:
        ValueError: If *analyzer* is not 'okt' or 'mecab'.
    """
    # Validate first so a typo in the analyzer name fails with a clear message
    # instead of an unbound-variable error further down.
    if analyzer not in ('okt', 'mecab'):
        raise ValueError(f"analyzer must be 'okt' or 'mecab', got {analyzer!r}")

    text = re.sub("[^가-힣ㄱ-ㅎㅏ-ㅣ]", "", text)

    tagger = _get_analyzer(analyzer)
    if analyzer == 'okt':
        word_text = tagger.morphs(text, stem=True)
    else:
        word_text = tagger.morphs(text)

    # Without stopword removal the raw token list is the result.
    if not remove_stopwords or not stop_words:
        return word_text

    excluded = set(stop_words)
    return [token for token in word_text if token not in excluded]

## Mecab

stop_words = ['은', '는', '이', '가', '하', '아', '것', '들', '의', '있', '되', '수', '보', '주', '등', '한']


def _tokenize_titles(titles):
    """Tokenise each title with Mecab and drop stopwords.

    Non-string entries (e.g. missing values) yield an empty token list.
    Tokeniser errors are not swallowed, so a broken tagger setup fails loudly
    instead of silently producing empty features, and Ctrl-C still interrupts
    the loop.
    """
    tokenized = []
    for text in tqdm.tqdm(titles):
        if isinstance(text, str):
            tokenized.append(
                preprocessing(text, 'mecab', remove_stopwords=True, stop_words=stop_words)
            )
        else:
            tokenized.append([])
    return tokenized


# This step takes a long time.
clean_train_text = _tokenize_titles(train['과제명'])
clean_test_text = _tokenize_titles(test['과제명'])

len(clean_train_text)

len(clean_test_text)

from sklearn.feature_extraction.text import CountVectorizer

# The documents are already token lists, so the tokenizer just hands its input
# back unchanged. Lowercasing must be disabled as well, otherwise an error is
# raised.
def _passthrough_tokenizer(tokens):
    return tokens


vectorizer = CountVectorizer(tokenizer=_passthrough_tokenizer, lowercase=False)

# Fit the vocabulary on the training data only; calling fit_transform on the
# test data would be data leakage.
train_features = vectorizer.fit_transform(clean_train_text)
test_features = vectorizer.transform(clean_test_text)

train_features

test_features

# **4. 모델링**

# Cast the count matrices to 32-bit floats before training.
train_features = train_features.astype(np.float32)
test_features = test_features.astype(np.float32)

from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss, accuracy_score, f1_score

# Materialise the stratified 5-fold split up front as a list of
# (train indices, validation indices) pairs.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
folds = list(skf.split(train_features, train['label']))

# Early stopping and periodic logging are passed as callbacks; the
# `early_stopping_rounds` / `verbose` keyword arguments of fit() are no longer
# accepted by current LightGBM releases.
from lightgbm import early_stopping, log_evaluation

# Train one LightGBM classifier per fold and record its macro-F1 on the
# held-out part. Models are kept in a dict keyed by fold number.
lgb_models = {}
f1 = np.zeros(len(folds))
for fold, (trn_idx, val_idx) in enumerate(folds):
    print(f'================================== Fold: {fold+1} =====================================')

    # The fold indices are positions, so select labels positionally.
    X_trn = train_features[trn_idx]
    y_trn = train['label'].iloc[trn_idx]

    X_val = train_features[val_idx]
    y_val = train['label'].iloc[val_idx]

    # NOTE(review): per the LightGBM parameter docs, `subsample` only takes
    # effect when `subsample_freq` > 0, and `is_unbalance` /
    # `boost_from_average` are documented as unused by the plain multiclass
    # objective - confirm whether these settings do what was intended.
    # Other knobs previously experimented with: num_leaves=255,
    # min_child_samples=10.
    lgb = LGBMClassifier(
        boost_from_average=False,
        is_unbalance=True,
        n_estimators=3000,
        learning_rate=0.1,
        max_depth=-1,
        colsample_bytree=0.3,
        subsample=0.5,
        random_state=42,
    )

    lgb.fit(
        X_trn, y_trn,
        eval_set=[(X_trn, y_trn), (X_val, y_val)],
        callbacks=[
            early_stopping(stopping_rounds=100),
            log_evaluation(period=100),
        ],
    )
    f1[fold] = f1_score(y_val, lgb.predict(X_val), average='macro')
    lgb_models[fold] = lgb
    print(f'Fold{fold+1} F1 score: {f1[fold]}')
    print('================================================================================\n')

f1.mean()

# Sum the class probabilities predicted by every fold model. The accumulator
# is sized from the data and the fitted model rather than hard-coded, so the
# code keeps working if the test set or the number of classes changes.
n_classes = len(lgb_models[0].classes_)
pred = np.zeros((test_features.shape[0], n_classes))
for fold in range(len(lgb_models)):
    pred += lgb_models[fold].predict_proba(test_features)

# predict_proba columns follow classes_, so map the winning column index back
# to its label (an identity mapping when the labels are 0..n_classes-1).
res = lgb_models[0].classes_[pred.argmax(axis=1)]

# **5. Prediction and submission**

sample_submission['label'] = res

sample_submission.label.value_counts()

sample_submission.to_csv('lgb_baseline_20210704.csv', index=False)