<a href="https://colab.research.google.com/github/jwengr/dacon/blob/main/%EC%86%8C%EC%84%A4%20%EC%9E%91%EA%B0%80%20%EB%B6%84%EB%A5%98%20AI%20%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EDA에서 특수문자의 사용빈도나 프랑스문자의 사용빈도가 결과 예측에
도움이 된 다는 것을 알 수 있었습니다.
일반적인 딥러닝에서의 텍스트분석은 주로 특수문자들을 없애거나 최소화하여
문장의 뜻을 맞춥니다.
하지만 우리는 주어진 텍스트에서 작가를 분류해야하며
주어진 텍스트는 서로 비슷합니다.
즉 텍스트간의 미묘한 차이를 구분하는 모델을 만들어야 하며, 특수문자나 프랑스문자에도 집중해야 할 것 입니다.

In [None]:
!pip uninstall lightgbm

In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

In [None]:
!pip install catboost

In [5]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [6]:
defaultpath = 'drive/My Drive/dacon/sosul/dataset'

In [7]:
train_df = pd.read_csv(defaultpath+'/train.csv',encoding='utf-8')

기본 전처리

In [8]:
train_df = train_df[train_df['text'].str.contains('\* \*')==False]

In [9]:
train_df['sentencelen'] = train_df['text'].apply(lambda x: len(x.split('.')))

In [10]:
train_df['charlen'] = train_df['text'].apply(lambda x: len(x))

In [11]:
train_df['c/s'] = train_df['charlen']/(train_df['sentencelen']+1)  ## 0으로 나뉘는것을 방지

In [12]:
train_df['upperlen'] = train_df['text'].apply(lambda x: len(re.findall('[A-Z]',x)))

In [13]:
train_df['u/s'] = train_df['upperlen']/(train_df['sentencelen']+1)  ## 0으로 나뉘는것을 방지

In [14]:
train_df['u/s'] = train_df['upperlen']/(train_df['charlen']+1)  ## 0으로 나뉘는것을 방지

프랑스어가 포함된 문장만 따로 추출

In [54]:
train_df_fr = train_df[train_df['text'].str.contains('[à|ä|ö|î|ù|â|Œ|ç|ê|ü|ñ|ô|Æ|œ|ë|æ|é|Ê|è|ì]')].copy()

char TF-IDF : 특수문자까지 포함하여

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [59]:
char_tfidf_train_df = train_df.copy()
char_tfidfv  = TfidfVectorizer(analyzer='char').fit(char_tfidf_train_df['text'])
enc = char_tfidfv.transform(char_tfidf_train_df['text']).toarray()
char_tfidf_train_df = pd.concat([char_tfidf_train_df.reset_index(),pd.DataFrame(enc)],axis=1).drop(['level_0','index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, x_test, y_train, y_test = train_test_split(char_tfidf_train_df.drop('author',axis=1), char_tfidf_train_df['author'],
                                                    test_size=0.2, random_state=2021,
                                                    stratify=char_tfidf_train_df['author']) 
xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=10000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=10000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=10000,early_stopping_rounds=1000,task_type="GPU",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (1908, 0.5873985593143065)
lgb : 0.5929607002826662
ctb : (3620, 0.6026260599981763)


char tfidf : 프랑스어포함문장만

In [60]:
char_tfidf_train_df = train_df_fr.copy()
char_tfidfv  = TfidfVectorizer(analyzer='char').fit(char_tfidf_train_df['text'])
enc = char_tfidfv.transform(char_tfidf_train_df['text']).toarray()
char_tfidf_train_df = pd.concat([char_tfidf_train_df.reset_index(),pd.DataFrame(enc)],axis=1).drop(['level_0','index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, x_test, y_train, y_test = train_test_split(char_tfidf_train_df.drop('author',axis=1), char_tfidf_train_df['author'],
                                                    test_size=0.2, random_state=2021,
                                                    stratify=char_tfidf_train_df['author']) 
xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=10000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=10000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=10000,early_stopping_rounds=1000,task_type="GPU",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (83, 0.8507462686567164)
lgb : 0.8656716417910447
ctb : (1064, 0.8059701492537313)
