<a href="https://colab.research.google.com/github/jwengr/dacon/blob/main/%EC%86%8C%EC%84%A4%20%EC%9E%91%EA%B0%80%20%EB%B6%84%EB%A5%98%20AI%20%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

EDA에서 특수문자의 사용빈도나 프랑스문자의 사용빈도가 결과 예측에
도움이 된 다는 것을 알 수 있었습니다.
일반적인 딥러닝에서의 텍스트분석은 주로 특수문자들을 없애거나 최소화하여
문장의 뜻을 맞춥니다.
하지만 우리는 주어진 텍스트에서 작가를 분류해야하며
주어진 텍스트는 서로 비슷합니다.
즉 텍스트간의 미묘한 차이를 구분하는 모델을 만들어야 하며, 특수문자나 프랑스문자에도 집중해야 할 것 입니다.

In [1]:
!pip uninstall -y lightgbm

Uninstalling lightgbm-2.2.3:
  Successfully uninstalled lightgbm-2.2.3


In [None]:
!git clone --recursive https://github.com/Microsoft/LightGBM

In [None]:
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

In [None]:
!pip install catboost

In [5]:
import pandas as pd
import numpy as np
import re
import xgboost as xgb
import catboost as ctb
import lightgbm as lgb
from sklearn.model_selection import train_test_split

In [6]:
defaultpath = 'drive/My Drive/dacon/sosul/dataset'

In [7]:
train_df = pd.read_csv(defaultpath+'/train.csv',encoding='utf-8')

기본 전처리

In [8]:
train_df = train_df[train_df['text'].str.contains('\* \*')==False]

In [9]:
train_df['sentencelen'] = train_df['text'].apply(lambda x: len(x.split('.')))

In [10]:
train_df['charlen'] = train_df['text'].apply(lambda x: len(x))

In [11]:
train_df['c/s'] = train_df['charlen']/(train_df['sentencelen']+1)  ## 0으로 나뉘는것을 방지

In [12]:
train_df['upperlen'] = train_df['text'].apply(lambda x: len(re.findall('[A-Z]',x)))

In [13]:
train_df['u/s'] = train_df['upperlen']/(train_df['sentencelen']+1)  ## 0으로 나뉘는것을 방지

In [14]:
train_df['u/s'] = train_df['upperlen']/(train_df['charlen']+1)  ## 0으로 나뉘는것을 방지

프랑스어가 포함된 문장만 따로 추출

In [15]:
train_df_fr = train_df[train_df['text'].str.contains('[à|ä|ö|î|ù|â|Œ|ç|ê|ü|ñ|ô|Æ|œ|ë|æ|é|Ê|è|ì]')].copy()

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

char TF-IDF : 특수문자까지 포함하여

In [38]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
char_tfidfv  = TfidfVectorizer(analyzer='char').fit(train['text'])
train_enc = char_tfidfv.transform(train['text']).toarray()
test_enc = char_tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=10000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=10000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=10000,early_stopping_rounds=1000,task_type="GPU",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (1835, 0.5892222120908179)
lgb : 0.5890398468131668
ctb : (3695, 0.6025348773593507)


char tfidf : 프랑스어포함문장만

In [87]:
train, test = train_test_split(train_df_fr.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df_fr['author']) 
char_tfidfv  = TfidfVectorizer(analyzer='char').fit(train['text'])
train_enc = char_tfidfv.transform(train['text']).toarray()
test_enc = char_tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=10000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=10000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=10000,early_stopping_rounds=1000,task_type="GPU",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (72, 0.8507462686567164)
lgb : 0.8805970149253731
ctb : (1185, 0.8059701492537313)


In [None]:
from sklearn.externals import joblib

In [88]:
joblib.dump(lgb_model,defaultpath+'/model/fr_lgb_f5000.pkl')

['drive/My Drive/dacon/sosul/dataset/model/fr_lgb_f5000.pkl']

TF-IDF : 특수문자까지 포함하여 features 500

In [65]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-zA-Z]+|\W",max_features= 500,lowercase=True).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3621, 0.7249931613020881)
lgb : 0.7318318592140056
ctb : (42139, 0.7353879821282028)


In [66]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="(?=[A-Z]+)|[a-zA-Z]+|\W",max_features= 500,lowercase=False).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3218, 0.7162396279748335)
lgb : 0.7206163946384608
ctb : (39492, 0.7297346585210176)


In [67]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="(?=[A-Z]+)|[a-zA-Z]+|\W",max_features= 500,lowercase=True).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3621, 0.7249931613020881)
lgb : 0.7318318592140056
ctb : (42710, 0.7347497036564238)


In [68]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-z]+|[A-Z]+|\W",max_features= 500,lowercase=False).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3470, 0.7189751071396007)
lgb : 0.7267256314397739
ctb : (40678, 0.7338378772681682)


In [69]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-z]+|[A-Z]+|\W",max_features= 500,lowercase=True).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3621, 0.7249931613020881)
lgb : 0.7318318592140056
ctb : (42159, 0.7352056168505516)


In [70]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-z]+|[A-Z]+|[0-9]+|\W",max_features= 500,lowercase=True).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3621, 0.7249931613020881)
lgb : 0.7318318592140056
ctb : (43731, 0.7344761557399471)


In [71]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-z]+|[A-Z]+|[0-9]+|\W",max_features= 500,lowercase=False).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3470, 0.7189751071396007)
lgb : 0.7267256314397739
ctb : (41143, 0.7342937904622959)


소문자로 변환하여 토큰하는 것이 가장 높았습니다.
다음은 불용어를 비교해보겠습니다.

In [73]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-zA-Z]+|\W",max_features= 500,lowercase=True,stop_words='english').fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=100000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=100000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=100000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (3678, 0.6933527856296161)
lgb : 0.6973648217379411
ctb : (38432, 0.6973648217379411)


결과를 바탕으로 feature개수를 늘려보겠습니다.

TF-IDF : 특수문자까지 포함하여 features 1000

In [74]:
train, test = train_test_split(train_df.drop('index',axis=1),test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-zA-Z]+|\W",max_features= 1000,lowercase=True).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray()
test_enc = tfidfv.transform(test['text']).toarray()
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['index','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=200000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=200000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=200000,early_stopping_rounds=1000,task_type="GPU",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (5306, 0.7466034467037476)
lgb : 0.7523479529497583
ctb : (93908, 0.7561776237804322)


tfidf feature 2000
float 64를 낮춰줍니다

In [75]:
train, test = train_test_split(train_df,test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-zA-Z]+|\W",max_features= 2000,lowercase=True,dtype=np.float32).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray().astype(np.float16)
test_enc = tfidfv.transform(test['text']).toarray().astype(np.float16)
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['level_0','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['level_0','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=500000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=500000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=500000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (6358, 0.7640193307194311)
lgb : 0.776055439044406
ctb : (225452, 0.7732287772408134)


featur 5000

In [76]:
train, test = train_test_split(train_df,test_size=0.2, random_state=2021, stratify=train_df['author']) 
tfidfv = TfidfVectorizer(token_pattern="[a-zA-Z]+|\W",max_features= 5000,lowercase=True,dtype=np.float32).fit(train_df['text'])
train_enc = tfidfv.transform(train['text']).toarray().astype(np.float16)
test_enc = tfidfv.transform(test['text']).toarray().astype(np.float16)
train = pd.concat([train.reset_index(),pd.DataFrame(train_enc)],axis=1).drop(['level_0','text','sentencelen','charlen','upperlen'],axis=1)
test = pd.concat([test.reset_index(),pd.DataFrame(test_enc)],axis=1).drop(['level_0','text','sentencelen','charlen','upperlen'],axis=1)
x_train, y_train, x_test, y_test = train.drop('author',axis=1),train['author'], test.drop('author',axis=1),test['author']

xgb_model = xgb.XGBClassifier(num_class=5,objective='multi:softmax',tree_method='gpu_hist', gpu_id=0,n_estimators=500000)
xgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['merror','mlogloss'],early_stopping_rounds=1000,verbose=False)
lgb_model = lgb.LGBMClassifier(num_class=5,objective='multiclass',device_type='gpu',n_estimators=500000,early_stopping_round=1000)
lgb_model.fit(x_train, y_train, eval_set=[(x_test, y_test)], eval_metric=['multi_error','multi_logloss'],verbose=False)
ctb_model = ctb.CatBoostClassifier(n_estimators=500000,early_stopping_rounds=1000,task_type="GPU",devices="0:1",loss_function='MultiClass')
ctb_model.fit(x_train,y_train,eval_set=[(x_test,y_test)],verbose=False)
print(f'xgb : {xgb_model.best_iteration,xgb_model.score(x_test,y_test)}')
print(f'lgb : {lgb_model.score(x_test,y_test)}')
print(f'ctb : {ctb_model.get_best_iteration(),ctb_model.score(x_test,y_test)}')



xgb : (9514, 0.7890945563964621)
lgb : 0.794383149448345
ctb : (348373, 0.7869973557034741)


In [None]:
joblib.dump(xgb_model,defaultpath+'/model/xgb_f5000.pkl')
joblib.dump(lgb_model,defaultpath+'/model/lgb_f5000.pkl')
joblib.dump(ctb_model,defaultpath+'/model/ctb_f5000.pkl')

feature 10000은 gpu부족과 과적합의 이유로 하지 안겠습니다.