# STEP 00. Import Libraries

In [40]:
import os, gzip, pickle

import pandas as pd
import numpy as np

from tqdm import *
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from konlpy.tag import *

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

from imblearn.over_sampling import SMOTE 

# STEP 01. Data Load & EDA

In [None]:
train = pd.read_csv("newszum_train_data.csv")
test = pd.read_csv("newszum_test_data.csv")
data = pd.concat([train, test], ignore_index=True)
train

In [None]:
test

In [None]:
train.category.hist(figsize=(20,10))

In [None]:
test.category.hist(figsize=(20,10))

In [None]:
data.category.hist(figsize=(20,10))

In [None]:
data.title.str.len().hist()

In [None]:
data.cleanBody.str.len().hist()

# STEP 02. Tokenizing

## 1. Title

In [None]:
min_df = 2

okt = Okt()

train_morph = []
for i in range(len(train)):
    train_morph.append(" ".join([x for x in okt.nouns(train['title'][i]) if len(x) > 1]))
    
test_morph = []
for i in range(len(test)):
    test_morph.append(" ".join([x for x in okt.nouns(test['title'][i]) if len(x) > 1]))

vectorizer = TfidfVectorizer(analyzer='word',       
                        min_df=min_df,   # 특정 횟수 이상 언급된 것만 따로                      
                        max_features=50000,             
                        )

train_vectorized = vectorizer.fit_transform(train_morph)
test_vectorized = vectorizer.transform(test_morph)

In [None]:
# exmaple
T_words = pd.DataFrame(train_vectorized.toarray(), columns=vectorizer.get_feature_names())
T_words

In [None]:
T_len = len(vectorizer.get_feature_names())
T_train = pd.DataFrame(train_vectorized.toarray(), columns=[i for i in range(T_len)])
T_test = pd.DataFrame(test_vectorized.toarray(), columns=[i for i in range(T_len)])

T_train.shape, T_test.shape

## 2. Body Text

In [12]:
min_df = 5

okt = Okt()

train_morph = []
for i in range(len(train)):
    train_morph.append(" ".join([x for x in okt.nouns(train['cleanBody'][i]) if len(x) > 1]))

test_morph = []
for i in range(len(test)):
    test_morph.append(" ".join([x for x in okt.nouns(test['cleanBody'][i]) if len(x) > 1]))

vectorizer = TfidfVectorizer(analyzer='word',       
                        min_df=min_df,   # 특정 횟수 이상 언급된 것만 따로                      
                        max_features=50000,             
                        )

train_vectorized = vectorizer.fit_transform(train_morph)
test_vectorized = vectorizer.transform(test_morph)

In [13]:
# example
B_words = pd.DataFrame(train_vectorized.toarray(), columns=vectorizer.get_feature_names())
B_words



Unnamed: 0,가가,가게,가격,가결,가구,가기,가까이,가늠,가능,가능성,...,흥행,희망,희망이,희생,흰색,히데,히어로,히어로즈,히트,힐링
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.105369,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.281241,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.026415,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0
2194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
B_len = len(vectorizer.get_feature_names())
B_train = pd.DataFrame(train_vectorized.toarray(), columns=[i+T_len for i in range(B_len)])
B_test = pd.DataFrame(test_vectorized.toarray(), columns=[i+T_len for i in range(B_len)])

B_train.shape, B_test.shape



((2196, 6702), (942, 6702))

## 3. Labels

In [15]:
labels = train.category.unique()

labels_dict = {}
for i in range(len(labels)) :
    labels_dict[labels[i]] = i
labels_dict

{'international': 0,
 'economy': 1,
 'society': 2,
 'sport': 3,
 'it': 4,
 'politics': 5,
 'entertain': 6,
 'culture': 7}

In [16]:
L_train = train['category'].map(lambda x : labels_dict[x])
L_test = test['category'].map(lambda x : labels_dict[x])

## 4. Concat

In [26]:
df_train = pd.concat([T_train, B_train, L_train], axis=1)
df_test = pd.concat([T_test, B_test, L_test], axis=1)
df_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9080,9081,9082,9083,9084,9085,9086,9087,9088,category
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.105369,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.281241,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,6
2192,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2
2193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,2
2194,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,6


# STEP 03. Modeling

## 1. Base Modeling

In [18]:


X, y = df_train.iloc[:,:-1], df_train.iloc[:,-1]

rf = RandomForestClassifier()
lr = LogisticRegression()
xgb = XGBClassifier()

rf.fit(X, y)
lr.fit(X, y)
xgb.fit(X, y)

pred1 = rf.predict(df_test.iloc[:,:-1])
pred2 = lr.predict(df_test.iloc[:,:-1])
pred3 = xgb.predict(df_test.iloc[:,:-1])

score1 = accuracy_score(df_test.iloc[:,-1], pred1)
score2 = accuracy_score(df_test.iloc[:,-1], pred2)
score3 = accuracy_score(df_test.iloc[:,-1], pred3)

print(score1, score2, score3)

0.826963906581741 0.8598726114649682 0.8354564755838642


In [19]:
xgb1 = XGBClassifier(grow_policy= 'depthwise', 
                            n_estimators= 500, 
                            num_parallel_tree= 1, 
                            tree_method= 'gpu_hist',
                            predictor= 'gpu_predictor', 
                            n_jobs= -1, 
                            booster= 'dart',
                            rate_drop=0.5,
                            skip_drop=0.1, 
                            subsample=0.8)

xgb2 = XGBClassifier(grow_policy= 'lossguide', 
                            n_estimators= 500, 
                            num_parallel_tree= 1, 
                            tree_method= 'gpu_hist',
                            predictor= 'gpu_predictor', 
                            n_jobs= -1, 
                            subsample=0.8)

xgb3 = XGBClassifier(grow_policy= 'depthwise', 
                            n_estimators= 200, 
                            num_parallel_tree= 3, 
                            tree_method= 'gpu_hist',
                            predictor= 'gpu_predictor', 
                            n_jobs= -1, 
                            booster= 'dart',
                            rate_drop=0.5,
                            skip_drop=0.1, 
                            subsample=0.8)

xgb1.fit(X, y)
xgb2.fit(X, y)
xgb3.fit(X, y)

pred4 = xgb1.predict(df_test.iloc[:,:-1])
pred5 = xgb2.predict(df_test.iloc[:,:-1])
pred6 = xgb3.predict(df_test.iloc[:,:-1])

score4 = accuracy_score(df_test.iloc[:,-1], pred4)
score5 = accuracy_score(df_test.iloc[:,-1], pred5)
score6 = accuracy_score(df_test.iloc[:,-1], pred6)

print(score4, score5, score6)

0.8375796178343949 0.8407643312101911 0.826963906581741


## Data Augmentation

In [20]:
X, y = df_train.iloc[:,:-1], df_train.iloc[:,-1]

sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)

In [22]:
rf = RandomForestClassifier(n_jobs=-1)
lr = LogisticRegression(n_jobs=-1)
xgb = XGBClassifier(n_jobs=-1)

rf.fit(X_res, y_res)
lr.fit(X_res, y_res)
xgb.fit(X_res, y_res)

pred1 = rf.predict(df_test.iloc[:,:-1])
pred2 = lr.predict(df_test.iloc[:,:-1])
pred3 = xgb.predict(df_test.iloc[:,:-1])

score1 = accuracy_score(df_test.iloc[:,-1], pred1)
score2 = accuracy_score(df_test.iloc[:,-1], pred2)
score3 = accuracy_score(df_test.iloc[:,-1], pred3)

print(score1, score2, score3)

0.8428874734607219 0.8747346072186837 0.8428874734607219


In [23]:
xgb1.fit(X_res, y_res)
xgb2.fit(X_res, y_res)
xgb3.fit(X_res, y_res)

pred4 = xgb1.predict(df_test.iloc[:,:-1])
pred5 = xgb2.predict(df_test.iloc[:,:-1])
pred6 = xgb3.predict(df_test.iloc[:,:-1])

score4 = accuracy_score(df_test.iloc[:,-1], pred4)
score5 = accuracy_score(df_test.iloc[:,-1], pred5)
score6 = accuracy_score(df_test.iloc[:,-1], pred6)

print(score4, score5, score6)

0.8471337579617835 0.8503184713375797 0.8152866242038217


## Ensemble

In [27]:
model1, model2, model3 = rf, lr, xgb
model4, model5, model6 = xgb1, xgb2, xgb3

df_train2 = pd.DataFrame()
df_train2['pred1'] = model1.predict(X)
df_train2['pred2'] = model2.predict(X)
df_train2['pred3'] = model3.predict(X)
df_train2['pred4'] = model4.predict(X)
df_train2['pred5'] = model5.predict(X)
df_train2['pred6'] = model6.predict(X)

df_train2

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,pred6
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,1,1,1,1,1,1
3,1,1,1,1,1,1
4,2,2,2,2,2,2
...,...,...,...,...,...,...
2191,6,6,6,6,6,6
2192,2,0,2,2,2,2
2193,2,2,2,2,2,1
2194,6,6,6,6,6,6


In [28]:
df_test2 = pd.DataFrame()
df_test2['pred1'] = model1.predict(df_test.iloc[:,:-1])
df_test2['pred2'] = model2.predict(df_test.iloc[:,:-1])
df_test2['pred3'] = model3.predict(df_test.iloc[:,:-1])
df_test2['pred4'] = model4.predict(df_test.iloc[:,:-1])
df_test2['pred5'] = model5.predict(df_test.iloc[:,:-1])
df_test2['pred6'] = model6.predict(df_test.iloc[:,:-1])
# df_test2['label'] = df_test2_test.iloc[:,-1]

df_test2

Unnamed: 0,pred1,pred2,pred3,pred4,pred5,pred6
0,6,6,6,6,6,6
1,6,6,6,6,6,6
2,1,1,1,1,1,1
3,2,2,2,2,2,2
4,2,2,2,2,2,2
...,...,...,...,...,...,...
937,0,0,0,0,0,0
938,6,6,6,6,6,6
939,5,2,2,5,5,5
940,0,0,5,5,0,2


In [31]:
from sklearn.svm import SVC

model7 = SVC()
model7.fit(df_train2, y)
pred7 = model7.predict(df_test2)
accuracy_score(df_test.iloc[:,-1], pred7)

0.8131634819532909

## GridSearchCV

In [32]:
from sklearn.model_selection import GridSearchCV

params = {'max_iter':[100, 1000],
          'C':[1,10,100, 10000]}

clf = GridSearchCV(lr, param_grid=params)
clf.fit(X_res, y_res)
sorted(clf.cv_results_.keys())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_C',
 'param_max_iter',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [33]:
clf.best_params_

{'C': 100, 'max_iter': 100}

In [34]:
model = LogisticRegression(C=100, n_jobs=-1)
model.fit(X_res, y_res)
pred = model.predict(df_test.iloc[:,:-1])

score = accuracy_score(df_test.iloc[:,-1], pred)
score

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8874734607218684

In [38]:
model2 = LogisticRegression(C=10000, n_jobs=-1)
model2.fit(X_res, y_res)
pred2 = model2.predict(df_test.iloc[:,:-1])

score2 = accuracy_score(df_test.iloc[:,-1], pred2)
score2

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.8895966029723992

# STEP 04. Model Save & Load

In [41]:
save_dir = './models'
if not os.path.exists(save_dir) :
    os.mkdir(save_dir)
    
with gzip.open(f'{save_dir}/LR_tfidf.pickle','wb') as f:
    pickle.dump(model2, f)

In [42]:
del model, model2

In [43]:
with gzip.open(f'{save_dir}/LR_tfidf.pickle','rb') as f:
    model = pickle.load(f)
    
pred = model.predict(df_test.iloc[:,:-1])
score = accuracy_score(df_test.iloc[:,-1], pred)
print(score)

0.8895966029723992


# STEP 05. Inference & Submit

In [44]:
labels_dict

{'international': 0,
 'economy': 1,
 'society': 2,
 'sport': 3,
 'it': 4,
 'politics': 5,
 'entertain': 6,
 'culture': 7}

In [45]:
labels_dict2 = {v:k for k, v in labels_dict.items()}
labels_dict2

{0: 'international',
 1: 'economy',
 2: 'society',
 3: 'sport',
 4: 'it',
 5: 'politics',
 6: 'entertain',
 7: 'culture'}

In [46]:
test['result'] = pred
test['result'] = test['result'].map(lambda x : labels_dict2[x])
test

Unnamed: 0,title,cleanBody,category,result
0,"배진웅 측 ""성폭행 의혹 사실무근…보도 전 B씨 강제추행죄 고소""[공식]",후배 여배우 성폭행 의혹에 휘말린 배우 배진웅 측이 허위사실 유포에 대한 강경대응 ...,entertain,entertain
1,"배진웅 측 ""여배우 성추행 고소내용 허위""(공식)",[조이뉴스24 정명화 기자] 배우 배진웅이 동료여배우 성추행 혐의를 부인했다. 배진...,entertain,entertain
2,매일 쏟아지는 코스피 기록들…'동학개미발 가보지 않는 숫자',11일 서울 영등포구 KB국민은행 여의도지점 스마트딜링룸 전광판에 코스피지수가 전일...,economy,economy
3,"정부, 코로나19 의료진 처우 지적에 ""인력충원 등 개선 지속 논의""",정부가 12일 신종 코로나바이러스 감염증(코로나19) 확산에 따른 의료현장 인력 부...,society,society
4,"집합금지 완화 토요일 발표 전망…중수본 ""엄격한 방역 조건으로 해제 검토""","정부가 11일부터 코로나19로 피해를 입은 소상공인과 고용 취약계층에 버팀목자금, ...",society,society
...,...,...,...,...
937,"日코로나 신규확진 600명…""인슐린주사기로 백신1병에 7회""",일본 수도권 일부 지역에 신종 코로나바이러스 감염증(코로나19) 긴급사태가 발령 중...,society,international
938,[슬라이드 뉴스] 이지은 누구? '금홍아 금홍아'로 신인상 휩쓴 '청춘스타',숨진 채 발견된 배우 이지은에 대해 관심이 쏠리고 있다. 1971년생인 이지은은 1...,entertain,entertain
939,"文 “검찰 공정성 신뢰 나아지지 않아… 기소·수사권 분리, 나아가야 할 방향”","문재인 캐리커처 문재인(얼굴) 대통령은 8일 ""기소권과 수사권 분리는 앞으로도 꾸준...",politics,society
940,"中, '백신여권' 도입에 ""편리한 인원왕래 요구 절실""",[서울=뉴시스] 중국 외교부가 '중국판 백신 여권'에 대해 추가적인 설명을 내놓았다...,international,international


In [47]:
test.to_csv('submission.csv', index=False, encoding='utf-8-sig')
df = pd.read_csv('submission.csv')
df

Unnamed: 0,title,cleanBody,category,result
0,"배진웅 측 ""성폭행 의혹 사실무근…보도 전 B씨 강제추행죄 고소""[공식]",후배 여배우 성폭행 의혹에 휘말린 배우 배진웅 측이 허위사실 유포에 대한 강경대응 ...,entertain,entertain
1,"배진웅 측 ""여배우 성추행 고소내용 허위""(공식)",[조이뉴스24 정명화 기자] 배우 배진웅이 동료여배우 성추행 혐의를 부인했다. 배진...,entertain,entertain
2,매일 쏟아지는 코스피 기록들…'동학개미발 가보지 않는 숫자',11일 서울 영등포구 KB국민은행 여의도지점 스마트딜링룸 전광판에 코스피지수가 전일...,economy,economy
3,"정부, 코로나19 의료진 처우 지적에 ""인력충원 등 개선 지속 논의""",정부가 12일 신종 코로나바이러스 감염증(코로나19) 확산에 따른 의료현장 인력 부...,society,society
4,"집합금지 완화 토요일 발표 전망…중수본 ""엄격한 방역 조건으로 해제 검토""","정부가 11일부터 코로나19로 피해를 입은 소상공인과 고용 취약계층에 버팀목자금, ...",society,society
...,...,...,...,...
937,"日코로나 신규확진 600명…""인슐린주사기로 백신1병에 7회""",일본 수도권 일부 지역에 신종 코로나바이러스 감염증(코로나19) 긴급사태가 발령 중...,society,international
938,[슬라이드 뉴스] 이지은 누구? '금홍아 금홍아'로 신인상 휩쓴 '청춘스타',숨진 채 발견된 배우 이지은에 대해 관심이 쏠리고 있다. 1971년생인 이지은은 1...,entertain,entertain
939,"文 “검찰 공정성 신뢰 나아지지 않아… 기소·수사권 분리, 나아가야 할 방향”","문재인 캐리커처 문재인(얼굴) 대통령은 8일 ""기소권과 수사권 분리는 앞으로도 꾸준...",politics,society
940,"中, '백신여권' 도입에 ""편리한 인원왕래 요구 절실""",[서울=뉴시스] 중국 외교부가 '중국판 백신 여권'에 대해 추가적인 설명을 내놓았다...,international,international
