<a href="https://colab.research.google.com/github/im-hjk/dankook_dacon/blob/master/d_201113_%EB%AC%B8%EC%9E%90%EC%97%B4%EA%B0%80%EA%B3%B5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library imports and settings

In [1]:
# Load (or reload) the autoreload extension and use mode 2, which re-imports
# modules before each cell executes so edits to local .py files are picked up.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
from matplotlib import pyplot as plt
from matplotlib import rcParams
import seaborn as sns

import io
from pathlib import Path

import pandas as pd
import warnings
import numpy as np

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer

In [4]:
# Plot defaults: 16x8 figures drawn with the FiveThirtyEight style sheet.
rcParams['figure.figsize'] = (16, 8)
plt.style.use('fivethirtyeight')

# Use the fully-qualified option name. The abbreviated 'max_columns' pattern is
# ambiguous on pandas versions that also define 'styler.render.max_columns'
# and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)

# Suppress all warnings for the rest of the session (this also hides library
# deprecation and convergence warnings).
warnings.simplefilter('ignore')

# Loading data

In [25]:
# Project directories on the mounted Google Drive.
main_path = Path('/content/drive/My Drive/Colab Notebooks/dacon_author')
feedMe_dir = main_path / 'feed_me'  # where the base feature/data files live
medium_dir = main_path / 'medium'   # where intermediate results are stored
result_dir = main_path / 'result'   # where final results are stored

# Input files.
trn_file = feedMe_dir / 'train.csv'
tst_file = feedMe_dir / 'test_x.csv'
sample_file = feedMe_dir / 'sample_submission.csv'

# Experiment constants.
target_col = 'author'
n_fold = 5
n_class = 5
seed = 42

# Model / feature identifiers used to name every output artifact.
algo_name = 'lr'
feature_name = 'tfidf'
model_name = f'{algo_name}_{feature_name}'

# Output artifacts. `sub_file` used to be assigned twice (first under
# `result_dir`, then under `medium_dir / 'sub'`); only the second assignment
# ever took effect, so the dead first one is removed and the effective path kept.
feature_file = medium_dir / 'feat' / f'{feature_name}.csv'
p_val_file = medium_dir / 'val' / f'{model_name}.val.csv'
p_tst_file = medium_dir / 'tst' / f'{model_name}.tst.csv'
sub_file = medium_dir / 'sub' / f'{model_name}.csv'

In [6]:
# Read the training set, the test set and the sample-submission template
# (in that order) as UTF-8 encoded CSV files.
trn, tst, sub = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in (trn_file, tst_file, sample_file)
)

# Preprocessing

In [10]:
import nltk

# Download the NLTK data used by the vectorizer cells: the English stop-word
# list and the Punkt tokenizer models used by word_tokenize.
# NOTE(review): recent NLTK releases load the 'punkt_tab' resource instead of
# 'punkt' when tokenizing; requesting both is meant to keep a fresh kernel
# working on either version — confirm against the installed NLTK.
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
# Bag-of-words counts over NLTK word tokens: unigrams and bigrams that occur
# in at least 100 training documents, with English stop words removed.
english_stop_words = stopwords.words('english')
vec = CountVectorizer(
    tokenizer=word_tokenize,
    stop_words=english_stop_words,
    ngram_range=(1, 2),
    min_df=100,
)
X_cnt = vec.fit_transform(trn['text'])
print(X_cnt.shape)

(54879, 2685)


In [12]:
# Show the first 50 count features of the first training document as a dense matrix.
X_cnt[0, :50].todense()

matrix([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])

In [13]:
# TF-IDF features over NLTK word tokens: 1- to 3-grams that occur in at least
# 50 training documents, with English stop words removed. The vocabulary is
# fitted on the training text and then applied unchanged to the test text.
english_stop_words = stopwords.words('english')
vec = TfidfVectorizer(
    tokenizer=word_tokenize,
    stop_words=english_stop_words,
    ngram_range=(1, 3),
    min_df=50,
)
X = vec.fit_transform(trn['text'])
X_tst = vec.transform(tst['text'])
print(X.shape, X_tst.shape)

(54879, 5899) (19617, 5899)


In [14]:
# Show the first 50 TF-IDF features of the first training document as a dense matrix.
X[0, :50].todense()

matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0.]])

# Logistic regression modeling

In [19]:
# Stratified, shuffled K-fold splitter; the fixed random_state keeps the fold
# assignment identical across runs.
cv = StratifiedKFold(
    n_splits=n_fold,
    shuffle=True,
    random_state=seed,
)

In [20]:
# Target labels as a NumPy array. Select the column through the configured
# `target_col` name instead of a hard-coded attribute so the config cell stays
# the single source of truth.
y = trn[target_col].values
y.shape

(54879,)

In [21]:
# p     : out-of-fold class probabilities, one row per training document.
# p_tst : class probabilities for the test documents, averaged over the folds.
p = np.zeros((X.shape[0], n_class))
p_tst = np.zeros((X_tst.shape[0], n_class))

# Every fold model contributes an equal share of the test prediction, so the
# weight is 1 / (number of folds). Dividing by n_class was only correct by
# coincidence (both are 5) and would yield rows that do not sum to 1 as soon
# as the fold count and the class count differ.
n_splits = cv.get_n_splits()
for i_trn, i_val in cv.split(X, y):
    clf = LogisticRegression()
    clf.fit(X[i_trn], y[i_trn])
    # Validation rows are scored by a model that was not fitted on them.
    p[i_val, :] = clf.predict_proba(X[i_val])
    p_tst += clf.predict_proba(X_tst) / n_splits

In [22]:
# Score the out-of-fold predictions. The arg-max column index is compared
# directly with y, so this assumes labels are the integers 0..n_class-1 in
# column order — TODO confirm.
oof_pred = np.argmax(p, axis=1)
cv_accuracy = accuracy_score(y, oof_pred) * 100
print(f'Accuracy (CV): {cv_accuracy:8.4f}%')

# log_loss receives the labels one-hot encoded via get_dummies.
cv_log_loss = log_loss(pd.get_dummies(y), p)
print(f'Log Loss (CV): {cv_log_loss:8.4f}')

Accuracy (CV):  76.5666%
Log Loss (CV):   0.6804


In [26]:
# Persist the out-of-fold and the test probabilities as comma-separated text
# with six decimal places.
for out_path, probs in ((p_val_file, p), (p_tst_file, p_tst)):
    np.savetxt(out_path, probs, fmt='%.6f', delimiter=',')

# Submission file

In [27]:
# Re-read the sample submission, this time using its first column as the row
# index, and show its shape and first rows.
sub = pd.read_csv(sample_file, index_col=0)
print(sub.shape)
sub.head()

(19617, 5)


Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0


In [28]:
# Overwrite every column of the template with the test probabilities.
# Assumes p_tst has one column per submission column, in the same class
# order — TODO confirm against the competition's column layout.
sub[sub.columns] = p_tst
sub.head()

Unnamed: 0_level_0,0,1,2,3,4
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0.0617,0.5324,0.3168,0.064,0.0251
1,0.0787,0.8213,0.0031,0.0279,0.0689
2,0.7131,0.0329,0.1194,0.0381,0.0967
3,0.038,0.0037,0.8445,0.0059,0.1079
4,0.3025,0.2469,0.1425,0.1928,0.1154


In [29]:
# Write the filled-in submission (including its index column) to sub_file.
sub.to_csv(sub_file)