## Import library

In [15]:
import numpy as np
import pandas as pd
import os
import warnings

import re
# import unidecode

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

import lightgbm as lgb
from sklearn.metrics import roc_auc_score

warnings.filterwarnings(action="ignore")

DATA_PATH = "/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/"

In [2]:
df_train = pd.read_csv(DATA_PATH+"train.csv")
df_test = pd.read_csv(DATA_PATH+"test.csv")
df_sample = pd.read_csv(DATA_PATH+"sample_submission.csv")

## Preprocessing

In [3]:
# !pip install Unidecode

In [4]:
# 나는 구경도 못했는데 url이 있었다네 아니 뭔
'''
url_pattern = r"https?://\S+|www.\.\S+"
'''

'\nurl_pattern = r"https?://\\S+|www.\\.\\S+"\n'

In [5]:
# 그리고 unicode 변환은 또 뭐야
'''

'''

'\n\n'

In [6]:
# 첫번째 질문
# 그것보다는 불용어?처리를 하는게 좋지 않을까 싶었다. 추가적으로 하자면 .lower()처리정도?
# 그런데 BERT같은 transformer계열 모델을 사용한다면 attention으로 문장 내 단어간 관계을 알 수 있다.
# 그럼 제거 안해도 되나?

# 두번째 질문
# 간단한 방식, TF-IDF나 Word2Vec 같은걸 이용하는 방식도 해보고 싶다.
# 나는 이것들이 뭔지도 잘 모르니까
# 우리의 baseline은 그런 의미에서 간단하다.

In [7]:
def simple_clean_text(text):
    text = str(text)
    text.lower() # 같은 단어의 대,소문자 버전을 동일하게 인식하기 위해
    text = " ".join(text.split()) # 공백을 한칸으로만 깔끔하게 정리
    # TF-IDF나 Word2Vec 같은 임베딩은 공백(" ")을 기준으로 단어를 쪼갬(tokenize)
    # 공백이 2칸 이상이면 빈 문자열 생김
    return text

df_train['clean_text'] = df_train['comment_text'].apply(simple_clean_text)
df_test['clean_text'] = df_test['comment_text'].apply(simple_clean_text)

In [8]:
pd.set_option("display.max_colwidth", None)
df_train.loc[34:43][['comment_text']]

Unnamed: 0,comment_text
34,This bitch is nuts. Who would read a book by a woman.
35,Awesome!
36,Not for long! \n\n(Troll-In-Training since 2016)
37,Commenting for the sake of commenting to rate other comments to better comment on our new comment system.
38,Interesting concept.\n\nHow does Civil Comments plan to monetize their operation?
39,"Pity the menu lost so much vegan food! The Mash Tun was a favorite beer bar for all the delicious tempeh stuff.\nExcited the staff is still there, can't wait to try the new beers."
40,"I already see dozens of ""just commenting to vote on comments to be able to comment"" type of comments.\n\nThis voting system is completely insane- you're assuming people have the extra time to *actually read* multiple non related comments?\n\nI would love to see this system installed on oregonlive.com - they have enough online visitors and an established ""troll"" base to really need this service.\n\nWith as little online activity as wweek has - it will discourage further growth here."
41,Very cool project!
42,Awesome! Signed up just to give this a shot... good luck to your enterprise!
43,"disqus has way more functionality and obviously a huge following - it can increase visitor count significantly.\n\ndisqus also has a moderator feature - a feature that wweek either never wanted to pay their staff to undertake, or never developed a group of people they could trust to do it on their behalf on a volunteer basis."


In [9]:
df_train.loc[34:43][['clean_text']]

Unnamed: 0,clean_text
34,This bitch is nuts. Who would read a book by a woman.
35,Awesome!
36,Not for long! (Troll-In-Training since 2016)
37,Commenting for the sake of commenting to rate other comments to better comment on our new comment system.
38,Interesting concept. How does Civil Comments plan to monetize their operation?
39,"Pity the menu lost so much vegan food! The Mash Tun was a favorite beer bar for all the delicious tempeh stuff. Excited the staff is still there, can't wait to try the new beers."
40,"I already see dozens of ""just commenting to vote on comments to be able to comment"" type of comments. This voting system is completely insane- you're assuming people have the extra time to *actually read* multiple non related comments? I would love to see this system installed on oregonlive.com - they have enough online visitors and an established ""troll"" base to really need this service. With as little online activity as wweek has - it will discourage further growth here."
41,Very cool project!
42,Awesome! Signed up just to give this a shot... good luck to your enterprise!
43,"disqus has way more functionality and obviously a huge following - it can increase visitor count significantly. disqus also has a moderator feature - a feature that wweek either never wanted to pay their staff to undertake, or never developed a group of people they could trust to do it on their behalf on a volunteer basis."


In [11]:
# Encoding : 빠른 실행과 수준확인을 위해서 TF-IDF를 사용
tfidf = TfidfVectorizer(
    max_features = 100000, # 전체 단어 중 상위 10만개 사용 (10만 차원)
    ngram_range = (1,2), # unigram(단일단어) + bigram(연속단어)
    min_df = 3, # 너무 적게 등장하는 단어는 제외
    max_df = 0.9, # 너무 자주 등장하는 단어도 제외
    sublinear_tf = True # 단어 빈도에 로그 적용해 과도한 빈도 영향 완화
    # 일반 TF: 단순 count(1->10->100) / 로그 TF: (1->1+log10->1+log100)
    # 100번 등장한 단어가 1번 등장한 단어보다 100배 더 중요하다는 건 과장일 수 있음
)

X_train = tfidf.fit_transform(df_train['clean_text'])
X_test = tfidf.transform(df_test['clean_text'])

In [12]:
X_train[34:43].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [None]:
# 중간 저장
# save_npz는 sparse 구조 그대로 저장. TF-IDF결과는 sparse임
sparse.save_npz("X_train_tfidf.npz", X_train)
sparse.save_npz("X_test_tfidf.npz", X_test)

# 로드
# X_train = sparse.load_npz("X_train_tfidf.npz")
# X_test  = sparse.load_npz("X_test_tfidf.npz")

In [None]:
# 차원이 10만개는 좀 많을 수도 있으니 차원축소
# 10만개는 PCA처리 -> covariance matrix 연산값이 너무 커짐
# sparse matrix 형태이므로 SVD 사용

svd = TruncatedSVD(n_components=5000, random_state=42)
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

In [None]:
# 중간 저장
# savez_compressed는 dense array로 변환해서 저장, SVD결과는 dense임
np.savez_compressed(
    'svd_features.npz',
    X_train_svd=X_train_svd,
    X_test_svd=X_test_svd
)

# 로드
# data = np.load('svd_features.npz')
# X_train_svd = data['X_train_svd']
# X_test_svd = data['X_test_svd']

## Baseline: LightGBM

In [None]:
y = df_train['target'].values

params = {
    'objective':'regression',
    'metric':'rmse', # 학습용 손실함수
    'boosting_type':'gbdt', # 학습방식 = {gbdt,dart,goss}
    'learning_rate':0.05,
    'n_estimators':1000,
    'subsample':0.8,
    'n_jobs':-1,
    'colsample_bytree': 0.8,
    'num_leaves': 127,
    'min_child_samples': 30,
    'reg_alpha':0.1,
    'reg_lambda':0.1,
}

lgbm = lgb.LGBMRegressor(**params)
lgbm.fit(X_train_svd, y)

In [None]:
y_pred = lgbm.predict(X_test_svd)

In [None]:
print(y_pred)

In [None]:
submission = pd.DataFrame({
    'id': df_test['id'],
    'prediction': y_pred
})

submission.to_csv('submission.csv', index=False)