## Import library

In [None]:
!pip install unidecode

In [None]:
# Numerical, tabular and plotting stack.
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings

# Text normalisation helpers.
import re # regular expressions: built-in module for finding, replacing and splitting patterns in strings
import unicodedata
import unidecode

# Feature extraction and dimensionality reduction.
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import sparse
from sklearn.decomposition import TruncatedSVD

# Model and evaluation metric.
import lightgbm as lgb
from sklearn.metrics import roc_auc_score

# Silence every Python warning for the rest of the notebook.
warnings.filterwarnings(action="ignore")

# Directory prefix of the competition files. The trailing slash matters:
# file names are appended to it with plain string concatenation.
DATA_PATH = "/kaggle/input/jigsaw-unintended-bias-in-toxicity-classification/"

In [None]:
# Read the three competition tables (train, test, sample submission) from
# DATA_PATH, one DataFrame per file, in that order.
df_train, df_test, df_sample = (
    pd.read_csv(DATA_PATH + csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

## Preprocessing
* ASCII 변환  
  NFKD 정규화로 호환·결합 문자를 분해한 뒤, unidecode 라이브러리로 비ASCII 문자를 가장 가까운 ASCII 표기로 근사 변환
  * 유니코드는 모든 문자(한글, 이모지, 악센트 문자 등)를 표현할 수 있는 거대한 문자 체계  
  * ASCII는 옛날 컴퓨터에서 쓰던 아주 기본적인 문자 집합(영어+기초 특수문자). 따라서 단순하게 ASCII 형태로 통일  
* url 제거: 보지는 못했지만, 온라인 댓글이라서 http:// 형태의 url이 많다고 함. toxicity 탐지에 의미는 없으므로 제거
* lower case: 대소문자 및 간격 통일

In [None]:
# ASCII conversion

# Invisible Unicode characters (soft hyphen, zero-width spaces/joiners,
# directional marks and isolates, word joiner, BOM). Compiled once here so the
# pattern is not rebuilt for every row.
_INVISIBLE_CHARS = re.compile(
    r"[\u00AD\u200B\u200C\u200D\u200E\u200F"
    r"\u202C\u202D\u202E\u2060\u2066\u2067\u2068\u2069"
    r"\uFEFF]"
)


def ascii_text(text):
    """Return an ASCII-only approximation of ``text``.

    Steps, in order:
      1. Missing values (``None`` or a float NaN) become ``""``.
      2. Any other input is converted with ``str()``.
      3. Invisible Unicode characters are deleted.
      4. The text is NFKD-normalised so composed characters are split into
         base character + combining marks.
      5. ``unidecode`` converts what is left to ASCII.

    Args:
        text: The raw comment; normally a ``str``, but any object is accepted.

    Returns:
        The converted string, or ``""`` for a missing value.
    """
    # A missing cell would otherwise be stringified to "None" / "nan" and show
    # up downstream as a real token. NaN is the only float unequal to itself.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = _INVISIBLE_CHARS.sub("", str(text))
    # Decompose combined characters before transliteration.
    text = unicodedata.normalize("NFKD", text)
    # Convert to ASCII.
    return unidecode.unidecode(text)

# Add the ASCII-converted comment as 'clean_text1' on both splits.
for _frame in (df_train, df_test):
    _frame['clean_text1'] = _frame['comment_text'].apply(ascii_text)

In [None]:
# Strip URLs from the comments before further cleaning.
#
# How the pattern reads (two alternatives joined by "|"):
#
#   https?://\S+    "http", an optional "s", "://", then one or more
#                   non-whitespace characters.
#                     http://example.com    -> match
#                     https://example.com   -> match
#                     https://              -> no match (nothing after "://")
#
#   \bwww\.\S+      "www" starting at a word boundary, a literal dot (escaped,
#                   because a bare "." matches any character), then one or
#                   more non-whitespace characters.
#                     www.example.com       -> match
#                     wwwexample.com        -> no match (no dot after "www")
#                     www.                  -> no match (nothing after the dot)
#                     awww.cute             -> no match ("www" is mid-word)
url_pattern = r"https?://\S+|\bwww\.\S+"

# regex=True makes pandas treat url_pattern as a regular expression rather
# than a literal substring. case=False is needed because this step runs on
# text that has not been lower-cased yet, so "HTTP://..." and "WWW...." must
# match too. The .str accessor applies the replacement inside every string of
# the column.
df_train['clean_text2'] = df_train['clean_text1'].str.replace(
    url_pattern, " ", case=False, regex=True
)
df_test['clean_text2'] = df_test['clean_text1'].str.replace(
    url_pattern, " ", case=False, regex=True
)

In [None]:
def simple_clean_text(text):
    """Lower-case ``text`` and collapse every whitespace run to one space.

    Leading and trailing whitespace disappears as well, because the text is
    split on whitespace and re-joined with single spaces.

    Args:
        text: The string to normalise. No conversion is attempted, so an
            object without a ``lower`` method raises ``AttributeError``.

    Returns:
        The lower-cased, whitespace-normalised string.
    """
    words = text.lower().split()
    return " ".join(words)

# Add the lower-cased, whitespace-normalised text as 'clean_text3' on both
# splits.
for _frame in (df_train, df_test):
    _frame['clean_text3'] = _frame['clean_text2'].apply(simple_clean_text)

In [None]:
# Show full cell contents in DataFrame output instead of truncating them.
pd.set_option("display.max_colwidth", None)

# Preview the raw comments for index labels n through n + 5; label slicing
# with .loc includes both endpoints.
n = 76657
df_train.loc[n:n + 5, ['comment_text']]

In [42]:
# Same rows as the raw preview, now showing the fully cleaned text.
df_train.loc[n:n + 5, ['clean_text3']]

Unnamed: 0,clean_text3
76657,my friend just told me about this easiest method of earning money from home. i've just tried it and now i am making $4500 per month without spending too much time. you can alse learn about this trick by the link below -------------------
76658,"dave, where they the ones who came here from eastern europe? or was it western europe a few thousand years later? after some few hundred years i think anyone born in the americas are from this continent. maybe its time we started living what the liberals spew and get over the race thing."
76659,"this nice map clearly suggests an alternative and equally defensible headline: voters west of i5 are warier of school bonds than eastsiders. the pps board needs to understand, as well as those blue pps neighborhoods, that placing another bond on the nov. 2016 ballot will put those who support it in direct opposition to portland's city commissioners who have declared a ""housing crisis."" increasing property taxes, as a bond would do, is anathema to increasing housing affordability. city commissioners and candidates for city council must vigorously oppose any pps bond issue or be required to appear in public with very large signs around their necks saying, ""hypocrite."""
76660,my friend just told me about this easiest method of earning money from home. i've just tried it and now i am making $4500 per month without spending too much time. you can alse learn about this trick by the link below -------------------
76661,"you remind me of a family member of mine who one day upped and left his wife of 20 years and kids, after meeting someone online. he just packed his car and left for another state after he got off of work one day. then a few years later he claimed to had been ""born again"". ever since his supposed conversion, he would and still does, take weekly swipes (like you) at bill clinton on facebook and other media, regarding his white house affair. to this day, things have never been the same with this family member and our other family members. this person never tried to make amends to his wife. he simply believed that all he needed to do was pick up a bible and all was good. the process of forgiveness requires some hard work. i believe the clintons worked on their marriage since those dark days and continue to do so, unlike my die hard ""born again christian"" family member."
76662,"i have experience with tires. the weakest part of any tire is the sidewall and the easiest area to puncture. any puncture in the sidewall of a tire is not repairable, passenger tires, commercial tires and certainly not aviation tires."


## Embedding
* TF-IDF: 단어 빈도(TF)에 역문서 빈도(IDF) 가중치를 곱한 희소(sparse) 벡터 표현. one-hot처럼 단어(또는 n-gram)마다 한 차원을 쓰지만, 값은 0/1이 아니라 해당 문서에서의 단어 중요도

In [43]:
%%time
# `%%time` prints how long the cell took to run; it has to stay alone on the
# first line of the cell, with no comment before it.

# Encoding: TF-IDF is used for a fast run and a first read on model quality.
_tfidf_settings = {
    "max_features": 100000,  # cap the vocabulary at 100k terms (100k dims)
    "ngram_range": (1, 2),   # unigrams (single words) + bigrams (word pairs)
    "min_df": 3,             # drop terms that appear too rarely
    "max_df": 0.9,           # drop terms that appear too often
    # Replace the raw term count tf with 1 + log(tf): a term seen 100 times
    # is probably not 100x as important as a term seen once.
    "sublinear_tf": True,
}
tfidf = TfidfVectorizer(**_tfidf_settings)

# Fit on the training text only, then reuse the fitted vectorizer for test.
X_train = tfidf.fit_transform(df_train['clean_text3'])
X_test = tfidf.transform(df_test['clean_text3'])

CPU times: user 4min 26s, sys: 9.34 s, total: 4min 36s
Wall time: 4min 35s


In [44]:
# Convert a small slice (rows 34 to 42) of the TF-IDF matrix to a dense array
# for a quick visual check.
X_train[34:43].toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
# Checkpoint: the TF-IDF output is sparse, and save_npz stores it in sparse
# form.
for _path, _matrix in (
    ("X_train_tfidf.npz", X_train),
    ("X_test_tfidf.npz", X_test),
):
    sparse.save_npz(_path, _matrix)

# To reload the matrices later:
# X_train = sparse.load_npz("X_train_tfidf.npz")
# X_test  = sparse.load_npz("X_test_tfidf.npz")

In [None]:
%%time
# `%%time` prints how long the cell took to run; it has to stay alone on the
# first line of the cell, with no comment before it.

# 100k TF-IDF dimensions may be too many, so reduce them. PCA is avoided
# because its covariance computation gets too large at this width; the input
# is a sparse matrix, so TruncatedSVD is used instead.
# NOTE(review): the result is a dense (n_rows x 5000) array. Assuming float64
# that is about 40 kB per row -- confirm it fits in memory for this dataset.
_svd_settings = {"n_components": 5000, "random_state": 42}
svd = TruncatedSVD(**_svd_settings)

# Fit the projection on train, then apply the same projection to test.
X_train_svd = svd.fit_transform(X_train)
X_test_svd = svd.transform(X_test)

In [None]:
# Checkpoint: the SVD output is dense, so both arrays go into one compressed
# .npz archive, each stored under its own key.
_svd_arrays = {
    "X_train_svd": X_train_svd,
    "X_test_svd": X_test_svd,
}
np.savez_compressed('svd_features.npz', **_svd_arrays)

# To reload the arrays later:
# data = np.load('svd_features.npz')
# X_train_svd = data['X_train_svd']
# X_test_svd = data['X_test_svd']

## Baseline: LightGBM

In [None]:
%%time
# `%%time` prints how long the cell took to run; it has to stay alone on the
# first line of the cell, with no comment before it.

# Training labels: the values of the 'target' column.
y = df_train['target'].values

params = {
    'objective': 'regression',   # loss that training minimises
    # Evaluation metric. It is only reported when an eval_set is passed to
    # fit(); it is not the training loss.
    'metric': 'rmse',
    'boosting_type': 'gbdt',     # boosting algorithm (e.g. gbdt, dart)
    'learning_rate': 0.05,
    'n_estimators': 1000,
    'subsample': 0.8,
    # Row subsampling is disabled while subsample_freq is 0 (the default), so
    # without this line 'subsample' above is silently ignored.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    'num_leaves': 127,
    'min_child_samples': 30,
    'reg_alpha': 0.1,
    'reg_lambda': 0.1,
    # Fixed seed so the row/column sampling is repeatable; 42 matches the
    # seed used for the SVD step.
    'random_state': 42,
    'n_jobs': -1,
}

lgbm = lgb.LGBMRegressor(**params)
lgbm.fit(X_train_svd, y)

In [None]:
# Predict the target for every test row from its SVD features.
y_pred = lgbm.predict(X_test_svd)

In [None]:
# Quick look at the predicted values.
print(y_pred)

In [None]:
# Build the submission table: the test ids with the model's prediction for
# each row in a 'prediction' column next to them.
submission = df_test[['id']].assign(prediction=y_pred)

# Write the file without the DataFrame index, leaving only the two columns.
submission.to_csv('submission.csv', index=False)