# 02 TF-IDF 

### inverted index 방법 중 가장 대표적인 방법이 **tf-idf**
Inverted index = index, 맨뒤에 단어로 된 index 장을 의미함 

보통은 contents 중심으로 page를 알려주는데 inverted index는 단어를 중심으로 속하는 page를 알려주는 것

**단어를 중심으로 이 단어는 어느 문서에서 있는지를 거꾸로 파악하는 것 => inverted index**  

<br><br>

---

### Why is it important? 
- term의 frequency를 보겠다는 것, 해당 문서에 단어가 몇 개가 있는지 보겠다
- 차원축소 dimension reeuction 해주어야 함
-  -> parameter값을 바꾸면서 최적의 dim을 찾아야 하기 때문

**=> 데이터를 일련의 vector 값으로 바꿔주는 것이기 때문에 벡터로 유사도 분석을 할 수 있다. (벡터 유사도 분석)**

<br><br>

---

만약에 IBM이라는 단어 위주로 문서를 파악하면
단어가 공통적으로 있는 두 문서는 연관성이 있다는 것 
연관성 분석을 할 때 tf-idf로 할 수 있다 

안에서 복합적으로 분석을 하게 됨 

## Data Load & Preproecssing

In [6]:
import numpy as np
import pandas as pd
import os

In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [8]:
path = "gdrive/My Drive/Colab Notebooks/02_Test/data/"
os.listdir(path)

['stumble_upon_evergreen.tsv']

In [9]:
# data load
df = pd.read_table(path+"stumble_upon_evergreen.tsv")
df

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.079130,0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.203490,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,http://techcrunch.com/2010/09/08/kno-raises-46...,8958,"{""title"":""Kno Raises 46 Million More To Build ...",computer_internet,0.651067,3.010526,0.474747,0.222222,0.191919,0.191919,0.474273,0.0,0,0.177043,0,0.256669,0.048780,1,1,38,0,2219,99,11,0.040404,0.071429,0
7391,http://www.uncoached.com/category/why-i-miss-c...,8895,"{""title"":""Why I Miss College "",""body"":""Mar 30 ...",culture_politics,0.14192,2.208054,0.483333,0.246667,0.036667,0.026667,0.558184,0.0,0,0.057377,0,0.218014,0.225962,1,1,34,0,5672,300,4,0.020000,0.109453,0
7392,http://eatthis.menshealth.com/slide/sweet-pota...,1191,"{""title"":""Sweet Potatoes Eat This Not That i'...",recreation,0.196273,2.000000,0.315789,0.171053,0.105263,0.052632,0.692529,0.0,0,0.124122,0,0.248388,0.464286,?,1,43,0,848,76,5,0.434211,0.117647,1
7393,http://naturallyella.com/,5612,"{""title"":""Naturally Ella "",""body"":"" "",""url"":""n...",arts_entertainment,0.617876,1.026316,0.210526,0.052632,0.000000,0.000000,21.000000,-1.0,0,0.097778,0,0.256070,-1.000000,1,0,37,1,386,38,0,0.026316,0.333333,1


In [5]:
df['str'] = df['url'] + " " + df['boilerplate'] + " " + df['boilerplate']

df

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,str
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.079130,0,http://www.bloomberg.com/news/2010-12-23/ibm-p...
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.203490,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1,http://www.popsci.com/technology/article/2012-...
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1,http://www.menshealth.com/health/flu-fighting-...
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1,http://www.dumblittleman.com/2007/12/10-foolpr...
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0,http://bleacherreport.com/articles/1205138-the...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,http://techcrunch.com/2010/09/08/kno-raises-46...,8958,"{""title"":""Kno Raises 46 Million More To Build ...",computer_internet,0.651067,3.010526,0.474747,0.222222,0.191919,0.191919,0.474273,0.0,0,0.177043,0,0.256669,0.048780,1,1,38,0,2219,99,11,0.040404,0.071429,0,http://techcrunch.com/2010/09/08/kno-raises-46...
7391,http://www.uncoached.com/category/why-i-miss-c...,8895,"{""title"":""Why I Miss College "",""body"":""Mar 30 ...",culture_politics,0.14192,2.208054,0.483333,0.246667,0.036667,0.026667,0.558184,0.0,0,0.057377,0,0.218014,0.225962,1,1,34,0,5672,300,4,0.020000,0.109453,0,http://www.uncoached.com/category/why-i-miss-c...
7392,http://eatthis.menshealth.com/slide/sweet-pota...,1191,"{""title"":""Sweet Potatoes Eat This Not That i'...",recreation,0.196273,2.000000,0.315789,0.171053,0.105263,0.052632,0.692529,0.0,0,0.124122,0,0.248388,0.464286,?,1,43,0,848,76,5,0.434211,0.117647,1,http://eatthis.menshealth.com/slide/sweet-pota...
7393,http://naturallyella.com/,5612,"{""title"":""Naturally Ella "",""body"":"" "",""url"":""n...",arts_entertainment,0.617876,1.026316,0.210526,0.052632,0.000000,0.000000,21.000000,-1.0,0,0.097778,0,0.256070,-1.000000,1,0,37,1,386,38,0,0.026316,0.333333,1,"http://naturallyella.com/ {""title"":""Naturally ..."


In [10]:
df.dtypes

url                                object
urlid                               int64
boilerplate                        object
alchemy_category                   object
alchemy_category_score             object
avglinksize                       float64
commonlinkratio_1                 float64
commonlinkratio_2                 float64
commonlinkratio_3                 float64
commonlinkratio_4                 float64
compression_ratio                 float64
embed_ratio                       float64
framebased                          int64
frameTagRatio                     float64
hasDomainLink                       int64
html_ratio                        float64
image_ratio                       float64
is_news                            object
lengthyLinkDomain                   int64
linkwordscore                       int64
news_front_page                    object
non_markup_alphanum_characters      int64
numberOfLinks                       int64
numwords_in_url                   

In [11]:
df.columns

Index(['url', 'urlid', 'boilerplate', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio', 'label'],
      dtype='object')

In [12]:
# X_column에는 url을 사용하지 않고, boilerplate와 숫자 column을 사용한다 
# 그러나 가끔 dtype이 object로 되어 있는 경우(str) 나중에 integer, float으로 바꿔준다 
X_column = ['boilerplate', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio']
y_column = 'label'

In [13]:
# preprocessing data
# drop 하는 방식으로도 input_data를 준비할 수 있음 
input_data = df.drop(['url', 'urlid'], axis=1)  
input_data

Unnamed: 0,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.079130,0
1,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.203490,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,"{""title"":""Kno Raises 46 Million More To Build ...",computer_internet,0.651067,3.010526,0.474747,0.222222,0.191919,0.191919,0.474273,0.0,0,0.177043,0,0.256669,0.048780,1,1,38,0,2219,99,11,0.040404,0.071429,0
7391,"{""title"":""Why I Miss College "",""body"":""Mar 30 ...",culture_politics,0.14192,2.208054,0.483333,0.246667,0.036667,0.026667,0.558184,0.0,0,0.057377,0,0.218014,0.225962,1,1,34,0,5672,300,4,0.020000,0.109453,0
7392,"{""title"":""Sweet Potatoes Eat This Not That i'...",recreation,0.196273,2.000000,0.315789,0.171053,0.105263,0.052632,0.692529,0.0,0,0.124122,0,0.248388,0.464286,?,1,43,0,848,76,5,0.434211,0.117647,1
7393,"{""title"":""Naturally Ella "",""body"":"" "",""url"":""n...",arts_entertainment,0.617876,1.026316,0.210526,0.052632,0.000000,0.000000,21.000000,-1.0,0,0.097778,0,0.256070,-1.000000,1,0,37,1,386,38,0,0.026316,0.333333,1


In [14]:
input_data = df[X_column + [y_column]].copy()  #동일한 결과
input_data

# 단어중심 inverted index 
# IBM    0번데이터, 3데이터 (에 속해있다)
# Fully  1번데이터, 7390데이터, 7393데이터 (에 속해있다..)
# ->0 과 3번 데이터는 서로 연관성이 있다, 데이터끼리는 연관성이 있다 => 연관성 분석 가능 

# 2 words, 3 words, range로 묶어서 같은 것 끼리까지 보겠다 
# 3개까지 본다면 더 비슷한 것을 분석해 줄 것임, 붙은 단위 => ngram_range
# IBM Sees  
# title IBM

# IMB Sees enter
# title IBM Sees

Unnamed: 0,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.079130,0
1,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.203490,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1
2,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1
3,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1
4,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,"{""title"":""Kno Raises 46 Million More To Build ...",computer_internet,0.651067,3.010526,0.474747,0.222222,0.191919,0.191919,0.474273,0.0,0,0.177043,0,0.256669,0.048780,1,1,38,0,2219,99,11,0.040404,0.071429,0
7391,"{""title"":""Why I Miss College "",""body"":""Mar 30 ...",culture_politics,0.14192,2.208054,0.483333,0.246667,0.036667,0.026667,0.558184,0.0,0,0.057377,0,0.218014,0.225962,1,1,34,0,5672,300,4,0.020000,0.109453,0
7392,"{""title"":""Sweet Potatoes Eat This Not That i'...",recreation,0.196273,2.000000,0.315789,0.171053,0.105263,0.052632,0.692529,0.0,0,0.124122,0,0.248388,0.464286,?,1,43,0,848,76,5,0.434211,0.117647,1
7393,"{""title"":""Naturally Ella "",""body"":"" "",""url"":""n...",arts_entertainment,0.617876,1.026316,0.210526,0.052632,0.000000,0.000000,21.000000,-1.0,0,0.097778,0,0.256070,-1.000000,1,0,37,1,386,38,0,0.026316,0.333333,1


In [15]:
# 임시로 원본 복사 
df2 = df.copy()

## CounterVectorizer

In [16]:
# CountVectorizer 쓰기 위해서는 빈칸으로 split 단위 변경
# boilerpalce 컬럼이 만약 
# aaa_bbb-cc  ddd ~  
# 위와 같은 형식이면 인식 x -> 무조건 countervectorizer는 빈칸으로 split 단위를 변경해주어야 함, split 단위로 단어를 인식함 

# 아래와 같이 단어 빈도수로 sorting 한 후 상위 200개의 단어만 보겠다 -> max_feature
#  aaa   10000
#  bbb    9999
#  ccc    8800
# ...

#  zzzzz       201
#  zzzzz2       202


from sklearn.feature_extraction.text import CountVectorizer

# 이정도 option이면 충분
cv = CountVectorizer(
        max_features=10,       # word count 순 top 200 개만 생성 (10개만 봐도 됨)
        stop_words='english')  # 불용어 처리  a, an, the 

features = cv.fit_transform(df2['boilerplate'])

# sparse matrix 여서 numpy.ndarray 원 형태로 보기 위해 todense() 사용
features = pd.DataFrame(features.todense())
features

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1,0,0,1,1,0,0,1,1,1
1,1,0,0,2,3,1,0,1,1,1
2,2,0,1,0,0,0,0,0,1,1
3,4,0,0,1,0,3,0,1,1,1
4,1,3,0,5,3,1,0,4,1,1
...,...,...,...,...,...,...,...,...,...,...
7390,1,0,0,4,1,0,0,0,1,1
7391,2,0,0,2,3,0,0,1,1,1
7392,1,0,0,0,0,0,0,0,1,1
7393,1,0,0,0,0,0,0,0,1,1


## TF-IDF

In [17]:
# tf-idf
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    min_df=3,            # 최소한 단어가 3번은 나와야 포함 -> 보통은 (2,3)
    ngram_range=(1,2),   # 단어 묶음 1개 ~ 2개까지 인덱싱 -> 보통은 (1,2~3), 조금 더 정확성 높은 분석이 가능함, 하나하나 끼리 분석하는 것 보다는 비교하는 것이 많기 때문
    sublinear_tf=True)   # 컬럼마다 평균 분산으로 스케일링 처리, normalize
text = tfidf.fit_transform(input_data['boilerplate'])

In [18]:
text
#7395 행 x 171219 열 -> tf-idf로 분석했기 때문에 column 증가, 단어 묶음 2개짜리로 분석했기 때문 -> 차원 축소해야함  

<7395x171219 sparse matrix of type '<class 'numpy.float64'>'
	with 3015784 stored elements in Compressed Sparse Row format>

In [19]:
# dimensionality reduction
from sklearn.decomposition import TruncatedSVD

n_dims = 3  # -> (3, 4, 5) 높은성능을 보이는 경우가 많음, 2 ~10 개 실험 제일 좋은 dim 을 찾으면 좋습니다.
svd = TruncatedSVD(n_components=n_dims)
text_svd = svd.fit_transform(text)
text_svd  # 7395 x 3 

array([[ 0.19287898,  0.01057361,  0.1071925 ],
       [ 0.18719496,  0.00692093,  0.08302964],
       [ 0.12404277,  0.01296926,  0.05711351],
       ...,
       [ 0.08609493,  0.01154201,  0.02348485],
       [ 0.02655871,  0.28707138, -0.06038042],
       [ 0.0709375 ,  0.06684538,  0.23662348]])

- countervector : 단어 개수 분석
- tf-idf : 단어 개수 분석 + 데이터 벡터화 

나중에 더 단어 vectorizer를 정교하게 하는 방법은 Word Embedding이 된다. 

Word Embedding은 word에 내재된 의미까지 분석해서 vector로 바꿔주는 방식으로 단순 frequency를 바탕으로 vector화 하는 tf-idf보다는 더 좋은 성능을 보여줄 것이다. 

In [20]:
# 단어 개수 분석 : CountVectorizer, TF-IDF
# 데이터를 벡터화: TF-IDF, (CountVectorizer는 x) 
# 나중에 더 벡터화를 정교하게 하는 방법: Word Embedding

In [21]:
text_svd = pd.DataFrame(text_svd)
text_svd = text_svd.add_prefix('boilerplate_')
text_svd

# TF-IDF 분석을 통해서,  문자열을 --> 벡터로 해석해줌
# 0 번데이터  ---> <0.192879, 0.010573, 0.107206> 벡터로 vectorization
# 1 번데이터  ---> <0.187195, 0.006924, 0.083037> 벡터로 vectorization
# ...
# 7394 번데이터  ---> <0.070938, 0.066807,  0.236807> 벡터로 vectorization


# 벡터 Similarity 분석 (예를 들면)
# A데이터 < 1, 1, 0>
# B데이터 < 999 ,999, 999>
# C데이터  < 1, 0,0>
# A 랑 C 가 유사한거를 계산가능 vector distance

Unnamed: 0,boilerplate_0,boilerplate_1,boilerplate_2
0,0.192879,0.010574,0.107193
1,0.187195,0.006921,0.083030
2,0.124043,0.012969,0.057114
3,0.188728,0.004921,0.072861
4,0.198251,0.003650,0.083090
...,...,...,...
7390,0.172473,0.007495,0.083145
7391,0.162991,0.009130,0.065174
7392,0.086095,0.011542,0.023485
7393,0.026559,0.287071,-0.060380


In [22]:
input_data = pd.concat([input_data, text_svd], axis=1)
input_data

Unnamed: 0,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,boilerplate_0,boilerplate_1,boilerplate_2
0,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.079130,0,0.192879,0.010574,0.107193
1,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.203490,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1,0.187195,0.006921,0.083030
2,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1,0.124043,0.012969,0.057114
3,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1,0.188728,0.004921,0.072861
4,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0,0.198251,0.003650,0.083090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,"{""title"":""Kno Raises 46 Million More To Build ...",computer_internet,0.651067,3.010526,0.474747,0.222222,0.191919,0.191919,0.474273,0.0,0,0.177043,0,0.256669,0.048780,1,1,38,0,2219,99,11,0.040404,0.071429,0,0.172473,0.007495,0.083145
7391,"{""title"":""Why I Miss College "",""body"":""Mar 30 ...",culture_politics,0.14192,2.208054,0.483333,0.246667,0.036667,0.026667,0.558184,0.0,0,0.057377,0,0.218014,0.225962,1,1,34,0,5672,300,4,0.020000,0.109453,0,0.162991,0.009130,0.065174
7392,"{""title"":""Sweet Potatoes Eat This Not That i'...",recreation,0.196273,2.000000,0.315789,0.171053,0.105263,0.052632,0.692529,0.0,0,0.124122,0,0.248388,0.464286,?,1,43,0,848,76,5,0.434211,0.117647,1,0.086095,0.011542,0.023485
7393,"{""title"":""Naturally Ella "",""body"":"" "",""url"":""n...",arts_entertainment,0.617876,1.026316,0.210526,0.052632,0.000000,0.000000,21.000000,-1.0,0,0.097778,0,0.256070,-1.000000,1,0,37,1,386,38,0,0.026316,0.333333,1,0.026559,0.287071,-0.060380


In [23]:
# 원본 data drop
input_data = input_data.drop(['boilerplate'], axis=1)  
input_data  # 데이터에 ? 있음 -> 처리 필요 (alchemy_category, alchemy_category_score 컬럼 등)

Unnamed: 0,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label,boilerplate_0,boilerplate_1,boilerplate_2
0,business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,0.443783,0.0,0,0.090774,0,0.245831,0.003883,1,1,24,0,5424,170,8,0.152941,0.079130,0,0.192879,0.010574,0.107193
1,recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,0.468649,0.0,0,0.098707,0,0.203490,0.088652,1,1,40,0,4973,187,9,0.181818,0.125448,1,0.187195,0.006921,0.083030
2,health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,0.525448,0.0,0,0.072448,0,0.226402,0.120536,1,1,55,0,2240,258,11,0.166667,0.057613,1,0.124043,0.012969,0.057114
3,health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,0.480725,0.0,0,0.095861,0,0.265656,0.035343,1,0,24,0,2737,120,5,0.041667,0.100858,1,0.188728,0.004921,0.072861
4,sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,0.446143,0.0,0,0.024908,0,0.228887,0.050473,1,1,14,0,12032,162,10,0.098765,0.082569,0,0.198251,0.003650,0.083090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7390,computer_internet,0.651067,3.010526,0.474747,0.222222,0.191919,0.191919,0.474273,0.0,0,0.177043,0,0.256669,0.048780,1,1,38,0,2219,99,11,0.040404,0.071429,0,0.172473,0.007495,0.083145
7391,culture_politics,0.14192,2.208054,0.483333,0.246667,0.036667,0.026667,0.558184,0.0,0,0.057377,0,0.218014,0.225962,1,1,34,0,5672,300,4,0.020000,0.109453,0,0.162991,0.009130,0.065174
7392,recreation,0.196273,2.000000,0.315789,0.171053,0.105263,0.052632,0.692529,0.0,0,0.124122,0,0.248388,0.464286,?,1,43,0,848,76,5,0.434211,0.117647,1,0.086095,0.011542,0.023485
7393,arts_entertainment,0.617876,1.026316,0.210526,0.052632,0.000000,0.000000,21.000000,-1.0,0,0.097778,0,0.256070,-1.000000,1,0,37,1,386,38,0,0.026316,0.333333,1,0.026559,0.287071,-0.060380


In [24]:
# X_column 재정의
X_column = ['boilerplate_0', 'boilerplate_1', 'boilerplate_2', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio']
y_column = 'label'

In [25]:
# filling missing values
input_data = input_data.replace('?', -1)  # fillna 역할

In [26]:
# label encoding
# 여전히 존재하는 문자열 type에 대해서 label encoding으로 object type에 대해서 모두 fit_transfortm을 통해 int로 변환 
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
for i in input_data.columns[input_data.dtypes == object]:
    input_data[i] = le.fit_transform(list(input_data[i]))

In [27]:
input_data.dtypes

alchemy_category                    int64
alchemy_category_score              int64
avglinksize                       float64
commonlinkratio_1                 float64
commonlinkratio_2                 float64
commonlinkratio_3                 float64
commonlinkratio_4                 float64
compression_ratio                 float64
embed_ratio                       float64
framebased                          int64
frameTagRatio                     float64
hasDomainLink                       int64
html_ratio                        float64
image_ratio                       float64
is_news                             int64
lengthyLinkDomain                   int64
linkwordscore                       int64
news_front_page                     int64
non_markup_alphanum_characters      int64
numberOfLinks                       int64
numwords_in_url                     int64
parametrizedLinkRatio             float64
spelling_errors_ratio             float64
label                             

## Data Split

In [28]:
# 데이터 -> train/valid  ,  test
from sklearn.model_selection import train_test_split

tr_val_X, test_X, tr_val_y, test_y = train_test_split(
    input_data[X_column],   # X 입력데이터
    input_data[y_column],   # y 정답데이터
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=input_data[y_column]   # classification 일때는 중요한 옵션
    )

In [29]:
tr_val_data = pd.concat([tr_val_X, tr_val_y], axis=1)
tr_val_data

Unnamed: 0,boilerplate_0,boilerplate_1,boilerplate_2,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,compression_ratio,embed_ratio,framebased,frameTagRatio,hasDomainLink,html_ratio,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
5734,0.045793,0.017472,0.025178,3,4188,44.554054,0.885787,0.756345,0.637056,0.487310,0.516340,-1.0,0,0.041872,0,0.239231,0.179856,1,1,88,1,2368,394,9,0.781726,0.325581,0
3825,0.174869,-0.026568,-0.107883,8,952,1.982249,0.664706,0.276471,0.241176,0.235294,0.529276,0.0,0,0.035629,0,0.194085,0.247706,1,1,23,1,5073,170,4,0.264706,0.098655,1
6327,0.152915,-0.016128,-0.072933,8,3486,1.766816,0.669643,0.294643,0.120536,0.058036,0.334753,0.0,0,0.037092,0,0.177692,0.077586,1,1,43,1,2435,224,7,0.022321,0.056645,1
1335,0.037699,0.109797,0.388224,0,0,1.333333,0.351351,0.081081,0.027027,0.000000,0.596774,0.0,0,0.061093,0,0.256352,0.724138,0,0,18,0,931,37,7,0.108108,0.043478,0
5644,0.157327,0.011385,0.080698,11,876,2.892950,0.752336,0.387850,0.149533,0.084112,0.503015,0.0,0,0.032680,0,0.265150,0.057143,1,1,52,1,4941,428,2,0.086449,0.167598,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1464,0.157606,-0.016660,-0.081437,4,1474,3.000000,0.435897,0.320513,0.198718,0.185897,0.450055,0.0,0,0.038772,0,0.242902,0.234875,1,1,29,1,5244,156,11,0.032051,0.084175,0
4302,0.213511,-0.014040,-0.012865,1,2862,1.351648,0.402299,0.126437,0.106322,0.094828,0.503106,0.0,0,0.019722,0,0.178371,0.158358,1,1,27,1,4396,348,3,0.204023,0.106195,1
6446,0.113467,0.003477,-0.021167,0,0,1.880952,0.710145,0.173913,0.094203,0.036232,0.405585,0.0,0,0.040730,0,0.207611,0.065041,0,1,14,1,7066,138,9,0.282609,0.102740,1
3062,0.074616,0.017380,0.008443,8,3017,0.769231,0.074074,0.000000,0.000000,0.000000,0.636190,0.0,0,0.121739,0,0.319305,0.121622,0,0,67,1,54,27,1,0.000000,0.086420,0


In [30]:
# 데이터 -> train/valid  ,  test
from sklearn.model_selection import train_test_split

train_X, valid_X, train_y, valid_y = train_test_split(
    tr_val_data[X_column],   # X 입력데이터
    tr_val_data[y_column],   # y 정답데이터
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=tr_val_data[y_column]   # classification 일때는 중요한 옵션
    )

In [31]:
print(len(train_X))
print(len(valid_X))
print(len(test_X))
len(train_X) + len(valid_X) + len(test_X) # 20

4732
1184
1479


7395

## Modeling - XGBClassifier

In [32]:
# train model
from xgboost import XGBClassifier

xgb = XGBClassifier(learning_rate=0.01,
    max_depth=8,
    colsample_bytree=0.8,
    n_estimators=1000)
xgb.fit(train_X, train_y, eval_set=[(valid_X, valid_y)])

[0]	validation_0-error:0.241554
[1]	validation_0-error:0.236486
[2]	validation_0-error:0.220439
[3]	validation_0-error:0.211149
[4]	validation_0-error:0.221284
[5]	validation_0-error:0.223818
[6]	validation_0-error:0.216216
[7]	validation_0-error:0.213682
[8]	validation_0-error:0.219595
[9]	validation_0-error:0.217061
[10]	validation_0-error:0.221284
[11]	validation_0-error:0.221284
[12]	validation_0-error:0.216216
[13]	validation_0-error:0.213682
[14]	validation_0-error:0.215372
[15]	validation_0-error:0.216216
[16]	validation_0-error:0.222973
[17]	validation_0-error:0.221284
[18]	validation_0-error:0.222973
[19]	validation_0-error:0.21875
[20]	validation_0-error:0.216216
[21]	validation_0-error:0.220439
[22]	validation_0-error:0.221284
[23]	validation_0-error:0.224662
[24]	validation_0-error:0.220439
[25]	validation_0-error:0.222128
[26]	validation_0-error:0.225507
[27]	validation_0-error:0.222973
[28]	validation_0-error:0.222128
[29]	validation_0-error:0.222973
[30]	validation_0-err

XGBClassifier(colsample_bytree=0.8, learning_rate=0.01, max_depth=8,
              n_estimators=1000)

In [34]:
# classification prediction (예측)
xgb_pred = xgb.predict(test_X)
#pred_y = model_rf.predict(test_X)
xgb_pred

array([0, 0, 1, ..., 1, 1, 1])

In [35]:
true_y = test_y
true_y

1543    1
3316    0
991     1
305     0
1980    0
       ..
7247    1
4854    1
6293    1
603     1
5803    1
Name: label, Length: 1479, dtype: int64

In [36]:
from sklearn.metrics import classification_report

print(classification_report(true_y, xgb_pred))

              precision    recall  f1-score   support

           0       0.75      0.83      0.79       720
           1       0.82      0.74      0.77       759

    accuracy                           0.78      1479
   macro avg       0.78      0.78      0.78      1479
weighted avg       0.78      0.78      0.78      1479



In [37]:
#y_column 'label'  0 1