In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only input directory and echo the full path of every file in it.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stumbleupon/test.tsv
/kaggle/input/stumbleupon/train.tsv
/kaggle/input/stumbleupon/sampleSubmission.csv
/kaggle/input/stumbleupon/raw_content.zip


In [2]:
# Load the tab-separated train and test splits in one pass.
train, test = (
    pd.read_table(tsv_path)
    for tsv_path in (
        "/kaggle/input/stumbleupon/train.tsv",
        "/kaggle/input/stumbleupon/test.tsv",
    )
)

In [3]:
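# Two broad ways to extract information from text: 1. machine learning (count-based) 2. deep learning (context-based)
# This notebook sticks to the machine-learning approach.
# 1.1 CountVectorizer - scores somewhat lower on this task (why? uninformative words end up with the highest counts). 1.2 TF-IDF
# CountVectorizer: plain raw term counts
# TF-IDF: down-weights words that keep appearing across all documents
# CountVectorizer vs TF-IDF reference: https://www.quora.com/What-is-the-difference-between-TfidfVectorizer-and-CountVectorizer-1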

In [4]:
############## Examples ###################
# CountVectorizer on a single document
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['you know I want your love. because I love you.']
vector = CountVectorizer()
counts = vector.fit_transform(corpus)  # learn the vocabulary and record each word's frequency
print(counts.todense())
print(vector.vocabulary_)  # shows which column index was assigned to each word

[[1 1 2 1 2 1]]
{'you': 4, 'know': 1, 'want': 3, 'your': 5, 'love': 2, 'because': 0}


In [5]:
# The collection of text data itself is called a corpus.

In [6]:
# Two or more documents: one row of counts per document
corpus = [
    'you know I want your love. because I love you.',
    "I don't know you. but I love you.",
]
vector = CountVectorizer()
counts = vector.fit_transform(corpus)  # learn the vocabulary and record each word's frequency
print(counts.todense())
print(vector.vocabulary_)  # shows which column index was assigned to each word

[[1 0 0 1 2 1 2 1]
 [0 1 1 1 1 0 2 0]]
{'you': 6, 'know': 3, 'want': 5, 'your': 7, 'love': 4, 'because': 0, 'don': 2, 'but': 1}


In [7]:
# Stop words can be listed by hand ...
from sklearn.feature_extraction.text import CountVectorizer

text=["Family is not an important thing. It's everything."]
custom_stop_words = ["the", "a", "an", "is", "not"]  # these words are dropped before encoding
vect = CountVectorizer(stop_words=custom_stop_words)
encoded = vect.fit_transform(text)
print(encoded.toarray())
print(vect.vocabulary_)

[[1 1 1 1 1]]
{'family': 1, 'important': 2, 'thing': 4, 'it': 3, 'everything': 0}


In [8]:
# ... or the stop-word list that ships with CountVectorizer can be used instead.
from sklearn.feature_extraction.text import CountVectorizer

text=["Family is not an important thing. It's everything."]
vect = CountVectorizer(stop_words="english")
encoded = vect.fit_transform(text)
print(encoded.toarray())
print(vect.vocabulary_)

# The printed vocabulary shows which words remain once the built-in stop words are removed.

[[1 1 1]]
{'family': 0, 'important': 1, 'thing': 2}


In [9]:
# Implementing TF-IDF by hand
import pandas as pd # for displaying the results as DataFrames
from math import log # for the IDF computation
docs = [
  '먹고 싶은 사과',
  '먹고 싶은 바나나',
  '길고 노란 바나나 바나나',
  '저는 과일이 좋아요'
]
# Vocabulary: every distinct whitespace-separated token in the corpus, sorted.
vocab = sorted({token for document in docs for token in document.split()})

In [10]:
# Flatten the corpus into a single list of whitespace-separated tokens (duplicates kept).
[w for doc in docs for w in doc.split()]

['먹고',
 '싶은',
 '사과',
 '먹고',
 '싶은',
 '바나나',
 '길고',
 '노란',
 '바나나',
 '바나나',
 '저는',
 '과일이',
 '좋아요']

In [11]:
# Print every whitespace-separated token of every document, one per line.
for doc in docs :
    for w in doc.split() :
        print(w)

먹고
싶은
사과
먹고
싶은
바나나
길고
노란
바나나
바나나
저는
과일이
좋아요


In [12]:
vocab   # sorted list of the distinct tokens found across all documents

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [13]:
# Computing TF
result = []
N = len(docs) # total number of documents

def tf(t, d): # tf(d,t): number of times term t occurs in document d.
    """Return how many times the token ``t`` occurs in the document ``d``.

    The document is split on whitespace and whole tokens are compared, so a
    term that is merely a substring of a longer word is not counted
    (``tf("app", "apple app")`` is 1, not 2).

    Args:
        t: the term to look for.
        d: the document, a whitespace-separated string.

    Returns:
        The number of tokens in ``d`` equal to ``t`` (0 when there are none).
    """
    return d.split().count(t)
def idf(t, documents=None): # idf(d, t): inversely related to df(t), the number of documents containing t.
    """Return the inverse document frequency ``log(N / (df + 1))`` of term ``t``.

    ``df`` is the number of documents that contain ``t`` as a whole
    whitespace-separated token; a term that only appears inside a longer word
    does not count. ``N`` is the number of documents.

    Args:
        t: the term to score.
        documents: optional list of document strings. When omitted, the
            notebook-level ``docs`` list is used.

    Returns:
        The IDF value as a float.
    """
    if documents is None:
        documents = docs
    df = sum(t in doc.split() for doc in documents)
    # The +1 in the denominator keeps the ratio defined when no document contains t.
    return log(len(documents) / (df + 1))
def tfidf(t, d): # tf(d,t) * idf(d, t)
    """Return the TF-IDF weight of term ``t`` in document ``d``: tf(t, d) times idf(t)."""
    term_frequency = tf(t, d)
    inverse_document_frequency = idf(t)
    return term_frequency * inverse_document_frequency

In [14]:
# Term-frequency table: one row per document, one column per vocabulary term.
# The rows are built from scratch inside this cell, so re-running it cannot
# append duplicate rows to a `result` list left over from an earlier cell.
result = [[tf(t, d) for t in vocab] for d in docs]
tf_ = pd.DataFrame(result, columns = vocab)
tf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0,0,0,1,0,1,1,0,0
1,0,0,0,1,1,0,1,0,0
2,0,1,1,0,2,0,0,0,0
3,1,0,0,0,0,0,0,1,1


In [15]:
# Computing IDF: one value per vocabulary term, shown as a single-column table.
result = [idf(term) for term in vocab]
idf_ = pd.DataFrame(result, index = vocab, columns = ["IDF"])
idf_

Unnamed: 0,IDF
과일이,0.693147
길고,0.693147
노란,0.693147
먹고,0.287682
바나나,0.287682
사과,0.693147
싶은,0.287682
저는,0.693147
좋아요,0.693147


In [16]:
# Building the TF-IDF table: one row per document, one column per vocabulary term.
result = [[tfidf(term, document) for term in vocab] for document in docs[:N]]
tfidf_ = pd.DataFrame(result, columns = vocab)
tfidf_

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
0,0.0,0.0,0.0,0.287682,0.0,0.693147,0.287682,0.0,0.0
1,0.0,0.0,0.0,0.287682,0.287682,0.0,0.287682,0.0,0.0
2,0.0,0.693147,0.693147,0.0,0.575364,0.0,0.0,0.0,0.0
3,0.693147,0.0,0.0,0.0,0.0,0.0,0.0,0.693147,0.693147


In [17]:
#################### End of examples #########################
# Reload both splits and stack them so text features are built over train and test together.
train, test = (
    pd.read_table(tsv_path)
    for tsv_path in (
        "/kaggle/input/stumbleupon/train.tsv",
        "/kaggle/input/stumbleupon/test.tsv",
    )
)
all_data = pd.concat([train, test])
all_data

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio,label
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,1,1,24,0,5424,170,8,0.152941,0.079130,0.0
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,...,1,1,40,0,4973,187,9,0.181818,0.125448,1.0
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,1,1,55,0,2240,258,11,0.166667,0.057613,1.0
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,...,1,0,24,0,2737,120,5,0.041667,0.100858,1.0
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,...,1,1,14,0,12032,162,10,0.098765,0.082569,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3166,http://busy-mommy.com/2012/02/peep-brownie-smo...,7264,"{""title"":""Peep Brownie S mores Busy Mommy An I...",?,?,1.666667,0.376623,0.129870,0.116883,0.090909,...,1,0,16,0,2772,77,3,0.012987,0.063401,
3167,http://www.cannabissearch.com/edibles/cheesecake/,9714,"{""url"":""cannabissearch edibles cheesecake"",""ti...",?,?,1.305556,0.654321,0.123457,0.024691,0.000000,...,1,0,6,0,6058,81,2,0.333333,0.061995,
3168,http://www.tastespotting.com/popular/views/all...,5903,"{""title"":""Most Viewed Submissions All Time mos...",?,?,0.717277,0.291667,0.182292,0.000000,0.000000,...,1,0,19,0,2876,192,4,0.177083,0.117647,
3169,http://lifehacker.com/5839197/how-to-get-a-ful...,3176,"{""title"":""How to Get a Complete Workout with N...",sports,0.424304,0.940000,0.183333,0.066667,0.016667,0.016667,...,1,1,3,0,21029,180,12,0.333333,0.111966,


In [18]:
# Look at one example: the raw boilerplate text of the first row.
all_data["boilerplate"].iloc[0]

'{"title":"IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries","body":"A sign stands outside the International Business Machines Corp IBM Almaden Research Center campus in San Jose California Photographer Tony Avelar Bloomberg Buildings stand at the International Business Machines Corp IBM Almaden Research Center campus in the Santa Teresa Hills of San Jose California Photographer Tony Avelar Bloomberg By 2015 your mobile phone will project a 3 D image of anyone who calls and your laptop will be powered by kinetic energy At least that s what International Business Machines Corp sees in its crystal ball The predictions are part of an annual tradition for the Armonk New York based company which surveys its 3 000 researchers to find five ideas expected to take root in the next five years IBM the world s largest provider of computer services looks to Silicon Valley for input gleaning many ideas from its Almaden research center in San Jose

In [19]:
# CountVectorizer on the real data
from sklearn.feature_extraction.text import CountVectorizer

all_data = pd.concat([train,test])
cv = CountVectorizer(stop_words="english")
text_cv = cv.fit_transform(all_data['boilerplate'])
text_cv # 10566: number of rows / 112443: number of words (columns) / 1604239: number of stored (non-zero) entries
# Every distinct word becomes a column, so the matrix is very wide.
# Sparse matrix: the overwhelming majority of entries are 0 and only a few are non-zero.

<10566x112443 sparse matrix of type '<class 'numpy.int64'>'
	with 1604239 stored elements in Compressed Sparse Row format>

In [20]:
# There are far too many columns, so reduce the dimensionality.
# PCA is the usual choice, but the author notes it does not apply well here:
# it does not work well on data stored as a sparse matrix.
# Alternative dimensionality-reduction technique: truncated SVD.
from sklearn.decomposition import TruncatedSVD
# random_state is fixed (same seed as the random forest below) so the
# randomized SVD solver yields the same components on every run.
svd = TruncatedSVD(n_components=10, random_state=1234) # library default is 2 components
text_cv_svd = svd.fit_transform(text_cv)
text_cv_svd = pd.DataFrame(text_cv_svd)

In [21]:
text_cv_svd
# 112443 columns -> reduced to 10 dimensions (columns 0-9)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.791452,0.068957,6.738637,0.383740,4.851918,-0.160044,-5.817199,-2.539243,-1.096687,0.728767
1,0.208572,0.042441,3.971589,-0.169409,2.019196,-0.055349,-3.715963,-1.949369,-0.547098,0.514779
2,0.562854,0.026670,2.435759,-0.453312,1.119426,0.168710,-3.365606,1.098551,0.188573,0.333942
3,0.222592,0.056051,5.309357,-0.805814,2.542797,-0.010479,-6.245590,-1.938782,-0.902599,0.802391
4,0.677782,0.132775,13.579625,-0.637657,7.279902,-0.375326,-7.671273,-4.678736,-2.818853,1.471533
...,...,...,...,...,...,...,...,...,...,...
10561,0.155965,0.044345,4.766776,-1.330140,-0.099949,0.063093,0.135859,-1.246473,0.252322,0.973119
10562,0.226808,0.085823,9.445367,-3.548540,-3.151721,-0.052373,3.978571,-1.411462,-0.159815,-1.332187
10563,0.958629,0.033919,3.078773,-0.696768,0.613691,-0.143718,-0.820087,2.262586,3.159019,1.217277
10564,2.684732,0.455502,43.359214,-6.032231,14.301714,5.033083,-39.113974,-12.463495,-7.035594,2.558320


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer # sklearn.feature_extraction has an image module as well as text
tfidf = TfidfVectorizer(min_df=4,ngram_range=(1, 2),sublinear_tf=True) # the three options worth setting at a minimum
# min_df (library default 1): minimum number of documents a term must appear in to be kept; the author found at least 3 is needed for meaningful results
# ngram_range (library default (1, 1)): how many consecutive words form one feature; (1, 2) keeps single words and two-word sequences
# sublinear_tf (library default False): when True the term frequency is replaced by 1 + log(tf), damping very frequent terms

text = tfidf.fit_transform(all_data["boilerplate"])

In [23]:
text  # 10566: number of rows / 166221: number of terms (columns) / 4259319: number of stored (non-zero) entries

<10566x166221 sparse matrix of type '<class 'numpy.float64'>'
	with 4259319 stored elements in Compressed Sparse Row format>

In [24]:
from sklearn.decomposition import TruncatedSVD
# Reduce the TF-IDF matrix to 10 dense components. random_state is fixed (same
# seed as the random forest below) so the features fed to the model, and hence
# the submission, are reproducible across runs.
svd = TruncatedSVD(n_components=10, random_state=1234) # library default is 2 components
text_svd = svd.fit_transform(text)
text_svd = pd.DataFrame(text_svd)

In [25]:
# Give the stacked frame a clean 0..n-1 index so it aligns row-for-row with text_svd,
# then append the SVD components as extra columns.
all_data = all_data.reset_index(drop=True)
# axis must be passed by keyword: pandas 2.x no longer accepts it positionally.
# NOTE(review): text_svd's column labels are the integers 0-9 while the other labels are
# strings; some scikit-learn versions reject mixed-type column names - confirm if upgrading.
all_data = pd.concat([all_data, text_svd], axis=1)

In [26]:
# Display the combined frame; the SVD components appear as the trailing columns 0-9.
all_data

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,0,1,2,3,4,5,6,7,8,9
0,http://www.bloomberg.com/news/2010-12-23/ibm-p...,4042,"{""title"":""IBM Sees Holographic Calls Air Breat...",business,0.789131,2.055556,0.676471,0.205882,0.047059,0.023529,...,0.194449,0.005235,0.084609,-0.094650,-0.031445,0.008833,-0.011136,-0.025988,0.066549,-0.024106
1,http://www.popsci.com/technology/article/2012-...,8471,"{""title"":""The Fully Electronic Futuristic Star...",recreation,0.574147,3.677966,0.508021,0.288770,0.213904,0.144385,...,0.188016,0.002702,0.063335,-0.079508,-0.022770,-0.010538,-0.016477,0.004191,0.020001,-0.008537
2,http://www.menshealth.com/health/flu-fighting-...,1164,"{""title"":""Fruits that Fight the Flu fruits tha...",health,0.996526,2.382883,0.562016,0.321705,0.120155,0.042636,...,0.125021,0.009351,0.043295,-0.056254,0.045958,-0.058625,0.061834,-0.028810,-0.001615,0.005609
3,http://www.dumblittleman.com/2007/12/10-foolpr...,6684,"{""title"":""10 Foolproof Tips for Better Sleep ""...",health,0.801248,1.543103,0.400000,0.100000,0.016667,0.000000,...,0.190655,0.001693,0.054320,-0.075772,0.005264,-0.058979,0.028077,0.042394,-0.006987,0.013973
4,http://bleacherreport.com/articles/1205138-the...,9006,"{""title"":""The 50 Coolest Jerseys You Didn t Kn...",sports,0.719157,2.676471,0.500000,0.222222,0.123457,0.043210,...,0.207039,0.000034,0.069111,-0.065836,-0.032561,0.017458,-0.050292,0.010379,-0.009556,-0.013124
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10561,http://busy-mommy.com/2012/02/peep-brownie-smo...,7264,"{""title"":""Peep Brownie S mores Busy Mommy An I...",?,?,1.666667,0.376623,0.129870,0.116883,0.090909,...,0.182872,-0.014819,-0.038749,0.039112,-0.033331,0.002480,0.004074,0.004591,-0.010705,0.006324
10562,http://www.cannabissearch.com/edibles/cheesecake/,9714,"{""url"":""cannabissearch edibles cheesecake"",""ti...",?,?,1.305556,0.654321,0.123457,0.024691,0.000000,...,0.218038,-0.034453,-0.108688,0.119526,-0.074611,-0.013505,0.010178,-0.034284,0.020590,-0.010074
10563,http://www.tastespotting.com/popular/views/all...,5903,"{""title"":""Most Viewed Submissions All Time mos...",?,?,0.717277,0.291667,0.182292,0.000000,0.000000,...,0.076611,0.015167,0.005281,0.015468,0.020216,0.090710,0.060609,0.007143,-0.017169,0.001327
10564,http://lifehacker.com/5839197/how-to-get-a-ful...,3176,"{""title"":""How to Get a Complete Workout with N...",sports,0.424304,0.940000,0.183333,0.066667,0.016667,0.016667,...,0.282922,-0.015433,0.060936,-0.096585,-0.006095,-0.118421,0.008446,0.060639,-0.003069,0.029675


In [27]:
# Drop identifier columns, the raw text and the target, leaving only model features.
# `columns=` replaces the positional axis argument, which pandas 2.x no longer accepts.
all_data2 = all_data.drop(columns=["url","urlid","boilerplate","label"])
# "?" marks a missing value in this dataset; encode it as -1.
all_data2 = all_data2.replace("?",-1) # to replace several values at once, pass a dict (braces)

In [28]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every object-dtype column, refitting the same encoder per column.
le = LabelEncoder()
object_columns = all_data2.columns[all_data2.dtypes == "object"]
for column in object_columns:
    all_data2[column] = le.fit_transform(list(all_data2[column]))

# The first len(train) rows came from the training file, the remainder from the test file.
n_train = len(train)
train2 = all_data2[:n_train]
test2 = all_data2[n_train:]

In [29]:
from sklearn.ensemble import RandomForestRegressor

# Fit a seeded random-forest regressor on the training label and score the test rows.
rf = RandomForestRegressor(random_state=1234, n_jobs=4)
result = rf.fit(train2, train["label"]).predict(test2)

In [30]:
# Fill the sample submission's label column with the predictions and write it without the index.
sub = pd.read_csv("/kaggle/input/stumbleupon/sampleSubmission.csv").assign(label=result)
sub.to_csv("submission.csv", index=False)