In [1]:
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

Logistic Regression 로지스틱 회귀는, 선형회귀에 "로지스틱 함수"를 적용해서 0~1 사이의 확률값을 갖도록 한 것! 

In [2]:
# Input/output directories and the name of the pre-cleaned training file.
DATA_IN_PATH = "./data_in/"
DATA_OUT_PATH = "./data_out/"
TRAIN_CLEAN_DATA = 'train_clean.csv'

# Seed and hold-out fraction used by the train/eval split later on.
RANDOM_SEED, TEST_SPLIT = 42, 0.2

train_path = DATA_IN_PATH + TRAIN_CLEAN_DATA
train_data = pd.read_csv(train_path)

# Plain Python lists of the review texts and their sentiment labels.
reviews = train_data['review'].tolist()
sentiments = train_data['sentiment'].tolist()

In [3]:
# Character-level TF-IDF features over the training reviews.
# NOTE(review): the original annotation asked why character-level units are
# used here; the question is still open (a word-level variant is tried later).
# That annotation was a bare string literal placed between keyword arguments,
# which is a syntax error, so it now lives in this comment instead.
vectorizer = TfidfVectorizer(
    min_df=0.0,          # document-frequency cut-off; terms below it are dropped (0.0 keeps all)
    analyzer='char',     # build n-grams from characters ('word' is the alternative)
    sublinear_tf=True,   # sublinear tf scaling: replace tf with 1 + log(tf)
    ngram_range=(1, 3),  # use 1-grams through 3-grams
    max_features=5000,   # keep only the 5000 most frequent terms across the corpus
)

# Learn the vocabulary/idf weights and vectorize the reviews in one step.
x = vectorizer.fit_transform(reviews)
# Labels as a NumPy array, aligned row-for-row with x.
y = np.array(sentiments)

In [4]:
# Show the repr of the TF-IDF matrix returned by vectorizer.fit_transform(reviews).
x

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 17862871 stored elements in Compressed Sparse Row format>

In [7]:
# Print the stored entries of the sparse matrix as "(row, column)  value" lines.
print(x)

  (0, 2438)	0.019124393661517778
  (0, 147)	0.014386536461969144
  (0, 3451)	0.023470394899376395
  (0, 3291)	0.021033351664016636
  (0, 109)	0.013246698478899236
  (0, 524)	0.0189994752370753
  (0, 1982)	0.03647837010056725
  (0, 2500)	0.023983691074172406
  (0, 4142)	0.017298110578245086
  (0, 2330)	0.019720953426820724
  (0, 877)	0.024647733930689572
  (0, 1999)	0.019344769593043414
  (0, 227)	0.014537297382228958
  (0, 966)	0.017375417366561813
  (0, 3481)	0.0302673439409688
  (0, 4493)	0.03155689512192274
  (0, 4318)	0.03186899853807077
  (0, 2700)	0.027795734822283074
  (0, 4282)	0.020254183639864205
  (0, 4827)	0.029812871430277672
  (0, 1479)	0.02349975175760855
  (0, 3578)	0.020572702564291553
  (0, 2189)	0.02060619475383565
  (0, 1295)	0.028264926097956812
  (0, 4135)	0.017441826408691803
  :	:
  (24999, 1494)	0.06876971398869074
  (24999, 4087)	0.03497953491994339
  (24999, 4562)	0.042563559707579614
  (24999, 3402)	0.0302948277229394
  (24999, 614)	0.03792443066770522
  (24

In [5]:
# Vocabulary learned by the vectorizer, as a list of Python strings.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists and fall back otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    features = vectorizer.get_feature_names_out().tolist()
else:
    features = vectorizer.get_feature_names()
features



[' ',
 ' a',
 ' aa',
 ' ab',
 ' ac',
 ' ad',
 ' ae',
 ' af',
 ' ag',
 ' ah',
 ' ai',
 ' ak',
 ' al',
 ' am',
 ' an',
 ' ap',
 ' ar',
 ' as',
 ' at',
 ' au',
 ' av',
 ' aw',
 ' ax',
 ' az',
 ' b',
 ' b ',
 ' ba',
 ' bb',
 ' be',
 ' bi',
 ' bl',
 ' bo',
 ' br',
 ' bu',
 ' by',
 ' c',
 ' c ',
 ' ca',
 ' ce',
 ' cg',
 ' ch',
 ' ci',
 ' cl',
 ' co',
 ' cr',
 ' cu',
 ' cy',
 ' d',
 ' da',
 ' de',
 ' di',
 ' do',
 ' dr',
 ' du',
 ' dv',
 ' dw',
 ' dy',
 ' e',
 ' e ',
 ' ea',
 ' eb',
 ' ec',
 ' ed',
 ' ee',
 ' ef',
 ' eg',
 ' ei',
 ' el',
 ' em',
 ' en',
 ' ep',
 ' eq',
 ' er',
 ' es',
 ' et',
 ' eu',
 ' ev',
 ' ex',
 ' ey',
 ' f',
 ' f ',
 ' fa',
 ' fb',
 ' fe',
 ' fi',
 ' fl',
 ' fo',
 ' fr',
 ' fu',
 ' fx',
 ' g',
 ' g ',
 ' ga',
 ' ge',
 ' gh',
 ' gi',
 ' gl',
 ' go',
 ' gr',
 ' gu',
 ' gw',
 ' gy',
 ' h',
 ' h ',
 ' ha',
 ' hb',
 ' he',
 ' hi',
 ' hm',
 ' ho',
 ' hu',
 ' hy',
 ' i',
 ' ia',
 ' ic',
 ' id',
 ' ig',
 ' ii',
 ' il',
 ' im',
 ' in',
 ' ir',
 ' is',
 ' it',
 ' iv',
 ' j',
 ' j

In [6]:
# Print the character n-gram vocabulary as one flat list.
print(features)

[' ', ' a', ' aa', ' ab', ' ac', ' ad', ' ae', ' af', ' ag', ' ah', ' ai', ' ak', ' al', ' am', ' an', ' ap', ' ar', ' as', ' at', ' au', ' av', ' aw', ' ax', ' az', ' b', ' b ', ' ba', ' bb', ' be', ' bi', ' bl', ' bo', ' br', ' bu', ' by', ' c', ' c ', ' ca', ' ce', ' cg', ' ch', ' ci', ' cl', ' co', ' cr', ' cu', ' cy', ' d', ' da', ' de', ' di', ' do', ' dr', ' du', ' dv', ' dw', ' dy', ' e', ' e ', ' ea', ' eb', ' ec', ' ed', ' ee', ' ef', ' eg', ' ei', ' el', ' em', ' en', ' ep', ' eq', ' er', ' es', ' et', ' eu', ' ev', ' ex', ' ey', ' f', ' f ', ' fa', ' fb', ' fe', ' fi', ' fl', ' fo', ' fr', ' fu', ' fx', ' g', ' g ', ' ga', ' ge', ' gh', ' gi', ' gl', ' go', ' gr', ' gu', ' gw', ' gy', ' h', ' h ', ' ha', ' hb', ' he', ' hi', ' hm', ' ho', ' hu', ' hy', ' i', ' ia', ' ic', ' id', ' ig', ' ii', ' il', ' im', ' in', ' ir', ' is', ' it', ' iv', ' j', ' j ', ' ja', ' je', ' ji', ' jo', ' jr', ' ju', ' k', ' k ', ' ka', ' ke', ' kh', ' ki', ' kl', ' kn', ' ko', ' kr', ' ku', ' ky

In [8]:
# Print the label array built from the 'sentiment' column.
print(y)

[1 1 0 ... 0 0 1]


In [9]:
# Hold out TEST_SPLIT of the rows for evaluation; the fixed seed keeps the
# split repeatable.
x_train, x_eval, y_train, y_eval = train_test_split(
    x,
    y,
    test_size=TEST_SPLIT,
    random_state=RANDOM_SEED,
)

# Logistic regression with class weights balanced against class frequencies.
# fit() returns the estimator itself, so it can be chained onto construction.
lgs = LogisticRegression(class_weight='balanced').fit(x_train, y_train)
lgs

LogisticRegression(class_weight='balanced')

In [10]:
# Class predictions for the held-out rows (kept for inspection).
predicted = lgs.predict(x_eval)

# score() reports mean accuracy on the given data and labels.
eval_accuracy = lgs.score(x_eval, y_eval)
print("Accuracy: %f" % eval_accuracy)

Accuracy: 0.859800


In [11]:
# Load the pre-cleaned test reviews.
TEST_CLEAN_DATA = 'test_clean.csv'
test_path = DATA_IN_PATH + TEST_CLEAN_DATA
test_data = pd.read_csv(test_path)

# Reuse the already-fitted vectorizer (transform, not fit_transform) so the
# test columns line up with the training vocabulary.
test_reviews = test_data['review']
testDataVecs = vectorizer.transform(test_reviews)

# Predict a sentiment label for every test review and show the result.
test_predicted = lgs.predict(testDataVecs)
print(test_predicted)

[1 0 1 ... 0 1 0]


In [12]:
# Create the output directory if needed. exist_ok=True replaces the separate
# exists() check, so there is no gap between checking and creating.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# One row per test review: its id and the predicted sentiment label.
answer_dataset = pd.DataFrame({'id': test_data['id'],
                               'sentiment': test_predicted})

# quoting=3 is csv.QUOTE_NONE: fields are written without quote characters.
answer_dataset.to_csv(DATA_OUT_PATH + 'lgs_tfidf_answer.csv',
                      index=False, quoting=3)

캐글 제출: 정확도 0.85..

### Q. 만약 "단어" 단위 TfidfVectorizer를 쓴다면?

In [18]:
# Word-level variant of the TF-IDF + logistic regression pipeline.
# The step markers used to be bare string statements; one of them sat between
# keyword arguments of TfidfVectorizer(...), which is a syntax error. They are
# ordinary comments now.

# 1. Imports
import os
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# 2. Load the data
DATA_IN_PATH = "./data_in/"
DATA_OUT_PATH = "./data_out/"
TRAIN_CLEAN_DATA = 'train_clean.csv'
TEST_CLEAN_DATA = 'test_clean.csv'

RANDOM_SEED = 42
TEST_SPLIT = 0.2

train_data = pd.read_csv(DATA_IN_PATH + TRAIN_CLEAN_DATA)
# Loaded here too, so steps 10-12 do not rely on an earlier cell having run.
test_data = pd.read_csv(DATA_IN_PATH + TEST_CLEAN_DATA)

reviews = list(train_data['review'])
sentiments = list(train_data['sentiment'])

# 3. Configure the feature vectorizer
vectorizer2 = TfidfVectorizer(
    min_df=0.0,          # document-frequency cut-off; terms below it are dropped (0.0 keeps all)
    analyzer='word',     # build n-grams from words instead of characters
    sublinear_tf=True,   # sublinear tf scaling: replace tf with 1 + log(tf)
    ngram_range=(1, 3),  # 1-grams through 3-grams
    max_features=5000,   # keep only the 5000 most frequent terms across the corpus
)

# 4. Vectorize the training sentences with the configured vectorizer
x2 = vectorizer2.fit_transform(reviews)
y2 = np.array(sentiments)

# 5. Inspect the resulting vocabulary (capped at 5000 terms by max_features).
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer get_feature_names_out() when it exists and fall back otherwise.
if hasattr(vectorizer2, "get_feature_names_out"):
    features2 = vectorizer2.get_feature_names_out().tolist()
else:
    features2 = vectorizer2.get_feature_names()
print(features2)

# 6. Split into training and evaluation sets
x_t, x_e, y_t, y_e = train_test_split(
    x2, y2, test_size=TEST_SPLIT, random_state=RANDOM_SEED)

# 7. Configure the model
lr = LogisticRegression(class_weight='balanced')

# 8. Train the model on the (input, target) pairs of the training split
lr.fit(x_t, y_t)

# 9. Check accuracy on the evaluation split (0.887200 in the recorded run)
print("Accuracy: %f" % lr.score(x_e, y_e))

# 10. Vectorize the test data with the same, already-fitted vectorizer
testVecs = vectorizer2.transform(test_data['review'])

# 11. Predict the sentiment label of each test review with the trained model
test_predicted2 = lr.predict(testVecs)

# exist_ok=True replaces the separate exists() check before creating the folder.
os.makedirs(DATA_OUT_PATH, exist_ok=True)

# 12. Export the predictions (quoting=3 is csv.QUOTE_NONE)
answer_dataset2 = pd.DataFrame({'id': test_data['id'],
                                'sentiment': test_predicted2})
answer_dataset2.to_csv(DATA_OUT_PATH + 'word_based_lgs_tfidf_answer.csv',
                       index=False, quoting=3)





Accuracy: 0.887200


In [19]:
# Show the repr of the word-level TF-IDF matrix returned by vectorizer2.fit_transform(reviews).
x2

<25000x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 2059645 stored elements in Compressed Sparse Row format>

In [20]:
# Print the stored entries of the word-level matrix as "(row, column)  value" lines.
print(x2)

  (0, 2382)	0.10265920981525471
  (0, 4165)	0.09648781898709254
  (0, 3245)	0.08900961411204703
  (0, 2890)	0.10281741872302302
  (0, 461)	0.09702882253370304
  (0, 2472)	0.09927300094484445
  (0, 4436)	0.09449253751598115
  (0, 3244)	0.10101641058149785
  (0, 4956)	0.08914245586089238
  (0, 2658)	0.09758908287059301
  (0, 1531)	0.16744664931947062
  (0, 2423)	0.08376977960766763
  (0, 2091)	0.06223430347368898
  (0, 4249)	0.06165362714714171
  (0, 1456)	0.06713752013339999
  (0, 1272)	0.05832117939548892
  (0, 1467)	0.04926635009884251
  (0, 1180)	0.10073530215035871
  (0, 355)	0.06432251810634183
  (0, 1113)	0.05589190302617524
  (0, 4255)	0.07398864809196035
  (0, 1793)	0.06474157534099313
  (0, 252)	0.0691948478739135
  (0, 4829)	0.05862639824033324
  (0, 3306)	0.08430017646386166
  :	:
  (24999, 1698)	0.1326711292609395
  (24999, 3791)	0.1455149613650999
  (24999, 2920)	0.16147812149822305
  (24999, 2915)	0.06813247153441376
  (24999, 1836)	0.22896926686306524
  (24999, 4791)	0.06

In [40]:
# Inspect a single row of the word-level TF-IDF matrix.
# Indexing row 0 directly replaces the loop that broke after its first
# iteration; the counter that was initialised but never incremented is gone.
first_row = x2[0]
print(type(first_row))   # row slices of the matrix are sparse matrices too
print(first_row.shape)   # one row, one column per vocabulary term
print(first_row)         # the non-zero TF-IDF weights of this review

<class 'scipy.sparse.csr.csr_matrix'>
(1, 5000)
  (0, 2382)	0.10265920981525471
  (0, 4165)	0.09648781898709254
  (0, 3245)	0.08900961411204703
  (0, 2890)	0.10281741872302302
  (0, 461)	0.09702882253370304
  (0, 2472)	0.09927300094484445
  (0, 4436)	0.09449253751598115
  (0, 3244)	0.10101641058149785
  (0, 4956)	0.08914245586089238
  (0, 2658)	0.09758908287059301
  (0, 1531)	0.16744664931947062
  (0, 2423)	0.08376977960766763
  (0, 2091)	0.06223430347368898
  (0, 4249)	0.06165362714714171
  (0, 1456)	0.06713752013339999
  (0, 1272)	0.05832117939548892
  (0, 1467)	0.04926635009884251
  (0, 1180)	0.10073530215035871
  (0, 355)	0.06432251810634183
  (0, 1113)	0.05589190302617524
  (0, 4255)	0.07398864809196035
  (0, 1793)	0.06474157534099313
  (0, 252)	0.0691948478739135
  (0, 4829)	0.05862639824033324
  (0, 3306)	0.08430017646386166
  :	:
  (0, 2212)	0.08160271418193847
  (0, 1947)	0.15771035639714986
  (0, 4852)	0.07014331053091216
  (0, 2783)	0.0570727665933063
  (0, 2641)	0.037859759

In [41]:
# Size of the word-level vocabulary. The surrounding cells all inspect
# `features2`; `features` is the character-level vocabulary from earlier.
len(features2)

5000

In [50]:
# Term behind column 2382, the first entry listed for row 0 when x2 is printed.
features2[2382]

'know people'

In [44]:
# Term behind column 4165, the second entry listed for row 0 when x2 is printed.
features2[4165]

'stay away'

In [45]:
# Term behind column 3245, the third entry listed for row 0 when x2 is printed.
features2[3245]

'people like'

In [46]:
# Term behind column 2890, the fourth entry listed for row 0 when x2 is printed.
features2[2890]

'movie people'

In [47]:
# Term behind column 461, the fifth entry listed for row 0 when x2 is printed.
features2[461]

'bottom line'

In [51]:
# Look up the vocabulary term stored at index 4245.
features2[4245]

'stuff'

In [52]:
# Look up the vocabulary term stored at index 1852.
features2[1852]

'going'

In [53]:
# Look up the vocabulary term stored at index 2819.
features2[2819]

'moment'

In [54]:
# Look up the vocabulary term stored at index 635.
features2[635]

'certain'

In [55]:
a  = """
stuff going moment mj started listening music watching odd documentary watched wiz watched moonwalker maybe want get certain insight guy thought really cool eighties maybe make mind whether guilty innocent moonwalker part biography part feature film remember going see cinema originally released subtle messages mj feeling towards press also obvious message drugs bad kay visually impressive course michael jackson unless remotely like mj anyway going hate find boring may call mj egotist consenting making movie mj fans would say made fans true really nice actual feature film bit finally starts minutes excluding smooth criminal sequence joe pesci convincing psychopathic powerful drug lord wants mj dead bad beyond mj overheard plans nah joe pesci character ranted wanted people know supplying drugs etc dunno maybe hates mj music lots cool things like mj turning car robot whole speed demon sequence also director must patience saint came filming kiddy bad sequence usually directors hate working one kid let alone whole bunch performing complex dance scene bottom line movie people like mj one level another think people stay away try give wholesome message ironically mj bestest buddy movie girl michael jackson truly one talented people ever grace planet guilty well attention gave subject hmmm well know people different behind closed doors know fact either extremely nice stupid guy one sickest liars hope latter
"""

print(len(a))
print(len(a.split()))

1423
219


In [56]:
# Term behind column 3306, which also appears among the entries listed for row 0 of x2.
features2[3306]

'planet'

In [57]:
# List every vocabulary entry made of exactly three whitespace-separated
# words, i.e. the 3-grams that made it into the vocabulary.
trigram_terms = [term for term in features2 if len(term.split()) == 3]
for term in trigram_terms:
    print(term)

film ever seen
films ever seen
movie ever made
movie ever seen
movies ever seen
new york city
one worst movies
world war ii
worst film ever
worst movie ever
worst movies ever


In [58]:
# Check whether the single word 'bit' is part of the word-level vocabulary.
'bit' in features2

True

In [59]:
# Check whether the single word 'latter' is part of the word-level vocabulary.
'latter' in features2

True

In [60]:
# Print every token of the sample review that is not itself a vocabulary
# entry, in order of appearance (repeats included). Only single words are
# checked here, not the review's 2- or 3-grams.
# A set makes each membership test O(1) instead of scanning the whole
# vocabulary list for every token.
vocabulary = set(features2)
for w in a.split():
    if w not in vocabulary:
        print(w)

mj
wiz
moonwalker
moonwalker
biography
mj
kay
mj
mj
egotist
consenting
mj
excluding
pesci
psychopathic
mj
mj
overheard
nah
pesci
ranted
supplying
dunno
mj
mj
patience
saint
kiddy
mj
wholesome
mj
bestest
hmmm
closed
sickest
liars


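5000개의 features에 담기지 않은 review 문장들의 단어들(1그램~3그램)은 뭘까? 왤까? → `max_features=5000`으로 설정했기 때문에, 전체 말뭉치에서 등장 빈도(term frequency)가 높은 상위 5000개의 n-gram만 어휘로 남는다. 따라서 'mj', 'moonwalker'처럼 말뭉치 전체에서 드물게 등장하는 단어들은 벡터화 과정에서 제외된다.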