# 데이터 불러오기

## 원자료

In [1]:
import csv

In [2]:
# Read every row of the review CSV into a list.
# newline='' lets the csv module handle line endings itself, so quoted
# fields that contain line breaks are parsed as a single field.
with open('naver_review.csv', encoding='utf8', newline='') as f:
    w = csv.reader(f)
    next(w)  # discard the first row (presumably the header)
    reviews = list(w)

## TDM

In [14]:
import numpy

In [15]:
# Load the saved matrix from tdm.npy and convert it to nested Python lists.
tdm = numpy.load('tdm.npy').tolist()

## 단어 목록

In [16]:
# Read the word list: one entry per line of nouns.txt, without line endings.
with open('nouns.txt', encoding='utf8') as f:
    noun_list = f.read().splitlines()

# training set / test set

In [18]:
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    # scikit-learn releases before 0.18 only provide the old module.
    from sklearn.cross_validation import train_test_split

In [17]:
# Regression target: the first column of each review row, parsed as int.
stars = [int(r[0]) for r in reviews]

In [19]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(tdm, stars, test_size=0.2, random_state=42)

# Linear Model

In [21]:
from sklearn import linear_model

In [22]:
# Ordinary least-squares linear regression with no regularization.
lm = linear_model.LinearRegression()

In [23]:
lm.fit (X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

## 결과 보기

In [24]:
import operator

In [25]:
def get_important_words(model, positive=True, n=8, words=None):
    """Return the ``n`` words with the most extreme coefficients of ``model``.

    Each word is paired with the coefficient at the same position in
    ``model.coef_``.

    Args:
        model: Fitted estimator exposing a ``coef_`` sequence.
        positive: If true, the largest coefficients come first; otherwise
            the smallest (most negative) come first.
        n: Maximum number of pairs to return.
        words: Words aligned with ``model.coef_``. Defaults to the
            module-level ``noun_list``.

    Returns:
        A list of at most ``n`` ``(word, coefficient)`` tuples.
    """
    if words is None:
        words = noun_list
    # sorted() accepts any iterable, so the zip needs no intermediate list.
    ranked = sorted(
        zip(words, model.coef_),
        key=operator.itemgetter(1),
        reverse=positive,
    )
    return ranked[:n]

In [26]:
get_important_words(lm)

[('출연', 21.126843170480907),
 ('등장', 19.235296744786069),
 ('월광', 18.122933156886671),
 ('성도', 17.896875110404213),
 ('중심', 16.028111881973231),
 ('비교', 15.609078320947066),
 ('혼돈', 14.20742455763156),
 ('지릴뻔', 13.188010111982845)]

In [27]:
get_important_words(lm, False)

[('정체', -27.125871001383075),
 ('회수', -23.626443407381071),
 ('퀄리티', -19.124024482209162),
 ('흐트러진', -18.69698843098967),
 ('차지', -16.799749282869072),
 ('사탄', -15.082051508002426),
 ('당신', -14.072245447042762),
 ('개도', -13.796548352700984)]

In [28]:
lm.score(X_train, y_train)

0.62483192619025119

In [29]:
lm.score(X_test, y_test)

-1.9796774476682695

# Lasso regression

In [30]:
# Lasso: linear regression with an L1 penalty; alpha sets the penalty strength.
lasso = linear_model.Lasso(alpha=0.01)

In [31]:
lasso.fit (X_train, y_train)

Lasso(alpha=0.01, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=None,
   selection='cyclic', tol=0.0001, warm_start=False)

In [33]:
get_important_words(lasso)

[('현혹', 0.88713581696690991),
 ('소름', 0.84968514539862283),
 ('한국', 0.70021990178298676),
 ('완전', 0.67825442406668324),
 ('최고', 0.60862345479841662),
 ('한번', 0.60141433009348277),
 ('대박', 0.58340521981655247),
 ('나홍진', 0.54710065882888215)]

In [34]:
get_important_words(lasso, False)

[('쓰레기', -3.0157856495097417),
 ('최악', -2.8314911049859814),
 ('실망', -2.3545628264857945),
 ('진심', -2.1289169617709827),
 ('별로', -2.1019194392410894),
 ('평론가', -1.6022189468578882),
 ('스트레스', -1.5782762715387715),
 ('노잼', -1.4914868801606267)]

In [35]:
lasso.score(X_train, y_train)

0.22711072765475204

In [36]:
lasso.score(X_test, y_test)

0.16421563673695438

# Ridge Regression

In [37]:
# Ridge: linear regression with an L2 penalty; alpha sets the penalty strength.
ridge = linear_model.Ridge(alpha=10)

In [38]:
ridge.fit (X_train, y_train)

Ridge(alpha=10, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [39]:
get_important_words(ridge)

[('현혹', 1.0606360313575611),
 ('꿀잼', 1.0450710961021665),
 ('한국', 1.0184225079918061),
 ('완전', 0.92786384663445409),
 ('대박', 0.90258817555445758),
 ('소름', 0.8792755658565542),
 ('상영', 0.85843551105874594),
 ('오랜만', 0.78746575209006453)]

In [40]:
get_important_words(ridge, False)

[('최악', -2.1841099813407854),
 ('쓰레기', -2.1790751229751617),
 ('별로', -2.0450722209021839),
 ('실망', -1.9439436345422618),
 ('진심', -1.7678527031577969),
 ('노잼', -1.6406193126396249),
 ('스트레스', -1.6050366600736174),
 ('평론가', -1.5384221738750758)]

In [41]:
ridge.score(X_train, y_train)

0.39118958807486759

In [42]:
ridge.score(X_test, y_test)

0.18769503917159325

# LassoCV

In [43]:
# Lasso whose alpha is chosen by cross-validation (all settings left at defaults).
lassocv = linear_model.LassoCV()

In [44]:
lassocv.fit (X_train, y_train)

LassoCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
    max_iter=1000, n_alphas=100, n_jobs=1, normalize=False, positive=False,
    precompute='auto', random_state=None, selection='cyclic', tol=0.0001,
    verbose=False)

In [45]:
lassocv.alpha_

0.0044042722467688596

In [46]:
get_important_words(lassocv)

[('현혹', 1.1542006907606654),
 ('꿀잼', 1.0870515552945328),
 ('한국', 1.0249185904484286),
 ('완전', 0.98426458718318577),
 ('대박', 0.92037409166312578),
 ('소름', 0.87429295078965152),
 ('상영', 0.73455912085431208),
 ('오랜만', 0.70759770353490214)]

In [47]:
get_important_words(lassocv, False)

[('쓰레기', -4.3859818525450258),
 ('최악', -3.715644334789411),
 ('예수', -3.5464908051029314),
 ('진심', -2.9211490020643791),
 ('실망', -2.8420014535860054),
 ('평론가', -2.7031547321076626),
 ('점도', -2.6163250357422703),
 ('페이', -2.3182003406731408)]

In [35]:
lassocv.score(X_train, y_train)

0.32831600665376703

In [36]:
lassocv.score(X_test, y_test)

0.16900825071460457

# RidgeCV

In [48]:
# Ridge whose alpha is chosen by cross-validation from the listed candidates.
ridgecv = linear_model.RidgeCV(alphas=[.001, .01, .1, 1, 10, 100])

In [49]:
ridgecv.fit (X_train, y_train)

RidgeCV(alphas=[0.001, 0.01, 0.1, 1, 10, 100], cv=None, fit_intercept=True,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

In [50]:
ridgecv.alpha_

10.0

In [51]:
get_important_words(ridgecv)

[('현혹', 1.0854353691558585),
 ('꿀잼', 1.0710396336242118),
 ('한국', 1.0355944863167785),
 ('완전', 0.9481105798340711),
 ('대박', 0.91198377663873509),
 ('소름', 0.90322135536273074),
 ('상영', 0.85590110552366339),
 ('오랜만', 0.8013713597294102)]

In [52]:
get_important_words(ridgecv, False)

[('최악', -2.1723033264913294),
 ('쓰레기', -2.1690158639572172),
 ('별로', -2.02095203786768),
 ('실망', -1.9358181725786328),
 ('진심', -1.7500396804144158),
 ('노잼', -1.6294530240950911),
 ('스트레스', -1.6051898158445224),
 ('평론가', -1.5355451213566105)]

In [53]:
ridgecv.score(X_train, y_train)

0.39119389928724335

In [54]:
ridgecv.score(X_test, y_test)

0.1886052761888527

# Elastic Net

In [44]:
# Elastic net (mixed L1/L2 penalty); alpha and l1_ratio are chosen by
# cross-validation, with l1_ratio tried from 0.1 to 0.9 in steps of 0.1.
elastic = linear_model.ElasticNetCV(l1_ratio=numpy.arange(.1, 1.0, .1))

In [46]:
elastic.fit(X_train, y_train)

ElasticNetCV(alphas=None, copy_X=True, cv=None, eps=0.001, fit_intercept=True,
       l1_ratio=array([ 0.1,  0.2,  0.3,  0.4,  0.5,  0.6,  0.7,  0.8,  0.9]),
       max_iter=1000, n_alphas=100, n_jobs=1, normalize=False,
       positive=False, precompute='auto', random_state=None,
       selection='cyclic', tol=0.0001, verbose=0)

In [47]:
elastic.alpha_

0.0041454505711805917

In [48]:
elastic.l1_ratio_

0.40000000000000002

In [49]:
get_important_words(elastic)

[('현혹', 1.1417954841638336),
 ('꿀잼', 1.1078209495543085),
 ('한국', 1.0526229849609217),
 ('완전', 0.98976314958602352),
 ('대박', 0.95971897262172479),
 ('상영', 0.9156708687214834),
 ('소름', 0.88040995609757122),
 ('오랜만', 0.8061532529447677)]

In [50]:
get_important_words(elastic, False)

[('쓰레기', -2.9382010752555678),
 ('최악', -2.7683799989100377),
 ('실망', -2.3606284276357599),
 ('진심', -2.2018255403600566),
 ('별로', -2.1629749288301507),
 ('평론가', -1.9633450718613386),
 ('스트레스', -1.8238572339697352),
 ('노잼', -1.7803492039589577)]

In [51]:
elastic.score(X_train, y_train)

0.37185960289666742

In [52]:
elastic.score(X_test, y_test)

0.18370877610542646

# 읽을 거리

더 자세한 내용은 scikit-learn 홈페이지의 [Supervised Learning](http://scikit-learn.org/stable/supervised_learning.html) 참조