In [41]:
import pandas as pd
import numpy as np

## データ前処理

In [42]:
#生データ
df_all = pd.read_table("reviews_smaller")
df_all.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date
0,JP,38795574,RSZAJL84SRIAS,B004GFWRMY,900674814,Mobee The Magic Charger Wireless Charger for M...,PC,4,6,9,N,N,便利です！,MagicMouseは電池の持ちがあまり良くないので、画期的な商品と思い発売されてすぐ購入し...,2011-01-07
1,JP,8953696,RLOQPKXQ53WV9,B00F27CN92,178375143,ガーディアンズ 伝説の勇者たち [DVD],Video DVD,3,1,3,N,Y,作品は素晴らしいです,ブルーレイで出してほしいなってことで☆３つ<br />サンドマンのキラキラをぜひブルーレイで...,2014-10-18
2,JP,37479431,R3CYTOEWWCET7G,B00005QG9J,550594085,Morning View,Music,5,8,19,N,N,偏屈ものを魅せたふわふわパーマ,インキュバスと出会ったのがこのアルバムっていう人は多いと思います。僕もその一人です。 最初に...,2005-02-06
3,JP,35816213,R2S2P41DDTG3AX,B000K4X2UQ,336835114,Live in the Heart of the City,Music,3,1,4,N,Y,ロックだぜ！！,初期の曲はあまり知らないけど、世の中の人にしっかり初期の曲を聴いていただきたい！！<br /...,2015-03-22
4,JP,6774328,ROK4PFNN9GJWF,B00U22SZS0,262367368,10:00 P.M. - 11:00 A.M.,Digital_Video_Download,5,3,20,N,Y,・,オードリーが殺されたのがショック…<br /><br />今度はロシアに連れて行かれてエンデ...,2015-03-08


In [43]:
print(df_all.shape)

(10000, 15)


In [44]:
#POSitive
df_pos = df_all[df_all["star_rating"] == 5]
df_pos["label"] = "pos"

#NEGative
df_neg = df_all[df_all["star_rating"] == 1]
df_neg["label"] = "neg"

#POSをNEGと同数になるまで減らす
df_pos = df_pos.sample(df_neg.shape[0], random_state=334)

#pos + neg
df_mix = pd.concat([df_pos, df_neg])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_pos["label"] = "pos"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_neg["label"] = "neg"


In [45]:
#学習用とテスト用にわける
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df_mix, test_size=0.15, random_state=334)
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

In [46]:
#分布
print(train_df["label"].value_counts() / train_df.shape[0])

neg    0.513453
pos    0.486547
Name: label, dtype: float64


In [47]:
#ラベル（pos, neg)
y_train = train_df["label"]
y_valid = valid_df["label"]

y_train.shape, y_valid.shape

((892,), (158,))

### TFIDFを計算

In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from MeCab import Tagger

In [49]:
def tokenize(text):
    tagger = Tagger("-O wakati")
    return tagger.parse(text)

In [50]:
vectorizer = TfidfVectorizer(tokenizer=tokenize, ngram_range = (1,2 ), min_df=10, sublinear_tf=True)
tfidf_fit = vectorizer.fit(df_all["review_body"])
text_all = tfidf_fit.transform(df_all["review_body"])
text_train = tfidf_fit.transform(train_df["review_body"])
text_valid = tfidf_fit.transform(valid_df["review_body"])



## ロジスティック回帰

### 学習

In [51]:
from sklearn.linear_model import LogisticRegression

In [52]:
x_train = text_train.toarray()
x_valid = text_valid.toarray()

x_train.shape, x_valid.shape

((892, 12900), (158, 12900))

In [53]:
model = LogisticRegression(C=2, penalty="l2", solver="liblinear", dual=False, multi_class="ovr")
model.fit(x_train,y_train)
model.score(x_valid,y_valid)

0.8227848101265823

### 予測

In [54]:
label_all = model.predict(text_all)

In [55]:
df_all["predicted_label"] = label_all

df_all.head()

Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,predicted_label
0,JP,38795574,RSZAJL84SRIAS,B004GFWRMY,900674814,Mobee The Magic Charger Wireless Charger for M...,PC,4,6,9,N,N,便利です！,MagicMouseは電池の持ちがあまり良くないので、画期的な商品と思い発売されてすぐ購入し...,2011-01-07,neg
1,JP,8953696,RLOQPKXQ53WV9,B00F27CN92,178375143,ガーディアンズ 伝説の勇者たち [DVD],Video DVD,3,1,3,N,Y,作品は素晴らしいです,ブルーレイで出してほしいなってことで☆３つ<br />サンドマンのキラキラをぜひブルーレイで...,2014-10-18,neg
2,JP,37479431,R3CYTOEWWCET7G,B00005QG9J,550594085,Morning View,Music,5,8,19,N,N,偏屈ものを魅せたふわふわパーマ,インキュバスと出会ったのがこのアルバムっていう人は多いと思います。僕もその一人です。 最初に...,2005-02-06,pos
3,JP,35816213,R2S2P41DDTG3AX,B000K4X2UQ,336835114,Live in the Heart of the City,Music,3,1,4,N,Y,ロックだぜ！！,初期の曲はあまり知らないけど、世の中の人にしっかり初期の曲を聴いていただきたい！！<br /...,2015-03-22,pos
4,JP,6774328,ROK4PFNN9GJWF,B00U22SZS0,262367368,10:00 P.M. - 11:00 A.M.,Digital_Video_Download,5,3,20,N,Y,・,オードリーが殺されたのがショック…<br /><br />今度はロシアに連れて行かれてエンデ...,2015-03-08,neg


In [56]:
def get_ratio(df, stars_n, sentiment):
    return df[(df["predicted_label"]==sentiment) & (df["star_rating"]==stars_n)].shape[0] / df[df["star_rating"]==stars_n].shape[0]

result = pd.DataFrame(
    data = [[get_ratio(df_all, cnt, "pos"), get_ratio(df_all, cnt, "neg")]
                for cnt in range(1, 6)],
    index = [str(cnt)+"-stars" for cnt in range(1,6)],
    columns = ["pos", "neg"]
)

result

Unnamed: 0,pos,neg
1-stars,0.04,0.96
2-stars,0.294118,0.705882
3-stars,0.449102,0.550898
4-stars,0.661687,0.338313
5-stars,0.796445,0.203555
