## 데이터 불러오기

In [7]:
import pandas as pd

In [8]:
df = pd.read_csv("Documents/kaggle/amazon/Reviews.csv")

In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 568454 entries, 0 to 568453
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype 
---  ------                  --------------   ----- 
 0   Id                      568454 non-null  int64 
 1   ProductId               568454 non-null  object
 2   UserId                  568454 non-null  object
 3   ProfileName             568438 non-null  object
 4   HelpfulnessNumerator    568454 non-null  int64 
 5   HelpfulnessDenominator  568454 non-null  int64 
 6   Score                   568454 non-null  int64 
 7   Time                    568454 non-null  int64 
 8   Summary                 568427 non-null  object
 9   Text                    568454 non-null  object
dtypes: int64(5), object(5)
memory usage: 43.4+ MB


## 데이터 전처리

### Sentiment 생성

In [10]:
# 4,5점은 1로 분류, 1,2점은 0으로 분류, 그밖에 3은 'middle'로 분류 (제거 예정)
df['Sentiment'] = df['Score'].apply(lambda score: "1" if score > 3 else("0" if score < 3 else "middle"))

In [11]:
# 개수 확인
df['Sentiment'].value_counts()

1         443777
0          82037
middle     42640
Name: Sentiment, dtype: int64

In [12]:
# 3점은 제거
df_index = df[df['Sentiment']=='middle'].index
df = df.drop(df_index)

In [13]:
#  재확인
df['Sentiment'].value_counts()

1    443777
0     82037
Name: Sentiment, dtype: int64

### Null값

In [14]:
# Null값 확인
df.isnull().sum()

Id                         0
ProductId                  0
UserId                     0
ProfileName               16
HelpfulnessNumerator       0
HelpfulnessDenominator     0
Score                      0
Time                       0
Summary                   25
Text                       0
Sentiment                  0
dtype: int64

In [15]:
# Null값 제거
df = df.dropna(axis=0)

In [16]:
# 재확인
df.isnull().sum()

Id                        0
ProductId                 0
UserId                    0
ProfileName               0
HelpfulnessNumerator      0
HelpfulnessDenominator    0
Score                     0
Time                      0
Summary                   0
Text                      0
Sentiment                 0
dtype: int64

### Sentiment 와 Text로 df1 생성

In [17]:
# Sentiment 와 Text만 따로 분리하여 df1 생성
# Sentiment ; 긍부정 여부(1 or 2)
# Text : 리뷰
# Summary도 있으나 더 많은 데이터 수집을 위해 Text를 사용
df1 = df.loc[:,['Sentiment', 'Text']]

In [18]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 525773 entries, 0 to 568453
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Sentiment  525773 non-null  object
 1   Text       525773 non-null  object
dtypes: object(2)
memory usage: 12.0+ MB


In [19]:
#분류 확인
df1.head()

Unnamed: 0,Sentiment,Text
0,1,I have bought several of the Vitality canned d...
1,0,Product arrived labeled as Jumbo Salted Peanut...
2,1,This is a confection that has been around a fe...
3,0,If you are looking for the secret ingredient i...
4,1,Great taffy at a great price. There was a wid...


### 특수문자, 기호 등 제거

In [20]:
#원문 확인
df1[:10]

Unnamed: 0,Sentiment,Text
0,1,I have bought several of the Vitality canned d...
1,0,Product arrived labeled as Jumbo Salted Peanut...
2,1,This is a confection that has been around a fe...
3,0,If you are looking for the secret ingredient i...
4,1,Great taffy at a great price. There was a wid...
5,1,I got a wild hair for taffy and ordered this f...
6,1,This saltwater taffy had great flavors and was...
7,1,This taffy is so good. It is very soft and ch...
8,1,Right now I'm mostly just sprouting this so my...
9,1,This is a very healthy dog food. Good for thei...


In [21]:
import re
from multiprocessing import Pool
import pandas as pd
import re
import time
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
from matplotlib import rc
import seaborn as sns

#### 특수문자 등 제거
- 특수문자를 제거했다고 해서 더 나아지거나 하지 않음(스킵 가능)

In [22]:
df1["Text"] = df1["Text"].str.replace(pat=r'[^\w]', repl=r' ', regex=True)

In [23]:
#제거 확인
df1[:50]

Unnamed: 0,Sentiment,Text
0,1,I have bought several of the Vitality canned d...
1,0,Product arrived labeled as Jumbo Salted Peanut...
2,1,This is a confection that has been around a fe...
3,0,If you are looking for the secret ingredient i...
4,1,Great taffy at a great price There was a wid...
5,1,I got a wild hair for taffy and ordered this f...
6,1,This saltwater taffy had great flavors and was...
7,1,This taffy is so good It is very soft and ch...
8,1,Right now I m mostly just sprouting this so my...
9,1,This is a very healthy dog food Good for thei...


#### 소문자로 변환

In [24]:
df1['Text'] = df1['Text'].str.lower()

In [25]:
#변환 확인
df1[:10]

Unnamed: 0,Sentiment,Text
0,1,i have bought several of the vitality canned d...
1,0,product arrived labeled as jumbo salted peanut...
2,1,this is a confection that has been around a fe...
3,0,if you are looking for the secret ingredient i...
4,1,great taffy at a great price there was a wid...
5,1,i got a wild hair for taffy and ordered this f...
6,1,this saltwater taffy had great flavors and was...
7,1,this taffy is so good it is very soft and ch...
8,1,right now i m mostly just sprouting this so my...
9,1,this is a very healthy dog food good for thei...


#### 불용어 : stopwords
- 시간이 아주 오래 걸림 : 10만개 추출시 1시간 이상 소요

In [26]:
#stopword엔 179개의 불용어가 존재함
stop_words_list = stopwords.words('english')
print('불용어 개수 :', len(stop_words_list))
print('불용어 10개 출력 :',stop_words_list[:10])

불용어 개수 : 179
불용어 10개 출력 : ['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [27]:
import nltk
from nltk.corpus import stopwords

In [28]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

In [29]:
def preprocess(x):
    x = [w for w in x.split() if w not in set(stopwords.words('english'))]  # remove stopwords
    return ' '.join(x)                                     # join the list

In [30]:
# 불용어 제거시 시간소요가 너무 많아서 10만개 이하로만 진행
# df2로 생성
df2 = df1[:100000]

In [31]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 108632
Data columns (total 2 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Sentiment  100000 non-null  object
 1   Text       100000 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [32]:
df2['Text'] = df2['Text'].apply(preprocess)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Text'] = df2['Text'].apply(preprocess)


In [33]:
df2[:10]

Unnamed: 0,Sentiment,Text
0,1,bought several vitality canned dog food produc...
1,0,product arrived labeled jumbo salted peanuts p...
2,1,confection around centuries light pillowy citr...
3,0,looking secret ingredient robitussin believe f...
4,1,great taffy great price wide assortment yummy ...
5,1,got wild hair taffy ordered five pound bag taf...
6,1,saltwater taffy great flavors soft chewy candy...
7,1,taffy good soft chewy flavors amazing would de...
8,1,right mostly sprouting cats eat grass love rot...
9,1,healthy dog food good digestion also good smal...


## 문장에서 단어들 분리
- 소문자 변환
- 특수문자와 불용어는 있는 상태로 진행

### NLTK, word stemming

In [34]:
# NLTK : Natural Language Toolkit(자연어 처리를 위한 다양한 API를 제공하고 있는 라이브러리)

In [35]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

porter = PorterStemmer()
stop = stopwords.words('english')

def tokenizer(text):
    return text.split()
def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

In [36]:
# 단어 분리하는 기법 2개 비교
text = 'The runner loves running and thus they run'
print(tokenizer(text))   # 단어 공백 기준으로 분리함
print(tokenizer_porter(text))   # 단어를 공백으로 분리 후 원형으로 분리하여 대체
                                # running -> run, loves-> love 등

['The', 'runner', 'loves', 'running', 'and', 'thus', 'they', 'run']
['the', 'runner', 'love', 'run', 'and', 'thu', 'they', 'run']


In [37]:
# 여기선 Word stemming 사용 (단어의 원형으로 대체하여 단어들을 분리)

In [38]:
from tokenize import tokenize, untokenize, NUMBER, STRING, NAME, OP

## 머신러닝 적용

In [39]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
import pickle
import os
from time import time

In [40]:
# 임의로 10만개의 데이터 中 1:3 비율로 train, test 데이터 분리

In [41]:
X_train = df1.loc[:150000, 'Text'].values
y_train = df1.loc[:150000, 'Sentiment'].values
X_test = df1.loc[:50000, 'Text'].values
y_test = df1.loc[:50000, 'Sentiment'].values

In [42]:
# CountVectorizer : 텍스트에서 단위별 등장횟수를 카운팅하여 수치벡터화
#                   -> 의미 없이 자주 사용되는 단어의 가중치의 증가
# TfidfVectorizer : TF-IDF라는 값을 사용하여 CountVectorizer의 단점을 보완함(간단한 수식 존재함)

In [43]:
tfidf = TfidfVectorizer(tokenizer= tokenizer_porter)

In [44]:
# pipeline의 역할 : 여러가지 전처리한 모델들을 한데 묶어서 fit

In [45]:
# 가중치의 절대값에 비례하는 penalty 추가 : l1 규제
# 가중치의 제곱에 비례하는 penalty 추가 : l2 규제

### LogisticRegression

In [61]:
Ir_tfidf = Pipeline([('vect',tfidf),('clf',LogisticRegression(C=10.0, penalty='l2',random_state=0))])

In [62]:
Ir_tfidf.fit(X_train, y_train)
y_pred = Ir_tfidf.predict(X_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [63]:
print('정확도: %.3f' %accuracy_score(y_test, y_pred))

정확도: 0.951


## 예측기 생성

In [76]:
label = {0:'부정적 의견', 1:'긍정적 의견'}

In [77]:
import numpy as np

In [None]:
label = {0:'부정적 의견', 1:'긍정적 의견'}
print('예측: 1(긍정적인 의견)')
print('예측: 0(부정적인 의견)')
print('')
while True:
    txt = input('영문으로 리뷰를 작성하세요: ')
    if txt =='':
        break
    example = [txt]
    print("예측: %s\n확률: %.3f%%" %(Ir_tfidf.predict(example)[0], np.max(Ir_tfidf.predict_proba(example))*100))

예측: 1(긍정적인 의견)
예측: 0(부정적인 의견)

영문으로 리뷰를 작성하세요: It was fast and kind.
예측: 1
확률: 98.816%
영문으로 리뷰를 작성하세요: It was terrible
예측: 0
확률: 99.990%
영문으로 리뷰를 작성하세요: This taffy is so good.
예측: 1
확률: 69.141%
영문으로 리뷰를 작성하세요: This offer is a great price and a great taste.
예측: 1
확률: 100.000%
영문으로 리뷰를 작성하세요: I'm unsatisfactory your parcel service.
예측: 0
확률: 72.070%
영문으로 리뷰를 작성하세요: 마음에 들지 않아요
예측: 1
확률: 84.102%
영문으로 리뷰를 작성하세요: I won't buy it again!
예측: 1
확률: 84.238%
영문으로 리뷰를 작성하세요: I don't like it
예측: 1
확률: 97.402%
