In [1]:
import pandas as pd

### 데이터 로드

In [3]:
review_df=pd.read_csv("./data/word2vec-nlp-tutorial/labeledTrainData.tsv",header=0,sep="\t",quoting=3)
review_df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


- id : 각 데이터의 id
- sentiment : 영화평(review)의 Sentiment 결과 값(Target Label). 1은 긍정, 0은 부정
- review : 영화평 텍스트

#### 첫번째 영화평 확인

In [10]:
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally

- br 태그가 보임(삭제 필요)

### 데이터 전처리

```
 <br /> 태그 제거
```

In [12]:
review_df['review']=review_df['review'].str.replace('<br />','')
review_df['review'][0]

'"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ\'s feeling towards the press and also the obvious message of drugs are bad m\'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is only on for 2

#### 영어 문자열이 아닌 문자는 모두 공백으로 처리

In [13]:
import re
review_df['review']=review_df['review'].apply(lambda x:re.sub("[^a-zA-Z]"," ",x))

In [14]:
review_df['review'][0]

'WithallthisstuffgoingdownatthemomentwithMJivestartedlisteningtohismusicwatchingtheodddocumentaryhereandtherewatchedTheWizandwatchedMoonwalkeragainMaybeijustwanttogetacertaininsightintothisguywhoithoughtwasreallycoolintheeightiesjusttomaybemakeupmymindwhetherheisguiltyorinnocentMoonwalkerispartbiographypartfeaturefilmwhichiremembergoingtoseeatthecinemawhenitwasoriginallyreleasedSomeofithassubtlemessagesaboutMJsfeelingtowardsthepressandalsotheobviousmessageofdrugsarebadmkayVisuallyimpressivebutofcoursethisisallaboutMichaelJacksonsounlessyouremotelylikeMJinanywaythenyouaregoingtohatethisandfinditboringSomemaycallMJanegotistforconsentingtothemakingofthismovieBUTMJandmostofhisfanswouldsaythathemadeitforthefanswhichiftrueisreallyniceofhimTheactualfeaturefilmbitwhenitfinallystartsisonlyonforminutesorsoexcludingtheSmoothCriminalsequenceandJoePesciisconvincingasapsychopathicallpowerfuldruglordWhyhewantsMJdeadsobadisbeyondmeBecauseMJoverheardhisplansNahJoePescischaracterrantedthathewantedpeople

### 데이터 세트 및 피처 데이터 세트 생성

In [16]:
from sklearn.model_selection import train_test_split

# 레이블 추출
class_df = review_df['sentiment']

# 리뷰만 추출하여 feature 데이터 세트 생성
feature_df = review_df.drop(['id','sentiment'],axis=1,inplace=False)

In [17]:
X_train,X_test,y_train,y_test = train_test_split(feature_df,class_df,test_size=0.3,random_state=156)

X_train.shape,X_test.shape

((17500, 1), (7500, 1))

### 예측 성능

In [18]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,roc_auc_score

In [19]:
# 스톱 워드는 English, filtering, ngram은 (1,2)로 설정해 CountVectorization 수행

pipeline = Pipeline([
    ('cnt_vect',CountVectorizer(stop_words='english',ngram_range=(1,2))),
    ('lr_clf',LogisticRegression(C=10))
])

# Pipeline 객체를 이용해 fit(), predict()로 학습/예측 수행, predict_proba()는 roc_auc 때문에 수행
pipeline.fit(X_train['review'],y_train)
pred = pipeline.predict(X_test['review'])
pred_probs = pipeline.predict_proba(X_test['review'])[:,1]

print('예측 정확도는 {0: .4f}, ROC-AUC는 {1:.4f}'.format(accuracy_score(y_test,pred),roc_auc_score(y_test,pred_probs)))

예측 정확도는  0.4919, ROC-AUC는 0.5054
