## stanford Large Movie Review Dataset 구글 검색

In [1]:
import pyprind
import pandas as pd
import os

In [4]:
basepath = 'aclImdb'

labels = {'pos': 1, 'neg': 0}
pbar = pyprind.ProgBar(50000)
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(basepath, s, l)
        for file in sorted(os.listdir(path)):
            with open(os.path.join(path, file), 
                      'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], 
                           ignore_index=True)
            pbar.update()
df.columns = ['review', 'sentiment']

AttributeError: 'DataFrame' object has no attribute 'append'

In [5]:
df

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
49995,"Towards the end of the movie, I felt it was to...",0
49996,This is the kind of movie that my enemies cont...,0
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0


In [2]:
df.to_csv('movie_data.csv', index=False, encoding='utf-8')

NameError: name 'df' is not defined

In [6]:
# df = pd.read_csv('movie_data.csv')
# df

In [5]:
df = pd.read_csv('/Users/jjong/desktop/vscode/CWNU_ICT/movie_data.csv')
df

Unnamed: 0,review,sentiment
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1
...,...,...
49995,"Towards the end of the movie, I felt it was to...",0
49996,This is the kind of movie that my enemies cont...,0
49997,I saw 'Descent' last night at the Stockholm Fi...,0
49998,Some films that you pick up for a pound turn o...,0


# BoW (Bag of Word)
### 단어 등장 빈도정도 알아 볼 수 있다

In [8]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

sentence = ['The apple is delicious.', 'The banana is sweet.', 'One apple and two banana please.']
CntVec = CountVectorizer()
bag = CntVec.fit_transform(sentence)

In [9]:
CntVec.vocabulary_

{'the': 8,
 'apple': 1,
 'is': 4,
 'delicious': 3,
 'banana': 2,
 'sweet': 7,
 'one': 5,
 'and': 0,
 'two': 9,
 'please': 6}

In [13]:
tmp = bag.toarray()
tmp # TF의 행렬 문장은 3개, 단어는 총 10

array([[0, 1, 0, 1, 1, 0, 0, 0, 1, 0],
       [0, 0, 1, 0, 1, 0, 0, 1, 1, 0],
       [1, 1, 1, 0, 0, 1, 1, 0, 0, 1]], dtype=int64)

In [11]:
tmp.shape

(3, 10)

# 어떠한 단어가 중요한지 파악하기 위한
# TF-IDF (단어빈도 - 역문서빈도)

### TF(t, d) : 단어 t가 문서 d에 등장하는 횟수
### DF(t, d) : 단어 t가 포함된 문서 d의 수
### IDF(t, d) : n/(1+DF(t, d)) 0이 될 수 없게 하기 위해 +1
### TF-IDF : TF * IDF    다른 문서에는 잘 안 쓰이면서 현재 문서에서 많이 쓰인 단어

In [15]:
from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
tfidf.fit_transform(tmp).toarray() # 값이 크면 클수록 중요

array([[0.        , 0.45985353, 0.        , 0.60465213, 0.45985353,
        0.        , 0.        , 0.        , 0.45985353, 0.        ],
       [0.        , 0.        , 0.45985353, 0.        , 0.45985353,
        0.        , 0.        , 0.60465213, 0.45985353, 0.        ],
       [0.44036207, 0.3349067 , 0.3349067 , 0.        , 0.        ,
        0.44036207, 0.44036207, 0.        , 0.        , 0.44036207]])

### 위의 상태를 머신러닝 분류기로 긍정적 혹은 부정적인지 판별

In [16]:
X_train = df.loc[:25000, 'review'].values
y_train = df.loc[:25000, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [20]:
# 위의 두 번의 과정을 한 번에 하는 함수
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


tfidf = TfidfVectorizer(strip_accents=None, lowercase=False, preprocessor=None, stop_words=None, ngram_range=(1,1))
tfidf_lr = Pipeline([('tfidf',tfidf), ('lr', LogisticRegression(solver='liblinear', C=10, penalty='l2', random_state=0))])

In [21]:
tfidf_lr.fit(X_train, y_train)

In [22]:
p_test = tfidf_lr.predict(X_test)
p_test

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [31]:
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

print(f1_score(y_test, p_test))
print(accuracy_score(y_test, p_test) * 100,'%')
print(confusion_matrix(y_test, p_test))

0.878549214427077
87.972 %
[[11117  1383]
 [ 1624 10876]]
