In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [2]:
imdb_data = pd.read_csv('IMDB Dataset.csv')
print(imdb_data.shape)
imdb_data.head()

(50000, 2)


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [3]:
imdb_data.describe() # 데이터셋 요약

Unnamed: 0,review,sentiment
count,50000,50000
unique,49582,2
top,Loved today's show!!! It was a variety and not...,negative
freq,5,25000


In [4]:
imdb_data['sentiment'].value_counts() # 문장 수 count

negative    25000
positive    25000
Name: sentiment, dtype: int64

In [5]:
# train, test 데이터셋 생성
train_reviews = imdb_data.review[:40000]
train_sentiments = imdb_data.sentiment[:40000]
test_reviews = imdb_data.review[40000:]
test_sentiments = imdb_data.sentiment[40000:]

print('-----train dataset-----')
print(train_reviews.shape)
print(train_sentiments.shape)
print('-----test dataset-----')
print(test_reviews.shape)
print(test_sentiments.shape)

-----train dataset-----
(40000,)
(40000,)
-----test dataset-----
(10000,)
(10000,)


In [6]:
# 텍스트 정규화
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

tokenizer = ToktokTokenizer()
stopword_list= nltk.corpus.stopwords.words('english')

In [7]:
# html 및 노이즈 텍스트 제거
from bs4 import BeautifulSoup
import re

def strip_html(text):
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

def remove_square_brackets(text):
    return re.sub('\[[^]]*\]', '', text)

def denoise_text(text):
    text = strip_html(text)
    text = remove_square_brackets(text)
    return text

In [8]:
imdb_data['review'] = imdb_data['review'].apply(denoise_text)
imdb_data['review']

0        One of the other reviewers has mentioned that ...
1        A wonderful little production. The filming tec...
2        I thought this was a wonderful way to spend ti...
3        Basically there's a family where a little boy ...
4        Petter Mattei's "Love in the Time of Money" is...
                               ...                        
49995    I thought this movie did a down right good job...
49996    Bad plot, bad dialogue, bad acting, idiotic di...
49997    I am a Catholic taught in parochial elementary...
49998    I'm going to have to disagree with the previou...
49999    No one expects the Star Trek movies to be high...
Name: review, Length: 50000, dtype: object

In [9]:
# 특수문자 제거
def remove_special_characters(text, remove_digits=True):
    pattern = r'[^a-zA-z0-9\s]'
    text = re.sub(pattern,'',text)
    return text

imdb_data['review'] = imdb_data['review'].apply(remove_special_characters)

In [10]:
#어간 추출

from nltk.stem.porter import PorterStemmer

def simple_stemmer(text):
    ps = nltk.porter.PorterStemmer()
    text = ' '.join([ps.stem(word) for word in text.split()])
    return text

imdb_data['review'] = imdb_data['review'].apply(simple_stemmer)

In [11]:
# 영어 불용어 제거

stop=set(stopwords.words('english'))
print(stop)

def remove_stopwords(text, is_lower_case=False):
    tokens = tokenizer.tokenize(text)
    tokens = [token.strip() for token in tokens]
    if is_lower_case:
        filtered_tokens = [token for token in tokens if token not in stopword_list]
    else:
        filtered_tokens = [token for token in tokens if token.lower() not in stopword_list]
    filtered_text = " ".join(filtered_tokens)    
    return filtered_text

imdb_data['review']=imdb_data['review'].apply(remove_stopwords)

{"you're", 'both', 'our', 'does', 'been', 'on', 'for', 'd', 'nor', 'these', 'hadn', 'more', 'there', 'after', 'yourselves', 'don', 'hasn', 'theirs', 'how', "shan't", 'by', 'over', 're', 'which', 'shouldn', 'out', 'in', 'too', 'have', 'doesn', 'an', "won't", 'if', 'myself', 'needn', 'y', 'what', 'doing', 'below', 'further', 'until', 'at', "you'd", 'had', 'you', 'herself', 's', "wasn't", 'her', "haven't", 'when', 'haven', 'during', 'couldn', "mightn't", 'them', 'not', 'hers', "should've", 'into', 'me', 'why', 'such', "isn't", 'should', 'and', 'of', 'no', 't', "that'll", 'where', 'ain', "hadn't", 'off', 'yours', 'few', 'having', 'with', 'yourself', 'am', 'through', 'most', 'isn', 'ourselves', "shouldn't", 'those', 'do', 'some', 'your', 'as', 'won', 'being', 'mightn', 'did', 'each', 'or', 'other', 'own', 'ours', 'whom', 'is', 'to', 'here', "it's", 'while', 'will', 'a', 'so', 'be', 'up', 'were', 'can', 'we', 'my', 'any', 'i', 'above', "couldn't", 'didn', 'now', 'before', "weren't", "you've"

In [12]:
# train review 정규화

train_reviews = imdb_data.review[:40000]
train_reviews[0]


'one review ha mention watch 1 Oz episod youll hook right thi exactli happen meth first thing struck Oz wa brutal unflinch scene violenc set right word GO trust thi show faint heart timid thi show pull punch regard drug sex violenc hardcor classic use wordit call OZ nicknam given oswald maximum secur state penitentari focus mainli emerald citi experiment section prison cell glass front face inward privaci high agenda Em citi home manyaryan muslim gangsta latino christian italian irish moreso scuffl death stare dodgi deal shadi agreement never far awayi would say main appeal show due fact goe show wouldnt dare forget pretti pictur paint mainstream audienc forget charm forget romanceoz doesnt mess around first episod ever saw struck nasti wa surreal couldnt say wa readi watch develop tast Oz got accustom high level graphic violenc violenc injustic crook guard wholl sold nickel inmat wholl kill order get away well manner middl class inmat turn prison bitch due lack street skill prison exp

In [45]:
test_reviews=imdb_data.review[40000:]
test_reviews[40000]

'first want say lean liber polit scale found movi offens manag watch whole doggon disgrac film thi movi bring low origin idea ye wa origin thu 2 star instead 1 film writer uncr onli come thi act wa horribl charact unlik part lead ladi stori good qualiti made bf sort bad guy see mayb miss someth knowh wa earth relev charact movi shell ani money thi garbag almost wish peta would come rescu thi aw offens movi form protest disgust say anymor'

In [14]:
#Bags of words 모델

from sklearn.feature_extraction.text import CountVectorizer

cv=CountVectorizer(min_df=0,max_df=1,binary=False,ngram_range=(1,3))

cv_train_reviews=cv.fit_transform(train_reviews)
cv_test_reviews=cv.transform(test_reviews)

print('BOW_cv_train:',cv_train_reviews.shape)
print('BOW_cv_test:',cv_test_reviews.shape)

BOW_cv_train: (40000, 6209086)
BOW_cv_test: (10000, 6209086)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tv = TfidfVectorizer(min_df=0, max_df=1, use_idf=True, ngram_range=(1,3))
#use_idf : TF-IDF를 사용해 피처를 만들 것인지 아니면 단어 빈도 자체를 사용할 것인지 여부

tv_train_reviews=tv.fit_transform(train_reviews)
tv_test_reviews=tv.transform(test_reviews)

print('Tfidf_train:',tv_train_reviews.shape)
print('Tfidf_test:',tv_test_reviews.shape)

Tfidf_train: (40000, 6209086)
Tfidf_test: (10000, 6209086)


In [18]:
#sentiment 라벨링

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

sentiment_data = lb.fit_transform(imdb_data['sentiment'])
print(sentiment_data)

[[1]
 [1]
 [1]
 ...
 [0]
 [0]
 [0]]


In [23]:
train_sentiments = sentiment_data[:40000]
test_sentiments = sentiment_data[40000:]

print('train_sentiments')
print(train_sentiments)
print('\ntest_sentiments')
print(test_sentiments)

train_sentiments
[[1]
 [1]
 [1]
 ...
 [1]
 [0]
 [0]]

test_sentiments
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [25]:
# 데이터셋 모델링

from sklearn.linear_model import LogisticRegression, SGDClassifier

lr = LogisticRegression(penalty='l2',max_iter=500, C=1, random_state=42)

lr_bow = lr.fit(cv_train_reviews,train_sentiments)
print(lr_bow)

lr_tfidf = lr.fit(tv_train_reviews,train_sentiments)
print(lr_tfidf)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)
LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)


In [31]:
lr_bow_predict = lr.predict(cv_test_reviews)
print('lr_bow_predict : ' , lr_bow_predict)

lr_tfidf_predict = lr.predict(tv_test_reviews)
print('tv_test_reviews : ', lr_tfidf_predict)

lr_bow_predict :  [0 0 0 ... 0 1 1]
tv_test_reviews :  [0 0 0 ... 0 1 1]


In [34]:
# 모델 정확도

from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

lr_bow_score=accuracy_score(test_sentiments,lr_bow_predict)
print("lr_bow_score :",lr_bow_score)

lr_tfidf_score=accuracy_score(test_sentiments,lr_tfidf_predict)
print("lr_tfidf_score :",lr_tfidf_score)

lr_bow_score : 0.7512
lr_tfidf_score : 0.75


In [41]:
print('classification report for bag of words')

lr_bow_report=classification_report(test_sentiments,lr_bow_predict,target_names=['Positive','Negative'])
print(lr_bow_report)
print('\nclassification report for tfidf features')
lr_tfidf_report=classification_report(test_sentiments,lr_tfidf_predict,target_names=['Positive','Negative'])
print(lr_tfidf_report)

classification report for bag of words
              precision    recall  f1-score   support

    Positive       0.75      0.75      0.75      4993
    Negative       0.75      0.75      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000


classification report for tfidf features
              precision    recall  f1-score   support

    Positive       0.74      0.77      0.75      4993
    Negative       0.76      0.73      0.75      5007

    accuracy                           0.75     10000
   macro avg       0.75      0.75      0.75     10000
weighted avg       0.75      0.75      0.75     10000

