In [1]:
import pandas as pd

- 데이터 전처리

In [2]:
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
df.head()

Unnamed: 0,id,sentiment,review
0,5814_8,1,With all this stuff going down at the moment w...
1,2381_9,1,"\The Classic War of the Worlds\"" by Timothy Hi..."
2,7759_3,0,The film starts with a manager (Nicholas Bell)...
3,3630_4,0,It must be assumed that those who praised this...
4,9495_8,1,Superbly trashy and wondrously unpretentious 8...


In [7]:
df = pd.read_csv('data/labeledTrainData.tsv', sep='\t', quoting=3)
df.head()

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         25000 non-null  object
 1   sentiment  25000 non-null  int64 
 2   review     25000 non-null  object
dtypes: int64(1), object(2)
memory usage: 586.1+ KB


In [9]:
# 결측치 확인
df.isna().sum()

id           0
sentiment    0
review       0
dtype: int64

In [10]:
# 중복 확인
df.review.nunique()

24904

In [11]:
# 중복 제거
df.drop_duplicates(subset=['review'], inplace=True)
df.shape

(24904, 3)

- 텍스트 전처리

In [12]:
print(df.review[0][:1000])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when it finally sta

In [13]:
# <br />제거
df.review = df.review.str.replace('<br />', ' ')
print(df.review[0][:1000])

"With all this stuff going down at the moment with MJ i've started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again. Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent. Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released. Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.  Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring. Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.  The actual feature film bit when it finally starts is only on for 2

In [14]:
# 구둣점/소숫점 제거
df.review = df.review.str.replace('[^A-Za-z]', ' ', regex=True)

- 데이터셋 분리

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df.review.values, df.sentiment.values, stratify=df.sentiment.values, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((19923,), (4981,), (19923,), (4981,))

- 텍스트 인코딩

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvect = TfidfVectorizer(stop_words='english')
tvect.fit(X_train)
X_train_tv = tvect.transform(X_train)
X_test_tv = tvect.transform(X_test)
X_train_tv.shape, X_test_tv.shape

((19923, 66641), (4981, 66641))

- 학습과 평가

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(random_state=2023)
rfc.fit(X_train_tv, y_train)
rfc.score(X_test_tv, y_test)

0.8576591045974704

- bigram

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
tvect2 = TfidfVectorizer(stop_words='english', ngram_range=(1, 2))
tvect2.fit(X_train)
X_train_tv2 = tvect2.transform(X_train)
X_test_tv2 = tvect2.transform(X_test)
X_train_tv2.shape, X_test_tv2.shape

((19923, 1454639), (4981, 1454639))

In [20]:
from sklearn.ensemble import RandomForestClassifier
rfc2 = RandomForestClassifier(random_state=2023)

In [21]:
%time rfc2.fit(X_train_tv2, y_train)

CPU times: total: 19min 30s
Wall time: 19min 36s


In [22]:
rfc2.score(X_test_tv2, y_test)

0.8604697851836981

- 변환기/모델 저장/로드

In [23]:
import joblib

In [24]:
joblib.dump(tvect2, 'model/imdb_tvect_2.pkl')
joblib.dump(rfc2, 'model/imdb_rfc_2.pkl')

['model/imdb_rfc_2.pkl']

In [25]:
new_tvect = joblib.load('model/imdb_tvect_2.pkl')
new_rfc = joblib.load('model/imdb_rfc_2.pkl')

- 실제 데이터 검증

In [41]:
# 리스트 만들때 빈칸 허용 X
reviews = [
"""Aladdin was my favorite Disney movie growing up so I was worried that this live action version would get screwed up, but I think they did so well! I even liked the new song Speechless. My favorite part of the cartoon was Robin Williams as genie and those are big shoes to fill, and I really liked that Will Smith didn't try to be that version of the genie. There are times I do think he was phoning it in, but it was still good. I love that my kids are getting into Aladdin thanks to this live action movie.""",
"""The animation is one of my all time favourite Disney films. Full of laughs, heart, emotion and an incredible story. This was just a total mess. The lead actor is so incredibly wooden, and emotionless, the only good thing going for him is his fantastic dance moves (maybe stick to that from now on). Will Smith is good but forgettable. I totally get that he was trying to make it his own but I'm sorry nothing can out do Robin Williams, and unfortunately that's what this film would have needed to make it great, someone even better than the original. The actor that plays Jafar is next. AWFUL, not menacing enough and it makes for a very lack-lustre performance. Naomi Scott on the other hand was absolutely brilliant as Jasmine. Her voice when she sings Speechless gave me goosebumps. She is the only reason that I gave 2 stars instead of 1. Maybe Disney needs to stop the live action now, I'm really dreading The Lion King and Mulan now as they're both 2 more favourites. Fingers crossed."""
]

In [42]:
# 텍스트 전처리/ 구둣점 소숫점 제거
import re
reviews = map(lambda x: re.sub('[^A-Za-z]', ' ', x), reviews)

In [43]:
# load한 TfidfVectorizer 적용
reviews_tv = new_tvect.transform(reviews)
reviews_tv.shape

(2, 1454639)

In [44]:
reviews_tv.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [45]:
new_rfc.predict(reviews_tv)

array([1, 0], dtype=int64)