In [33]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import warnings
warnings.filterwarnings('ignore')
url = url = 'https://raw.githubusercontent.com/ukairia777/tensorflow-nlp-tutorial/main/10.%20RNN%20Text%20Classification/dataset/spam.csv'

In [34]:
# 데이터/텍스트 전처리
df = pd.read_csv(url, encoding='latin1')
df =df[['v1','v2']]
df.drop_duplicates(subset=['v2'],inplace=True)
df.v1 =df.v1.replace(['ham','spam'],[0,1])
df.v2 =df.v2.str.replace('[^A-Za-z]',' ',regex=True)
df.head(3)

Unnamed: 0,v1,v2
0,0,Go until jurong point crazy Available only ...
1,0,Ok lar Joking wif u oni
2,1,Free entry in a wkly comp to win FA Cup fina...


In [35]:
#TfidfVectorizer와 LogisticRegression을 이용하여 이진 분류를 하되
from sklearn.model_selection import train_test_split 
X_train, X_test, y_train, y_test = train_test_split(
    df.v2.values, df.v1.values, stratify=df.v1.values, test_size=0.2, random_state=2023
)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4135,), (1034,), (4135,), (1034,))

In [36]:
# params = {'CVECT__ngram_range': [(1,1), (1,2)],'RFC__max_depth': [2,5,8]}

In [37]:
tvect =TfidfVectorizer(stop_words='english')
rfc = RandomForestClassifier(random_state=2023)
pipeline = Pipeline([('CVECT', tvect), ('RFC', rfc)])
grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
%time grid_pipe.fit(X_train, y_train)

CPU times: total: 7.02 s
Wall time: 7.23 s


In [38]:
# params = {'RFC__max_depth': [16,18,20]}
# grid_pipe = GridSearchCV(pipeline, params, scoring='accuracy', cv=3)
# %time grid_pipe.fit(X_train, y_train)
# grid_pipe.best_params_

In [39]:
# grid_pipe.best_estimator_.score(X_test, y_test)

In [40]:
tvect.fit(X_train)
X_train_cv =tvect.transform(X_train) # transform  단어의 등장 횟수를 계산
X_test_cv =tvect.transform(X_test)
X_train_cv.shape,X_test_cv.shape

((4135, 6494), (1034, 6494))

In [41]:
from sklearn.linear_model import LogisticRegression
lrc =LogisticRegression(random_state=2023,max_iter=500)
%time lrc.fit(X_train_cv,y_train)

CPU times: total: 62.5 ms
Wall time: 62 ms


In [42]:
lrc.score(X_test_cv,y_test)

0.9458413926499033

In [43]:
cvect2 =CountVectorizer(stop_words='english',ngram_range=(1,2))
cvect2.fit(X_train)
X_train_cv2 =cvect2.transform(X_train) # transform  단어의 등장 횟수를 계산
X_test_cv2 =cvect2.transform(X_test)
X_train_cv2.shape,X_test_cv2.shape

((4135, 28822), (1034, 28822))

In [44]:
lrc2 =LogisticRegression(random_state=2023,max_iter=500)
%time lrc2.fit(X_train_cv2,y_train)

CPU times: total: 250 ms
Wall time: 286 ms


In [45]:
lrc2.score(X_test_cv2,y_test)

0.9680851063829787

#### 네이버 쇼핑 리뷰 

In [47]:
# 1. 데이터/텍스트 전처리
url ='https://raw.githubusercontent.com/bab2min/corpus/master/sentiment/naver_shopping.txt'
df =pd.read_table(url,names=['score','review'])
df.score =df.score.apply(lambda x : 1 if x >=4 else 0)
df.head()

Unnamed: 0,score,review
0,1,배공빠르고 굿
1,0,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고
2,1,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...
3,0,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...
4,1,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ


In [48]:
#### 2. 데이터 전처리
df.isna().sum().sum()
print(df.shape)
print(df.review.nunique())
df.drop_duplicates(subset=['review'],inplace=True)
df.review =df.review.str.replace('[^ㄱ-ㅎ ㅏ-ㅣ 가-힣 ]','',regex=True).str.strip()
df.isna().sum().sum()


(200000, 2)
199908


0

Okt를 사용하여 한글 형태소 분석을 하세요.(10)

In [50]:
with open('data/불용어.txt') as st:
    lines = st.readlines()
stop_words = [line.split('\t')[0] for line in lines]
from konlpy.tag import Okt
okt =Okt()

In [51]:
%%time
reviews =[]
for review in df.review:
    morphs =okt.morphs(review,stem=True)
    tmp = [word for word in morphs if word not in stop_words]
    reviews.append(' '.join(tmp))

CPU times: total: 9min 54s
Wall time: 9min 55s


In [52]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(
    reviews,df.score.values,stratify=df.score.values,random_state=2023
)

In [54]:
from sklearn.linear_model import LogisticRegression
pipe2 =Pipeline([
    ('tvect',CountVectorizer(ngram_range=(1,2),max_df =0.95)),
    ('lr',LogisticRegression(max_iter=500,random_state=2023)),

])
%time pipe2.fit(X_train,y_train)

CPU times: total: 51.5 s
Wall time: 46.4 s


In [55]:
pipe2.score(X_test,y_test)

0.8971526902375092