In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from konlpy.tag import Okt
import pandas as pd
import pickle

In [2]:
# Load the pre-tokenized Naver shopping reviews; the first CSV column becomes the index.
df = pd.read_csv("../../data/naver_shopping_tokenized.csv", encoding = 'utf-8', index_col = 0)

In [3]:
# Preview the first rows (columns shown: rating, text, y, tokenized).
df.head()

Unnamed: 0,rating,text,y,tokenized
0,5,배공빠르고 굿,1,배공
1,2,택배가 엉망이네용 저희집 밑에층에 말도없이 놔두고가고,0,택배 엉망
2,5,아주좋아요 바지 정말 좋아서2개 더 구매했어요 이가격에 대박입니다. 바느질이 조금 ...,1,아주 바지 정말 구매 가격 대박 바느질 조금 가성 최고
3,2,선물용으로 빨리 받아서 전달했어야 하는 상품이었는데 머그컵만 와서 당황했습니다. 전...,0,선물 전달 상품 머그컵 당황 바로 누락 확인 바로 선물 큰일 다시 생각
4,5,민트색상 예뻐요. 옆 손잡이는 거는 용도로도 사용되네요 ㅎㅎ,1,민트 색상 손잡이 도로 사용


In [3]:
from sklearn.model_selection import train_test_split

# Labels from column `y`, converted to a plain Python list.
y = df['y'].tolist()

# Hold out 20% of the raw review texts for testing; the fixed seed keeps the
# split reproducible across runs.
X_train_texts, X_test_texts, y_train, y_test = train_test_split(
    df['text'],
    y,
    test_size=0.2,
    random_state=0,
)

In [4]:
# Single shared tagger instance, reused by every tokenizer call.
okt = Okt()


def okt_tokenizer(text):
    """Tokenize ``text`` with the shared ``okt`` tagger.

    Returns the result of ``okt.morphs(text)`` unchanged.
    """
    return okt.morphs(text)

## DTM

In [5]:
# Document-term matrix over unigrams and bigrams of the Okt tokens; terms that
# occur in fewer than 3 training documents are dropped.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about the
# unused parameter on fit. The resulting features are unchanged.
vect = CountVectorizer(
    tokenizer=okt_tokenizer,
    token_pattern=None,
    min_df=3,
    ngram_range=(1, 2),
)
# Learn the vocabulary on the training split only, then reuse it for the test split.
X_train_tf = vect.fit_transform(X_train_texts)
X_test_tf = vect.transform(X_test_texts)



In [6]:
# Persist the fitted CountVectorizer so later cells can reload it.
# The pickle refers to `okt_tokenizer` by name, so that function must be
# defined again in any session that loads this file.
with open('saved_CounterVectorizer.pickle', 'wb') as pickle_file:
    pickle.dump(vect, pickle_file)

In [7]:
from sklearn.linear_model import LogisticRegression

# Logistic regression on the count features; fit() returns the estimator
# itself, so construction and training are chained.
lr_DTM = LogisticRegression(C=1, solver='liblinear', random_state=0).fit(X_train_tf, y_train)

# Predicted labels for the held-out split.
y_pred_DTM = lr_DTM.predict(X_test_tf)

In [8]:
# Persist the trained DTM classifier for the inference sections below.
with open('saved_LogisticRegression_DTM.pickle', 'wb') as pickle_file:
    pickle.dump(lr_DTM, pickle_file)

## TF-IDF

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features over unigrams and bigrams of the Okt tokens. Terms seen in
# fewer than 3 training documents, or in more than 90% of them, are dropped.
# token_pattern=None: the default regex pattern is never used once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn about the
# unused parameter on fit. The resulting features are unchanged.
tfidf_vectorizer = TfidfVectorizer(
    tokenizer=okt_tokenizer,
    token_pattern=None,
    ngram_range=(1, 2),
    min_df=3,
    max_df=0.9,
)
# fit_transform() is equivalent to fit() followed by transform() on the same
# texts, but tokenizes the training corpus once instead of twice.
tfidf_matrix_train = tfidf_vectorizer.fit_transform(X_train_texts)
tfidf_matrix_test = tfidf_vectorizer.transform(X_test_texts)



In [10]:
# Persist the fitted TfidfVectorizer so later cells can reload it.
# The pickle refers to `okt_tokenizer` by name, so that function must be
# defined again in any session that loads this file.
with open('saved_TfidfVectorizer.pickle', 'wb') as pickle_file:
    pickle.dump(tfidf_vectorizer, pickle_file)

In [11]:
# Logistic regression on the TF-IDF features (C=3.5 here, versus C=1 for the
# DTM model above).
lr_tfidf = LogisticRegression(
    C=3.5,
    solver='liblinear',
    random_state=0,
)
lr_tfidf.fit(tfidf_matrix_train, y_train)

In [12]:
# Persist the trained TF-IDF classifier for the inference sections below.
with open('saved_LogisticRegression_TFIDF.pickle', 'wb') as pickle_file:
    pickle.dump(lr_tfidf, pickle_file)

## 크롤링한 남성용 의류 데이터 감성분류(DTM)

In [13]:
# Crawled men's clothing reviews (file name suggests Musinsa); the first CSV column is the index.
man = pd.read_csv("./men_review_musinsa.csv", encoding = 'utf-8', index_col = 0)

In [14]:
# Summarize columns, dtypes and non-null counts.
man.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 9950 entries, 0 to 9949
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       9950 non-null   object
dtypes: object(1)
memory usage: 155.5+ KB


In [15]:
# Preview the first few reviews.
man.head()

Unnamed: 0,0
0,조아요조아요조아요조아요조아요조아요조아요
1,사이즈 선택에 있어서 교환도 하고 많이 힘들었지만 옷은 가격 대비 좋아요
2,원단이 잘 늘어나서 좋습니다 활동하는데 거슬리는게 없습니드
3,무탠다드 정말 인정!! 앞으로 자주 입을꺼 같앙!! 최고
4,할인해서 좋은상품 잘구매한것 같습니다~'' 이쁘고 깔금하고 사이즈도 딱 좋습니다~^^


In [16]:
# Reload the CountVectorizer pickled earlier in this notebook.
# NOTE: it was built with tokenizer=okt_tokenizer, and pickle stores plain
# functions by reference, so `okt_tokenizer` must already be defined in this
# session. Only unpickle files you trust.
with open('saved_CounterVectorizer.pickle','rb') as f:
    countvectorizer = pickle.load(f)

In [18]:
# Vectorize the review-text column (selected by position) with the vocabulary
# learned on the training split.
vectorized_review_m = countvectorizer.transform(man[man.columns[0]])

In [19]:
# Reload the DTM logistic-regression model saved earlier; this rebinds `lr_DTM`.
# Only unpickle files you trust.
with open('saved_LogisticRegression_DTM.pickle','rb') as f:
    lr_DTM = pickle.load(f)

In [21]:
# Predict a label for every men's review from its DTM features.
semantic_analysis_m = lr_DTM.predict(vectorized_review_m)

In [22]:
# Attach the DTM-based predictions as a new column.
man['semantic'] = semantic_analysis_m

In [23]:
# Count predictions per label; labels follow the training column `y`
# (the training preview shows y=1 for rating 5 and y=0 for rating 2).
man['semantic'].value_counts()

1    9088
0     862
Name: semantic, dtype: int64

In [33]:
# Give the columns descriptive names. rename() maps by label instead of
# overwriting `man.columns` positionally, so this cell can be re-run safely
# even after more columns have been added (a fixed 2-item list would then
# raise a length-mismatch error). Labels missing from the frame are ignored.
man = man.rename(columns={man.columns[0]: 'text', 'semantic': 'semantic_DTM'})

## 크롤링한 여성의류 데이터 감성분류(DTM)

In [24]:
# Crawled women's clothing reviews (file name suggests Auction); the first CSV column is the index.
woman = pd.read_csv("./women_review_auction.csv", encoding = 'utf-8', index_col =  0)

In [25]:
# Summarize columns, dtypes and non-null counts.
woman.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7924 entries, 0 to 7923
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   0       7924 non-null   object
dtypes: object(1)
memory usage: 123.8+ KB


In [27]:
# Preview the first few reviews.
woman.head()

Unnamed: 0,0
0,디자인 기장 문안하고 깔끔해요
1,단추나 지퍼가 없지만 가볍게 걸치긴 좋았어요 실내에서 자주 걸치고 있었는데 보시는 ...
2,너무 스키니함..;
3,예상보다 괜찮음!
4,그냥그래요


In [26]:
# Vectorize the review-text column (selected by position) with the
# CountVectorizer loaded in the men's section above.
vectorized_review_w = countvectorizer.transform(woman[woman.columns[0]])

In [28]:
# Predict a label for every women's review from its DTM features.
semantic_analysis_w = lr_DTM.predict(vectorized_review_w)

In [29]:
# Attach the DTM-based predictions as a new column.
woman['semantic'] = semantic_analysis_w

In [30]:
# Count predictions per label (same label encoding as the training column `y`).
woman['semantic'].value_counts()

1    5881
0    2043
Name: semantic, dtype: int64

In [34]:
# Give the columns descriptive names. rename() maps by label instead of
# overwriting `woman.columns` positionally, so this cell can be re-run safely
# even after more columns have been added (a fixed 2-item list would then
# raise a length-mismatch error). Labels missing from the frame are ignored.
woman = woman.rename(columns={woman.columns[0]: 'text', 'semantic': 'semantic_DTM'})

## 크롤링한 남성용 의류 데이터 감성분류(TF-IDF)

In [37]:
# Reload the TfidfVectorizer pickled earlier in this notebook.
# NOTE: it was built with tokenizer=okt_tokenizer, and pickle stores plain
# functions by reference, so `okt_tokenizer` must already be defined in this
# session. Only unpickle files you trust.
with open('saved_TfidfVectorizer.pickle','rb') as f:
    Tfidfvectorizer = pickle.load(f)

In [38]:
# TF-IDF features for the men's reviews; relies on the column already being
# renamed to `text`.
vectorized_review_m_2 = Tfidfvectorizer.transform(man['text'])

In [39]:
# Reload the TF-IDF logistic-regression model saved earlier; this rebinds `lr_tfidf`.
# Only unpickle files you trust.
with open('saved_LogisticRegression_TFIDF.pickle', 'rb') as f:
    lr_tfidf = pickle.load(f)

In [40]:
# Predict a label for every men's review from its TF-IDF features.
predict = lr_tfidf.predict(vectorized_review_m_2)

In [41]:
# Store the TF-IDF predictions next to the DTM ones for later comparison.
man['semantic_TFIDF'] = predict

In [42]:
# Count TF-IDF predictions per label.
man['semantic_TFIDF'].value_counts()

1    9067
0     883
Name: semantic_TFIDF, dtype: int64

## 크롤링한 여성의류 데이터 감성분류(TF-IDF)

In [43]:
# TF-IDF features for the women's reviews; relies on the column already being
# renamed to `text`.
vectorized_review_w_2 = Tfidfvectorizer.transform(woman['text'])

In [44]:
# Predict a label for every women's review from its TF-IDF features.
predict2 = lr_tfidf.predict(vectorized_review_w_2)

In [46]:
# Store the TF-IDF predictions next to the DTM ones for later comparison.
woman['semantic_TFIDF'] = predict2

In [47]:
# Count TF-IDF predictions per label.
woman['semantic_TFIDF'].value_counts()

1    5900
0    2024
Name: semantic_TFIDF, dtype: int64

**DTM과 TF-IDF 예측 결과 비교**

In [52]:
# Stack the men's and women's frames row-wise. concat keeps each frame's own
# index labels by default, so labels repeat across the two sources here.
df_merge = pd.concat([man, woman])

In [53]:
# Preview the combined frame (text plus both models' predictions).
df_merge.head()

Unnamed: 0,text,semantic_DTM,semantic_TFIDF
0,조아요조아요조아요조아요조아요조아요조아요,1,1
1,사이즈 선택에 있어서 교환도 하고 많이 힘들었지만 옷은 가격 대비 좋아요,0,0
2,원단이 잘 늘어나서 좋습니다 활동하는데 거슬리는게 없습니드,1,1
3,무탠다드 정말 인정!! 앞으로 자주 입을꺼 같앙!! 최고,1,1
4,할인해서 좋은상품 잘구매한것 같습니다~'' 이쁘고 깔금하고 사이즈도 딱 좋습니다~^^,1,1


In [60]:
# Discard the per-source index labels and renumber the combined rows from 0.
df_merge = df_merge.reset_index(drop=True)

In [63]:
# Agreement table between the two models: one row per (DTM label, TF-IDF label)
# pair, counting the non-null `text` entries in that group.
df_merge.groupby(['semantic_DTM', 'semantic_TFIDF']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,text
semantic_DTM,semantic_TFIDF,Unnamed: 2_level_1
0,0,2721
0,1,184
1,0,186
1,1,14783


In [64]:
# Export the combined predictions; the index is written as the first CSV column
# because `index` is left at its default.
df_merge.to_csv("man_and_woman_predict_semantic_analysis.csv", encoding= 'utf-8')