### 🔎 로지스틱 회귀
- 리뷰길이 StandardScaler 적용 + PCA(주성분 1,000개)

In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Load the review dataset and recode the '긍정부정' column from 'y'/'n' to 1/0.
df = (
  pd.read_csv('./shampoo_final.csv')
  .assign(**{'긍정부정': lambda frame: frame['긍정부정'].replace({'y': 1, 'n': 0})})
)

# Print the frame dimensions, then preview the first rows as the cell output.
print(df.shape)
df.head()

(10279, 4)


Unnamed: 0,리뷰,평점,긍정부정,리뷰길이
0,진짜 머리 감고 나면 너무 너무 간지러워 요상쾌함이 오래가지도 못하고 냄새도 빨리 나요,1,0,52
1,비듬 생겼어요 어쩐지간지럽더라 개나 샀는데 누구 주지도 못하고 아오,1,0,50
2,뚜껑 펌프 고장 났네요 아무리 눌러도 안 나와서 뚜껑 열고 사용하다 방치했어요,1,0,40
3,머리카락에 부담이 안 되고 머리 피부에 부담이 안 되니 좋죠,1,0,27
4,샴푸는 사용해보고 살수가 없으니 오랜 기간 동안 고민했고 리뷰들도 꼼꼼히 보았는데 ...,1,0,510


In [2]:
# Rows where the 0/1 flag in '긍정부정' and the star rating in '평점' disagree:
#   std1 - flag is 0 but the rating is 4 or 5
#   std2 - flag is 1 but the rating is 1, 2 or 3
std1 = df['긍정부정'].eq(0) & df['평점'].isin([4, 5])
std2 = df['긍정부정'].eq(1) & df['평점'].isin([1, 2, 3])

# Overwrite the rating of the disagreeing rows so it matches the flag.
df.loc[std1, '평점'] = 2
df.loc[std2, '평점'] = 4

# Class distribution after relabelling.
df['평점'].value_counts()

평점
4    3507
5    2662
2    1626
3    1324
1    1160
Name: count, dtype: int64

In [3]:
from sklearn.preprocessing import StandardScaler

# Standardize the review-length column; the scaler expects a 2-D
# (n_samples, 1) array, hence the reshape.
# NOTE(review): the scaler is fit on the whole frame before the train/test
# split further down, so test rows contribute to the fitted statistics -
# consider fitting on the training rows only.
ss = StandardScaler()
ss.fit(df['리뷰길이'].to_numpy().reshape(-1, 1))
review_len_scaled = ss.transform(df['리뷰길이'].to_numpy().reshape(-1, 1))

In [4]:
# Replace the raw lengths with their standardized values, flattened back to 1-D.
df['리뷰길이'] = np.array(review_len_scaled).ravel()

In [5]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, stratified on the rating ('평점') so
# both subsets keep the same class proportions.
# random_state pins the shuffle so the same rows land in train/test on every
# run; without it the split (and all metrics computed from it) changes each
# time the notebook is executed. 13 matches the seed used for the classifier.
X_train, X_test, y_train, y_test = train_test_split(
  df[['리뷰', '긍정부정', '리뷰길이']], df['평점'],
  stratify=df['평점'],
  test_size=.25,
  random_state=13)

print("Train Size: ", len(X_train))
print("Test Size: ", len(X_test))

Train Size:  7709
Test Size:  2570


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Okt

# Single Okt tagger instance shared by every tokenizer1 call.
t = Okt()

def tokenizer1(txt):
  """Return the nouns, verbs and adjectives found in ``txt``.

  The text is tagged with ``t.pos(txt, norm=True, stem=True)``; only words
  whose tag is 'Noun', 'Verb' or 'Adjective' are kept, in the order the
  tagger yields them.
  """
  keep_tags = ('Noun', 'Verb', 'Adjective')
  return [word for word, tag in t.pos(txt, norm=True, stem=True) if tag in keep_tags]

# TF-IDF features over the tokens produced by tokenizer1.
# token_pattern=None: the default regex is never consulted once a custom
# tokenizer is supplied, and leaving it at its default makes scikit-learn
# emit a "token_pattern will not be used" UserWarning on fit.
tfidf1 = TfidfVectorizer(
  tokenizer=tokenizer1, token_pattern=None,
  max_features=3000, min_df=5, max_df=.5)

# Fit the vocabulary and weights on the training reviews only, then apply the
# fitted vectorizer to the test reviews.
X_train_tfidf1 = tfidf1.fit_transform(X_train['리뷰'])
X_test_tfidf1 = tfidf1.transform(X_test['리뷰'])



In [7]:
import numpy as np

# Dense feature matrix: the TF-IDF columns followed by two extra columns,
# '긍정부정' and '리뷰길이', in that order.
X_train_combined = np.hstack((
  X_train_tfidf1.toarray(),
  X_train[['긍정부정', '리뷰길이']].to_numpy(),
))

# Same layout for the test rows.
X_test_combined = np.hstack((
  X_test_tfidf1.toarray(),
  X_test[['긍정부정', '리뷰길이']].to_numpy(),
))

In [8]:
from sklearn.decomposition import PCA

# Reduce the combined features to 1,000 principal components.
# random_state is fixed because, for a matrix of this size, PCA's default
# svd_solver='auto' can select the randomized solver, whose result would
# otherwise differ from run to run.
pca = PCA(n_components=1000, random_state=13)
X_train_pca = pca.fit_transform(X_train_combined)  # components are learned from the training rows
X_test_pca = pca.transform(X_test_combined)        # test rows are only projected

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Candidate values of the C hyperparameter to search over.
params = dict(C=[1e-3, 1e-2, 1e-1, 1, 10, 100])

# Solver choice: for multiclass problems only 'newton-cg', 'sag', 'saga' and
# 'lbfgs' handle the multinomial loss.
lr_clf = LogisticRegression(
  penalty='l2',
  solver='newton-cg',
  max_iter=1500,
  random_state=13)

# Grid search over `params`; refit=True retrains the best setting at the end.
gs_cv = GridSearchCV(
  estimator=lr_clf,
  param_grid=params,
  refit=True,
  n_jobs=-1)

In [10]:
# Run the grid search on the PCA-reduced training features.
gs_cv.fit(X_train_pca, y_train)

In [11]:
# Hyperparameter setting selected by the grid search.
gs_cv.best_params_

{'C': 1}

In [12]:
# Best score found by the grid search, rounded to 4 decimal places.
round(gs_cv.best_score_, 4)

0.6787

In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score

# Predict the held-out test rows with the best estimator from the grid search.
estimator = gs_cv.best_estimator_
pred = estimator.predict(X_test_pca)

# Printed in order: accuracy, then macro-averaged recall, precision and F1,
# each rounded to 4 decimal places.
print(round(accuracy_score(y_test, pred), 4))
for macro_metric in (recall_score, precision_score, f1_score):
  print(round(macro_metric(y_test, pred, average='macro'), 4))

0.6712
0.6088
0.6242
0.6129


In [14]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for the test predictions (true labels are passed first).
confusion_matrix(y_test, pred)

array([[158,  97,  35,   0,   0],
       [ 62, 226, 118,   0,   0],
       [ 41, 154, 136,   0,   0],
       [  0,   0,   0, 769, 108],
       [  0,   0,   0, 230, 436]])

In [15]:
from sklearn.metrics import classification_report

# Text report of per-class precision, recall and F1 for the test predictions.
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           1       0.61      0.54      0.57       290
           2       0.47      0.56      0.51       406
           3       0.47      0.41      0.44       331
           4       0.77      0.88      0.82       877
           5       0.80      0.65      0.72       666

    accuracy                           0.67      2570
   macro avg       0.62      0.61      0.61      2570
weighted avg       0.67      0.67      0.67      2570

