### 🔎 로지스틱 회귀
- 리뷰길이 MinMaxScaler 적용

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

# Mapping used to turn the textual sentiment flag into a numeric one.
sentiment_codes = {'y': 1, 'n': 0}

# Read the review CSV from the working directory and recode the sentiment
# column in a single chain; values not present in the mapping are left as-is.
df = pd.read_csv('./shampoo_final.csv').assign(
  긍정부정=lambda frame: frame['긍정부정'].replace(sentiment_codes))

# Print (rows, columns), then preview the first rows as the cell output.
print(df.shape)
df.head()

(10279, 4)


Unnamed: 0,리뷰,평점,긍정부정,리뷰길이
0,진짜 머리 감고 나면 너무 너무 간지러워 요상쾌함이 오래가지도 못하고 냄새도 빨리 나요,1,0,52
1,비듬 생겼어요 어쩐지간지럽더라 개나 샀는데 누구 주지도 못하고 아오,1,0,50
2,뚜껑 펌프 고장 났네요 아무리 눌러도 안 나와서 뚜껑 열고 사용하다 방치했어요,1,0,40
3,머리카락에 부담이 안 되고 머리 피부에 부담이 안 되니 좋죠,1,0,27
4,샴푸는 사용해보고 살수가 없으니 오랜 기간 동안 고민했고 리뷰들도 꼼꼼히 보았는데 ...,1,0,510


##### 🔸 평점 수정
- 높은 평점(4~5점) + 부정 리뷰 = 2점으로 통일
- 낮은 평점(1~3점) + 긍정 리뷰 = 4점으로 통일

In [None]:
# Boolean masks for the two sentiment labels.
is_negative = df['긍정부정'] == 0
is_positive = df['긍정부정'] == 1

# std1: negative reviews that carry a rating of 4 or 5.
# std2: positive reviews that carry a rating of 1, 2 or 3.
std1 = is_negative & df['평점'].isin([4, 5])
std2 = is_positive & df['평점'].isin([1, 2, 3])

# Overwrite the contradictory ratings: negatives become 2, positives become 4.
df.loc[std1, '평점'] = 2
df.loc[std2, '평점'] = 4

# Show the rating distribution after the adjustment.
df['평점'].value_counts()

평점
4    3507
5    2662
2    1626
3    1324
1    1160
Name: count, dtype: int64

In [None]:
# from sklearn.preprocessing import MinMaxScaler

# mms = MinMaxScaler()

# review_len_scaled = mms.fit_transform(
#   np.array(df['리뷰길이']).reshape((-1, 1)))

In [None]:
# df['리뷰길이'] = np.array(review_len_scaled).reshape(-1, )

##### 🔸 train/test 데이터 분리

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing, stratified on the rating column so
# each rating keeps the same share in both partitions.
# random_state is pinned (same seed value the classifier in this notebook
# uses) so that a kernel restart reproduces the same split and, with it, the
# same downstream scores; without it every run shuffles differently.
X_train, X_test, y_train, y_test = train_test_split(
  df[['리뷰', '긍정부정', '리뷰길이']], df['평점'],
  stratify=df['평점'],
  test_size=.25,
  random_state=13)

print("Train Size: ", len(X_train))
print("Test Size: ", len(X_test))

Train Size:  7709
Test Size:  2570


##### 🔸 토크나이징

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from konlpy.tag import Okt

# Shared Okt tagger instance used by tokenizer1.
t = Okt()

def tokenizer1(txt):
  """Split a text into its noun, verb and adjective tokens.

  The text is run through the Okt part-of-speech tagger with normalization
  and stemming switched on; only tokens tagged 'Noun', 'Verb' or
  'Adjective' are kept.

  Args:
    txt: Text to tokenize.

  Returns:
    A list of the kept token strings, in the order the tagger produced them.
  """
  kept_tags = ('Noun', 'Verb', 'Adjective')
  return [
    token
    for token, pos_tag in t.pos(txt, norm=True, stem=True)
    if pos_tag in kept_tags
  ]

# TF-IDF over the tokens produced by tokenizer1.
# - token_pattern=None: the default regex pattern is never used once a custom
#   tokenizer is supplied, and leaving it set makes scikit-learn emit an
#   "token_pattern will not be used" warning on fit.
# - max_features=1000 caps the vocabulary size.
# - min_df=5 / max_df=.5 drop terms seen in fewer than 5 documents or in more
#   than half of the documents.
tfidf1 = TfidfVectorizer(
  tokenizer=tokenizer1, token_pattern=None,
  max_features=1000,
  min_df=5, max_df=.5)

# Learn the vocabulary and IDF weights from the training reviews only, then
# apply that same fitted vectorizer to the test reviews.
X_train_tfidf1 = tfidf1.fit_transform(X_train['리뷰'])
X_test_tfidf1 = tfidf1.transform(X_test['리뷰'])



##### 🔹 토큰화 벡터 + 긍정부정 + 리뷰길이

In [None]:
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# Review length is a raw character count (values in the hundreds), while the
# TF-IDF columns and the sentiment flag live in [0, 1]. Feeding the raw count
# to a gradient-based solver makes the problem badly conditioned, so the
# length column is min-max scaled before stacking.
# The scaler is fitted on the training split only and then reused for the
# test split, so no test-set statistics leak into training. Test values
# outside the training range may therefore fall slightly outside [0, 1].
length_scaler = MinMaxScaler()
train_len_scaled = length_scaler.fit_transform(
  np.array(X_train['리뷰길이'], dtype=float).reshape((-1, 1)))
test_len_scaled = length_scaler.transform(
  np.array(X_test['리뷰길이'], dtype=float).reshape((-1, 1)))

# Column layout (unchanged): [TF-IDF terms..., sentiment flag, review length].
X_train_combined = np.hstack(
  (X_train_tfidf1.toarray(),
    np.array(X_train['긍정부정']).reshape((-1, 1)),
    train_len_scaled))

X_test_combined = np.hstack(
  (X_test_tfidf1.toarray(),
    np.array(X_test['긍정부정']).reshape((-1, 1)),
    test_len_scaled))

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Disabled hyper-parameter grid, kept for reference.
# NOTE(review): the lbfgs solver configured below does not support the 'l1'
# penalty, so this grid would need a different solver (or only 'l2') before
# it can be re-enabled.
# params = {
#   'C': [.01, .1, 1, 10],
#   'penalty': ['l1', 'l2']
# }

# Logistic regression with an explicit solver, a raised iteration cap and a
# fixed seed.
lr_clf = LogisticRegression(
  solver='lbfgs',
  max_iter=1500,
  random_state=13,
)

# Fit on the stacked training features; as the last live statement of the
# cell, the fitted estimator is also what the cell displays.
lr_clf.fit(X_train_combined, y_train)

# gs_cv = GridSearchCV(lr_clf, param_grid=params, n_jobs=-1, refit=True)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# gs_cv.fit(X_train_combined, y_train)

In [None]:
# gs_cv.best_params_

In [None]:
# round(gs_cv.best_score_, 4)

In [None]:
# from sklearn.metrics import accuracy_score
# from sklearn.metrics import recall_score
# from sklearn.metrics import precision_score
# from sklearn.metrics import f1_score

# estimator = gs_cv.best_estimator_
# pred = estimator.predict(X_test_combined)
# print(round(accuracy_score(y_test, pred), 4))
# print(round(recall_score(y_test, pred, average='macro'), 4))
# print(round(precision_score(y_test, pred, average='macro'), 4))
# print(round(f1_score(y_test, pred, average='macro'), 4))

---

In [None]:
# Mean accuracy of the fitted classifier on the training split.
lr_clf.score(X_train_combined, y_train)

0.774678946685692

In [None]:
# Mean accuracy of the fitted classifier on the held-out test split.
lr_clf.score(X_test_combined, y_test)

0.6859922178988327