<a href="https://colab.research.google.com/github/lishavin/Amazon/blob/main/TF-IDF%2BLogistic_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Required Packages

- 파일 > Drive에 사본 저장
- 아래 패키지 설치 후 session restart 필요 (런타임 > 세션다시시작)
- drive mount (pretrained weight)

In [None]:
!pip install datasets
!pip install numpy==1.26
!pip install scipy==1.13.1
!pip install gensim
!pip install fsspec==2023.4.0 #"**" 경로 패턴 호환되는 fsspec 옛날버전으로 다운그레이드

!pip install nltk

In [None]:
# Mount Google Drive at /content/drive using the google.colab helper.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import nltk #토큰화, 불용어 제거, 표제어 추출 등을 위한 자원 다운로드
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

## 1. Load Data

In [None]:
from datasets import load_dataset

# Load the Amazon polarity dataset (train/test splits) from the Hugging Face hub.
df = load_dataset("fancyzhx/amazon_polarity")

# Keep only the first 10,000 train rows and the first 5,000 test rows,
# converting each subset to a pandas DataFrame.
train_df, test_df = (
    df[split_name].select(range(row_count)).to_pandas()
    for split_name, row_count in (("train", 10000), ("test", 5000))
)

In [None]:
# A notebook cell only displays its last expression, so return both types
# together instead of silently discarding the first one.
type(train_df), type(test_df)

## 2. EDA


- 1 : positive
- 0 : negative

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Check the column dtypes of both splits.
print("Train Dataset DataType", train_df.dtypes, sep="\n")

print("\nTest Dataset DataType", test_df.dtypes, sep="\n")

In [None]:
# Element-wise missing-value mask for the training frame (True where a cell is null).
train_df.isnull()

In [None]:
# Count missing values per column in both splits.
print("Train Dataset 결측치", train_df.isna().sum(), sep="\n")

print("\nTest Dataset 결측치", test_df.isna().sum(), sep="\n")

In [None]:
# Check the label balance of the training split: bar chart plus raw counts.
import seaborn as sns

sns.countplot(data=train_df, x='label')
print(train_df['label'].value_counts())

In [None]:
# Check the label balance of the test split: bar chart plus raw counts.
sns.countplot(data=test_df, x='label')
print(test_df['label'].value_counts())

In [None]:
# Display the training frame (the notebook shows its rich repr).
train_df

## 3. Text Cleaning



>1. 정규표현식
  - HTML 태그 제거
  - 특수문자 제거
2. 소문자 변환
3. Stopwords 제거
4. Stemming (어간 추출) /Lemmatization (표제어 추출)



##### Amazon 데이터에 적용

In [None]:
# Merge the title and content columns into a single "review" text column.
# fillna("") keeps a row usable when its title or content is missing; without it
# the string concatenation yields NaN and the regex-based cleaning step fails
# with a TypeError on that row.
train_df["review"] = train_df["title"].fillna("") + " " + train_df["content"].fillna("")
test_df["review"] = test_df["title"].fillna("") + " " + test_df["content"].fillna("")

In [None]:
import re #정규표현식 regular expression: HTML 태그나 특수문자 제거시 사용
import nltk
from nltk.corpus import stopwords #nltk에서 제공하는 불용어(stopwords) 리스트를 불러오기 위한 것
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer #어간 추출, 표제어 추출

# The NLTK resources must be downloaded once per environment.
# 'punkt_tab' is the tokenizer data that recent NLTK releases load for
# word_tokenize (the setup cell at the top of the notebook fetches it too);
# 'punkt' is kept for older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

from functools import lru_cache


@lru_cache(maxsize=1)
def _preprocess_resources():
    """Return the (stop-word set, lemmatizer) pair, built once and then reused.

    Reloading the stop-word list and constructing a lemmatizer on every call
    is wasted work when the cleaner is applied row by row to a DataFrame.
    """
    return frozenset(stopwords.words('english')), WordNetLemmatizer()


def preprocess_review(review):
    """Clean one review and return it as space-joined, lemmatized tokens.

    Steps, in order: strip HTML tags, drop every character other than ASCII
    letters, digits, spaces and '?', lowercase, tokenize with NLTK, remove
    English stop words, lemmatize each remaining token.

    Args:
        review: Raw review text. Non-string values (for example NaN produced
            by a missing title or content) are treated as an empty review.

    Returns:
        The cleaned review as a single string; "" when nothing survives.
    """
    if not isinstance(review, str):
        return ""

    # Remove HTML tags.
    review = re.sub('<[^>]*>', '', review)
    # Remove special characters. The character class keeps '?', not '!'.
    # NOTE(review): the original comment said '!' was kept - confirm which was intended.
    review = re.sub('[^a-zA-Z0-9 ?]', '', review)
    # Lowercase, then tokenize.
    review = review.lower()
    tokens = word_tokenize(review)

    # Drop stop words first, then lemmatize what remains.
    stop_words, lemmatizer = _preprocess_resources()
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )

In [None]:
# Run the text cleaner over every review in both splits.
train_df['processed_review'] = train_df['review'].map(preprocess_review)
test_df['processed_review'] = test_df['review'].map(preprocess_review)

In [None]:
# Preview the first rows again, now including the review / processed_review columns.
train_df.head()

## 4. Train-Valid Split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the original training frame as a validation set.
# Stratifying on the label keeps the positive/negative ratio in both parts,
# and the fixed seed makes the split reproducible.
data_train, data_valid = train_test_split(
    train_df,
    stratify=train_df['label'],
    test_size=0.2,
    random_state=42,
)

# The test split is already separate, so it is used as-is (same object, no copy).
data_test = test_df

In [None]:
# Number of rows in the training part of the split.
len(data_train)

In [None]:
# Number of rows in the validation part of the split.
len(data_valid)

## 5. Vectorization + Classifier

##### **TF-IDF Vectorizer**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer limited to at most 10,000 terms.
vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the training split only; the validation and test splits are
# transformed with that same fitted vectorizer.
X_train = vectorizer.fit_transform(data_train['processed_review'])
X_valid, X_test = (
    vectorizer.transform(split['processed_review'])
    for split in (data_valid, data_test)
)

# Keep the sentiment labels (the targets) separately for each split.
y_train, y_valid, y_test = (
    split["label"] for split in (data_train, data_valid, data_test)
)

In [None]:
# Inspect the first training document's dense TF-IDF vector.
print(X_train[0].toarray().ravel()[170:250])
print(X_train.shape[1])  # total vector length (10,000 according to the original note)

In [None]:
# Count the non-zero entries of the first document's vector
# (the original author noted 14 for their run).
nonzero_count = np.count_nonzero(X_train[0].toarray())
print("Non-zero TF-IDF features:", nonzero_count)

##### **Logistic Regression**

- 결과에 대한 확률값 출력 가능
- 선형 결정 경계만 학습 가능 (복잡한 패턴은 한계가 있음) <br>
[참고docs](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Build the baseline model (max_iter=1000) and fit it on the training vectors.
log_clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Predict on the validation and test splits.
lr_valid_preds, lr_test_preds = (
    log_clf.predict(features) for features in (X_valid, X_test)
)

# Report accuracy on both splits plus the per-class report on the test split.
print("Validation Accuracy:", accuracy_score(y_valid, lr_valid_preds))
print("Test Accuracy:", accuracy_score(y_test, lr_test_preds))
print("\n[Classification Report on Test Set]\n", classification_report(y_test, lr_test_preds))

# Cross Validation 1
K-fold 선택: 튜닝 전 baseline 성능을 확인


In [None]:
from sklearn.base import clone
from sklearn.model_selection import KFold

# 5-fold cross-validation of the baseline model on the training vectors.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []  # one validation accuracy per fold

# The fold indices are positional, so the labels must be a NumPy array.
# np.asarray is a no-op when the cell is re-run, whereas `.values` raises
# AttributeError the second time because y_train is already an ndarray.
y_train = np.asarray(y_train)

for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train[train_index], X_train[valid_index]
    y_tr, y_val = y_train[train_index], y_train[valid_index]

    # Fit an unfitted copy per fold so that log_clf itself stays trained on
    # the full training set; later cells read log_clf.coef_.
    fold_clf = clone(log_clf)
    fold_clf.fit(X_tr, y_tr)
    val_preds = fold_clf.predict(X_val)
    acc = accuracy_score(y_val, val_preds)
    fold_accuracies.append(acc)

    print(f"[Pre-Tuning Fold {fold+1}] Validation Accuracy: {acc:.4f}")

print(f"\n🔎 Average Validation Accuracy (Pre-Tuning 5-Fold): {np.mean(fold_accuracies):.4f}")

# Hyperparameter Tuning

기본 모델의 성능이 이미 충분히 좋으므로, 모든 조합을 탐색하는 Grid Search보다 근사 최적값을 빠르게 찾는 것이 더 중요하다고 판단했다. 따라서 Random Search로 최적의 `C`와 `penalty` 값을 찾는다.


In [None]:
# Randomized hyperparameter search for logistic regression.
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from scipy.stats import uniform
from sklearn.metrics import accuracy_score, classification_report

# Search space.
param_dist = {
    # Continuous values; uniform(loc, scale) samples from [loc, loc + scale],
    # i.e. 0.001 to 10.001 here.
    'C': uniform(loc=0.001, scale=10),
    'penalty': ['l1', 'l2'],
    # liblinear is used because it supports the l1 penalty.
    'solver': ['liblinear']
}

# Try 20 random parameter combinations, each scored by 5-fold CV accuracy,
# using every available CPU core.
random_search = RandomizedSearchCV(
    LogisticRegression(max_iter=1000),
    param_dist,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=1,
    random_state=42,
)

# Run the search on the training vectors.
random_search.fit(X_train, y_train)

# Report the best parameter combination and its mean CV accuracy.
print("Best Parameters:", random_search.best_params_)
print("Best CV Accuracy:", random_search.best_score_)

# Cross Validation 2
튜닝 후 최종 모델을 재검증: RandomizedSearch 결과가 얼마나 개선됐는지 비교

In [None]:
from sklearn.base import clone
from sklearn.model_selection import KFold

# Best estimator found by the randomized search. With the search's default
# refit behaviour this object has been fitted on all of X_train, so it must
# not be re-fitted in place below.
best_model = random_search.best_estimator_

# Same fold layout as the pre-tuning run (same n_splits and seed) so the two
# averages are comparable.
# NOTE(review): the hyperparameters were selected on this same data, so this
# estimate is likely optimistic.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tuned_fold_accuracies = []

# The fold indices are positional; index a NumPy view of the labels so this
# works whether y_train is still a pandas Series or already an array.
fold_labels = np.asarray(y_train)

for fold, (train_index, valid_index) in enumerate(kf.split(X_train)):
    X_tr, X_val = X_train[train_index], X_train[valid_index]
    y_tr, y_val = fold_labels[train_index], fold_labels[valid_index]

    # Fit an unfitted copy carrying the tuned hyperparameters, leaving
    # best_model (the full-data fit) untouched.
    fold_model = clone(best_model)
    fold_model.fit(X_tr, y_tr)
    val_preds = fold_model.predict(X_val)
    acc = accuracy_score(y_val, val_preds)
    tuned_fold_accuracies.append(acc)

    print(f"[Post-Tuning Fold {fold+1}] Validation Accuracy: {acc:.4f}")

print(f"\n✅ Average Validation Accuracy (Post-Tuning 5-Fold): {np.mean(tuned_fold_accuracies):.4f}")

# 추가 분석


### 감성에 영향을 주는 주요 단어 분석 (model.coef_)

In [None]:
import numpy as np

# Pair every vocabulary term with its logistic-regression weight.
# The weights reflect whatever data log_clf was most recently fitted on.
feature_names = vectorizer.get_feature_names_out()
coefficients = log_clf.coef_[0]

# Sort once (ascending): the tail holds the most positive weights and the
# head holds the most negative ones.
coef_order = np.argsort(coefficients)
top_pos_indices = coef_order[-10:]
top_neg_indices = coef_order[:10]

# Ten terms pushing hardest towards the positive class, strongest first.
print("Top positive words:")
for idx in top_pos_indices[::-1]:
    print(f"{feature_names[idx]}: {coefficients[idx]:.4f}")

# Ten terms pushing hardest towards the negative class, strongest first.
print("\nTop negative words:")
for idx in top_neg_indices:
    print(f"{feature_names[idx]}: {coefficients[idx]:.4f}")

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt


def _show_cloud(cloud, title):
    """Draw one word cloud in its own 8x4 figure with the axes hidden."""
    plt.figure(figsize=(8, 4))
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()


# Positive words: the word size is driven by the coefficient itself.
top_pos_words = {feature_names[i]: coefficients[i] for i in top_pos_indices}
wordcloud_pos = WordCloud(
    width=600, height=400, background_color='white', colormap='Blues'
).generate_from_frequencies(top_pos_words)
_show_cloud(wordcloud_pos, "Top Positive Words")

# Negative words: the absolute value is used so the frequencies are positive.
top_neg_words = {feature_names[i]: abs(coefficients[i]) for i in top_neg_indices}
wordcloud_neg = WordCloud(
    width=600, height=400, background_color='white', colormap='Reds'
).generate_from_frequencies(top_neg_words)
_show_cloud(wordcloud_neg, "Top Negative Words")

### 제품 카테고리별 감성 분포 분석

In [None]:
# Keyword-based product-group classifier.
def classify_product(text):
    """Assign a coarse product group to a review via keyword lookup.

    Matching is case-insensitive and substring-based, so "Facebook" counts as
    containing "book". Groups are checked in the order Book, Electronics,
    Clothing; the first group with a matching keyword wins.

    Args:
        text: Review text to classify.

    Returns:
        "Book", "Electronics", "Clothing", or "Other" when no keyword matches.
    """
    lowered = text.lower()
    keyword_groups = (
        ("Book", ("book",)),
        ("Electronics", ("camera", "lens")),
        ("Clothing", ("shoe", "shirt")),
    )
    for group, keywords in keyword_groups:
        if any(keyword in lowered for keyword in keywords):
            return group
    return "Other"

# Label every training review with its keyword-derived product group.
train_df["product_group"] = train_df["review"].map(classify_product)

In [None]:
# Visualize the sentiment distribution within each product group.
import seaborn as sns
import matplotlib.pyplot as plt

ax = sns.countplot(x="product_group", hue="label", data=train_df)
ax.set_title("Sentiment Distribution by Product Group")
ax.set_xlabel("Product Category")
ax.set_ylabel("Count")
plt.show()

### 리뷰 길이 vs 감성 관계

In [None]:
# Add the review length, measured in whitespace-separated words.
train_df["review_length"] = train_df["review"].map(lambda text: len(text.split()))

# Compare the length distributions of the two sentiment classes.
sns.boxplot(x="label", y="review_length", data=train_df)
plt.title("Review Length by Sentiment")
plt.xlabel("Sentiment (0=Neg, 1=Pos)")
plt.ylabel("Number of Words")
plt.show()

### 분류기 성능 비교(SVM, Naive Bayes, Decision Tree)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier

# Alternative classifiers trained on the same TF-IDF features.
# random_state is fixed on the estimators that accept one, matching the seed
# used elsewhere in the notebook, so the comparison is reproducible.
models = {
    "SVM": LinearSVC(random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Decision Tree": DecisionTreeClassifier(random_state=42)
}

# Fit each model on the training vectors and score it on the validation split.
for name, clf in models.items():
    clf.fit(X_train, y_train)
    preds = clf.predict(X_valid)
    acc = accuracy_score(y_valid, preds)
    print(f"{name} Accuracy: {acc:.4f}")