<a href="https://colab.research.google.com/github/lishavin/Amazon/blob/main/Glove%2BSVM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 0. Required Packages

- 파일 > Drive에 사본 저장
- 아래 패키지 설치 후 session restart 필요 (런타임 > 세션다시시작)
- drive mount (pretrained weight)

In [None]:
!pip install datasets
!pip install numpy==1.26
!pip install scipy==1.13.1
!pip install gensim
!pip install fsspec==2023.4.0 #"**" 경로 패턴 호환되는 fsspec 옛날버전으로 다운그레이드

!pip install nltk

In [None]:
# Mount Google Drive at /content/drive; the GloVe weights path used later
# in this notebook points into this mount.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Shared imports for the notebook, followed by the NLTK resource downloads.
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm
import nltk #download the resources needed for tokenization, stop-word removal, lemmatization, etc.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

## 1. Load Data

In [None]:
# Load the Amazon polarity dataset through the `datasets` library.
from datasets import load_dataset
df = load_dataset("fancyzhx/amazon_polarity")

# Convert the train and test splits to pandas DataFrames, keeping only the
# first 10,000 training rows and the first 5,000 test rows.
train_df = df["train"].select(range(10000)).to_pandas()
test_df = df["test"].select(range(5000)).to_pandas()

In [None]:
# Show the container type of both splits. A notebook cell only displays its
# last expression, so the two checks are combined into one tuple; written as
# separate statements, the first result would be silently discarded.
type(train_df), type(test_df)

## 2. EDA


- 1 : positive
- 0 : negative

In [None]:
# Preview the first rows of the training DataFrame.
train_df.head()

In [None]:
#data type 확인
print("Train Dataset DataType")
print(train_df.dtypes)

print("\nTest Dataset DataType")
print(test_df.dtypes)

In [None]:
# Display the element-wise missing-value mask of the training DataFrame.
train_df.isnull()

In [None]:
#결측치 확인
print("Train Dataset 결측치")
print(train_df.isnull().sum())

print("\nTest Dataset 결측치")
print(test_df.isnull().sum())

In [None]:
#label 비율 확인
import seaborn as sns
sns.countplot(x='label', data= train_df)
print(train_df.label.value_counts())

In [None]:
#label 비율 확인
sns.countplot(x='label', data= test_df)
print(test_df.label.value_counts())

In [None]:
# Display the training DataFrame.
train_df

## 3. Text Cleaning



1. 정규표현식
   - HTML 태그 제거
   - 특수문자 제거
2. 소문자 변환
3. Stopwords 제거
4. Stemming (어간 추출) / Lemmatization (표제어 추출)



##### Amazon 데이터에 적용

In [None]:
# title과 content 열 합치기
train_df["review"] = train_df["title"] + " " + train_df["content"]
test_df["review"] = test_df["title"] + " " + test_df["content"]

In [None]:
import re #regular expressions: used to strip HTML tags and special characters
import nltk
from nltk.corpus import stopwords #gives access to NLTK's stop-word lists
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer #lemmatization (reducing words to their base form)

# The resources must be downloaded once before first use
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


@lru_cache(maxsize=1)
def _review_lemmatizer():
    """Return a single shared WordNetLemmatizer instance."""
    return WordNetLemmatizer()


def preprocess_review(review):
    """Clean one raw review and return it as a space-joined token string.

    Steps, in order: strip HTML tags, drop special characters, lowercase,
    tokenize with NLTK, remove English stop words, lemmatize.

    Args:
        review: Raw review text. Non-string values (for example None or NaN
            coming from a missing title/content) are treated as empty text.

    Returns:
        The cleaned tokens joined by single spaces; "" for non-string input.
    """
    # Guard first: re.sub raises TypeError on None/NaN.
    if not isinstance(review, str):
        return ""

    # Remove HTML tags.
    review = re.sub('<[^>]*>', '', review)
    # Keep only ASCII letters, digits, spaces and '?'.
    # NOTE(review): the previous comment said '!' is kept, but the pattern
    # keeps '?' and drops '!' - confirm which one was intended.
    review = re.sub('[^a-zA-Z0-9 ?]', '', review)
    review = review.lower()

    tokens = word_tokenize(review)

    # The stop-word set and lemmatizer are cached so they are not rebuilt for
    # every review when this function is applied over a whole DataFrame.
    stop_words = _english_stop_words()
    lemmatize = _review_lemmatizer().lemmatize

    # Filter stop words first, then lemmatize the surviving tokens.
    return " ".join(lemmatize(word) for word in tokens if word not in stop_words)

In [None]:
# 전처리 적용
train_df['processed_review'] = train_df['review'].apply(preprocess_review)
test_df['processed_review'] = test_df['review'].apply(preprocess_review)

In [None]:
# Preview the training DataFrame again, now including the added columns.
train_df.head()

## 4. Train-Valid Split

In [None]:
from sklearn.model_selection import train_test_split

# 80% train, 20% validation
data_train, data_valid = train_test_split(
    train_df,                  # the original training DataFrame
    test_size=0.2,             # hold out 20% for validation
    stratify=train_df['label'], # keep the label ratio (positive/negative balance)
    random_state=42           # reproducibility (same split every run)
)

data_test = test_df           # the test split already exists separately

In [None]:
# Number of rows in the training part of the split.
len(data_train)

In [None]:
# Number of rows in the validation part of the split.
len(data_valid)

## 5. Vectorization + Classifier

##### Glove


In [None]:
# GloVe 벡터 로딩
def load_glove_model(glove_file_path):
    print("Loading GloVe model...")
    glove_model = {}
    with open(glove_file_path, encoding="utf-8") as f:
        for line in tqdm(f):
            parts = line.strip().split()
            word = parts[0]
            vector = np.array(parts[1:], dtype=np.float32)
            glove_model[word] = vector
    print(f"{len(glove_model)} words loaded!")
    return glove_model

# 문장 → 평균 벡터 변환
def sentence_to_vector(sentence, glove_model, vector_dim=300):
    tokens = sentence.split()
    vecs = [glove_model[word] for word in tokens if word in glove_model]
    if not vecs:
        return np.zeros(vector_dim)
    return np.mean(vecs, axis=0)

In [None]:
# GloVe 모델 로드 (Stanford에서 사전학습한 모델)
glove_path = "/content/drive/MyDrive/weights/glove.6B.300d.txt"  # 위치 맞게 지정
glove_model = load_glove_model(glove_path)

In [None]:
# Sanity check: display the loaded vector for the word "king".
glove_model['king']

In [None]:
# 각 리뷰를 벡터로 변환
tqdm.pandas()
X_train_vec = data_train['processed_review'].progress_apply(lambda x: sentence_to_vector(x, glove_model, 300))
X_valid_vec = data_valid['processed_review'].progress_apply(lambda x: sentence_to_vector(x, glove_model, 300))
X_test_vec = data_test['processed_review'].progress_apply(lambda x: sentence_to_vector(x, glove_model, 300))

# numpy array로 변환
X_train_vec = np.stack(X_train_vec.values)
X_valid_vec = np.stack(X_valid_vec.values)
X_test_vec = np.stack(X_test_vec.values)

y_train = data_train['label'].values
y_valid = data_valid['label'].values
y_test = data_test['label'].values

In [None]:
# Print the shape of the training feature matrix and the first 50
# components of the first review vector.
print(X_train_vec.shape)
print(X_train_vec[0][:50])

##### Linear SVM

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Define the SVM model (linear kernel, all other settings left at defaults)
svm_clf = SVC(kernel='linear')

# Train the model on the training split
svm_clf.fit(X_train_vec, y_train)

# Predict on the validation and test splits
svm_valid_preds = svm_clf.predict(X_valid_vec)
svm_test_preds = svm_clf.predict(X_test_vec)

# Evaluate: accuracy on both splits plus a per-class report on the test set
print("Validation Accuracy:", accuracy_score(y_valid, svm_valid_preds))
print("Test Accuracy:", accuracy_score(y_test, svm_test_preds))
print("\n[Classification Report on Test Set]\n", classification_report(y_test, svm_test_preds))

# Cross Validation 1
K-fold 선택: 튜닝 전 baseline 성능을 확인


In [None]:
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
import numpy as np

# Baseline 5-fold cross-validation of an untuned linear SVM on the training
# vectors. Shuffling with a fixed seed keeps the folds reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
fold_accuracies = []

for fold, (train_index, valid_index) in enumerate(kf.split(X_train_vec)):
    X_tr, X_val = X_train_vec[train_index], X_train_vec[valid_index]
    y_tr, y_val = y_train[train_index], y_train[valid_index]

    # Use a fold-local name so the loop does not overwrite `svm_clf`, the
    # model fitted on the full training split; reusing that name would leave
    # it pointing at a model trained on the last fold only.
    fold_clf = SVC(kernel='linear')
    fold_clf.fit(X_tr, y_tr)

    val_preds = fold_clf.predict(X_val)
    acc = accuracy_score(y_val, val_preds)
    fold_accuracies.append(acc)

    print(f"[Fold {fold+1}] Validation Accuracy: {acc:.4f}")

print(f"\n📊 Average Validation Accuracy (Baseline - KFold 5): {np.mean(fold_accuracies):.4f}")

# Hyperparameter Tuning

이미 모델 성능이 좋아서 정밀한 전수조사(Grid Search)보다 빠르게 근사 최적값을 찾는 것이 더 중요하므로, Random Search로 최적의 C 값을 탐색한다 (kernel은 linear로 고정).


In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the hyperparameter search space
param_dist = {
    'C': uniform(loc=0.01, scale=10),  # regularization parameter, sampled uniformly from [0.01, 10.01]
    'kernel': ['linear'],              # kernel fixed to linear SVM
}

# Define the randomized search: 20 sampled candidates, each scored by
# 5-fold cross-validated accuracy, run on all available cores.
random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=param_dist,
    n_iter=20,
    scoring='accuracy',
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

# Run the search on the training vectors
random_search.fit(X_train_vec, y_train)

# Print the best result found
print("Best Parameters:", random_search.best_params_)
print("Best CV Accuracy:", random_search.best_score_)

# Extract the best model
best_svm = random_search.best_estimator_

# Cross Validation 2
튜닝 후 최종 모델을 재검증: RandomizedSearch 결과가 얼마나 개선됐는지 비교

In [None]:
from sklearn.base import clone

# 5-fold cross-validation of the tuned model, with the same splitter settings
# (n_splits, shuffle, seed) as the baseline run so the folds are comparable.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
tuned_fold_accuracies = []

for fold, (train_index, valid_index) in enumerate(kf.split(X_train_vec)):
    X_tr, X_val = X_train_vec[train_index], X_train_vec[valid_index]
    y_tr, y_val = y_train[train_index], y_train[valid_index]

    # Fit an unfitted copy carrying the tuned hyperparameters. Calling
    # best_svm.fit(...) directly would refit the search's best estimator in
    # place, leaving `best_svm` trained on the last fold's subset only.
    fold_model = clone(best_svm)
    fold_model.fit(X_tr, y_tr)
    val_preds = fold_model.predict(X_val)
    acc = accuracy_score(y_val, val_preds)
    tuned_fold_accuracies.append(acc)

    print(f"[Fold {fold+1}] Validation Accuracy (Tuned): {acc:.4f}")

print(f"\n✅ Average Validation Accuracy (Tuned - KFold 5): {np.mean(tuned_fold_accuracies):.4f}")