In [3]:
import pandas as pd

train_df = pd.read_csv("split/ENNI_train.csv")
dev_df   = pd.read_csv("split/ENNI_dev.csv")
test_df  = pd.read_csv("split/ENNI_test.csv")

train_df.head()


Unnamed: 0,group,sub_group,subject,filename
0,SLI,A,725,ENNI/SLI/A/725.cha
1,SLI,A,568,ENNI/SLI/A/568.cha
2,SLI,A,678,ENNI/SLI/A/678.cha
3,SLI,A,825,ENNI/SLI/A/825.cha
4,SLI,A,878,ENNI/SLI/A/878.cha


In [13]:
import re
import pandas as pd
from tqdm import tqdm

# ------------------------------
# 1) CHI 발화 추출 함수
# ------------------------------
def extract_chi_text(path):
    texts = []
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if line.startswith("*CHI:"):
                sent = re.sub(r"\x15.+?\x15", "", line)
                sent = sent.replace("*CHI:", "").strip()
                texts.append(sent)
    return " ".join(texts)


# ------------------------------
# 2) Split된 CSV 불러오기
# ------------------------------
train_df = pd.read_csv("split/ENNI_train.csv")
dev_df   = pd.read_csv("split/ENNI_dev.csv")
test_df  = pd.read_csv("split/ENNI_test.csv")


# ------------------------------
# 3) 텍스트 & 라벨 추가 함수
# ------------------------------
def add_text_and_label(df):
    texts, labels = [], []

    for _, row in tqdm(df.iterrows(), total=len(df)):
        path = row["filename"]
        text = extract_chi_text(path)
        texts.append(text)

        label = 1 if row["group"] == "SLI" else 0
        labels.append(label)

    df["text"] = texts
    df["label"] = labels
    return df


# ------------------------------
# 4) 적용 후 저장
# ------------------------------
train_df = add_text_and_label(train_df)
dev_df   = add_text_and_label(dev_df)
test_df  = add_text_and_label(test_df)

train_df.to_csv("train_ready.csv", index=False)
dev_df.to_csv("dev_ready.csv", index=False)
test_df.to_csv("test_ready.csv", index=False)

print("저장 완료: train_ready.csv, dev_ready.csv, test_ready.csv")


100%|███████████████████████████████████████| 281/281 [00:00<00:00, 1272.78it/s]
100%|█████████████████████████████████████████| 35/35 [00:00<00:00, 1228.07it/s]
100%|█████████████████████████████████████████| 36/36 [00:00<00:00, 1231.95it/s]


저장 완료: train_ready.csv, dev_ready.csv, test_ready.csv


In [14]:
# 필요한 라이브러리
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import joblib
from tqdm import tqdm

# -----------------------------
# 1) 데이터 로드 (train/dev/test)
# -----------------------------
train_df = pd.read_csv("train_ready.csv")   # 이전 단계에서 만든 파일
dev_df   = pd.read_csv("dev_ready.csv")
test_df  = pd.read_csv("test_ready.csv")

train_texts, train_labels = train_df["text"].tolist(), train_df["label"].tolist()
dev_texts, dev_labels     = dev_df["text"].tolist(), dev_df["label"].tolist()
test_texts, test_labels   = test_df["text"].tolist(), test_df["label"].tolist()

# -----------------------------
# 2) 클래스 가중치 계산 (불균형 처리)
# -----------------------------
import numpy as np
classes = np.unique(train_labels)
class_weights = compute_class_weight(class_weight="balanced", classes=classes, y=train_labels)
class_weight_dict = {c: w for c, w in zip(classes, class_weights)}
print("class weights:", class_weight_dict)

# -----------------------------
# 3) 파이프라인 + 하이퍼파라미터 그리드 (LogisticRegression baseline)
# -----------------------------
pipeline = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=5000, solver="saga"))
])

param_grid = {
    "tfidf__max_df": [0.8, 0.9, 1.0],
    "tfidf__min_df": [1, 2, 3],
    "tfidf__ngram_range": [(1,1), (1,2)],
    "tfidf__max_features": [2000, 5000, None],
    "clf__C": [0.01, 0.1, 1, 10],
    # class weight은 모델 인스턴스 생성시 적용하므로 그리드에 넣지 않음
}

# GridSearchCV: dev(검증) 대신 train+dev 합쳐서 CV 쓰지 말고 dev로 튜닝하려면 scoring 후 수동 비교 가능.
# 여기서는 간단히 train만으로 CV (5-fold) 수행 — 데이터가 그리 큰 편은 아님.
gs = GridSearchCV(pipeline, param_grid, scoring="f1", cv=5, n_jobs=-1, verbose=2)
gs.fit(train_texts, train_labels)

print("Best params:", gs.best_params_)
best_model = gs.best_estimator_

# -----------------------------
# 4) Dev에서 성능 확인 (튜닝 결과 확인)
# -----------------------------
dev_preds = best_model.predict(dev_texts)
print("Dev Accuracy:", accuracy_score(dev_labels, dev_preds))
print("Dev F1:", f1_score(dev_labels, dev_preds))
print(classification_report(dev_labels, dev_preds))
print("Confusion matrix:\n", confusion_matrix(dev_labels, dev_preds))

# -----------------------------
# 5) Test 최종 평가
# -----------------------------
test_preds = best_model.predict(test_texts)
print("Test Accuracy:", accuracy_score(test_labels, test_preds))
print("Test F1:", f1_score(test_labels, test_preds))
print(classification_report(test_labels, test_preds))
print("Confusion matrix:\n", confusion_matrix(test_labels, test_preds))

# -----------------------------
# 6) 모델 저장
# -----------------------------
joblib.dump(best_model, "tfidf_logreg_enni_bestjoblib.pkl")
print("모델 저장 완료: tfidf_logreg_enni_bestjoblib.pkl")


class weights: {0: 0.6162280701754386, 1: 2.650943396226415}
Fitting 5 folds for each of 216 candidates, totalling 1080 fits
Best params: {'clf__C': 10, 'tfidf__max_df': 1.0, 'tfidf__max_features': 2000, 'tfidf__min_df': 2, 'tfidf__ngram_range': (1, 1)}
Dev Accuracy: 0.8
Dev F1: 0.0
              precision    recall  f1-score   support

           0       0.80      1.00      0.89        28
           1       0.00      0.00      0.00         7

    accuracy                           0.80        35
   macro avg       0.40      0.50      0.44        35
weighted avg       0.64      0.80      0.71        35

Confusion matrix:
 [[28  0]
 [ 7  0]]
Test Accuracy: 0.8611111111111112
Test F1: 0.2857142857142857
              precision    recall  f1-score   support

           0       0.86      1.00      0.92        30
           1       1.00      0.17      0.29         6

    accuracy                           0.86        36
   macro avg       0.93      0.58      0.60        36
weighted avg     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.3s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.8s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=None, tfidf__min_df=1, tfidf_

[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.3s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=3, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   1.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf__ngram_range=(1, 1); total time=   0.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf_

[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=3, tfidf__ngram_range=(1, 1); total time=   0.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=3, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=3, tfidf__ngram_range=(1, 2); total time=   1.4s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf_

[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   1.4s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.3s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=3, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf__ngram_range=(1, 1); total time=   0.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf__ngram_range=(1, 2); total time=   1.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=None, tfidf__min_df=2, tfidf_

[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.2s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.8s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf_

[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   1.4s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.3s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=2000, tfidf__min_df=3, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 1); total time=   0.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=2, tfidf__ngram_range=(1, 2); total time=   1.6s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=5000, tfidf__min_df=3, tfidf__ngram_range=(1, 2); total time=   1.5s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=None, tfidf__min_df=1, tfidf__ngram_range=(1, 2); total time=   3.0s
[CV] END clf__C=0.01, tfidf__max_df=0.8, tfidf__max_features=None, tfidf__min_df=3, tfidf_

In [None]:
문제는 코드가 아니라 데이터, 클래스 불균형, 모델 선택에 있음

In [3]:
import numpy as np
import re
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# -------------------------
# 1. 텍스트 전처리
# -------------------------
def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9 ]", " ", text)
    return text.strip()

# -------------------------
# 2. 토크나이징
# -------------------------
def tokenize(text):
    return text.split()

# -------------------------
# 3. GloVe 로딩
# -------------------------
def load_glove(path, dim=100):
    print("🔹 GloVe 임베딩 로딩 중...")
    embeddings = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f):
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    print(f"로드 완료! | 단어 수: {len(embeddings)}")
    return embeddings

# -------------------------
# 4. 문장 → 임베딩 시퀀스 변환
# -------------------------
def sentence_to_vectors(tokens, glove, dim=100, max_len=30):
    vectors = []
    for tok in tokens[:max_len]:
        if tok in glove:
            vectors.append(glove[tok])
        else:
            vectors.append(np.zeros(dim))
    # padding
    while len(vectors) < max_len:
        vectors.append(np.zeros(dim))
    return np.array(vectors)

# -------------------------
# 5. NumPy LSTM 구현
# -------------------------
class NumpyLSTM:
    def __init__(self, input_dim, hidden_dim):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        
        # Xavier 초기화
        scale = 1.0 / np.sqrt(hidden_dim)

        self.W_f = np.random.randn(hidden_dim, input_dim) * scale
        self.U_f = np.random.randn(hidden_dim, hidden_dim) * scale
        self.b_f = np.zeros((hidden_dim, 1))

        self.W_i = np.random.randn(hidden_dim, input_dim) * scale
        self.U_i = np.random.randn(hidden_dim, hidden_dim) * scale
        self.b_i = np.zeros((hidden_dim, 1))

        self.W_c = np.random.randn(hidden_dim, input_dim) * scale
        self.U_c = np.random.randn(hidden_dim, hidden_dim) * scale
        self.b_c = np.zeros((hidden_dim, 1))

        self.W_o = np.random.randn(hidden_dim, input_dim) * scale
        self.U_o = np.random.randn(hidden_dim, hidden_dim) * scale
        self.b_o = np.zeros((hidden_dim, 1))

    def step(self, x_t, h_prev, c_prev):
        x_t = x_t.reshape(-1, 1)

        f_t = sigmoid(self.W_f @ x_t + self.U_f @ h_prev + self.b_f)
        i_t = sigmoid(self.W_i @ x_t + self.U_i @ h_prev + self.b_i)
        o_t = sigmoid(self.W_o @ x_t + self.U_o @ h_prev + self.b_o)
        c_hat = np.tanh(self.W_c @ x_t + self.U_c @ h_prev + self.b_c)

        c_t = f_t * c_prev + i_t * c_hat
        h_t = o_t * np.tanh(c_t)

        return h_t, c_t
    
    def forward(self, sequence):
        h = np.zeros((self.hidden_dim, 1))
        c = np.zeros((self.hidden_dim, 1))

        for x_t in sequence:
            h, c = self.step(x_t, h, c)
        return h.flatten()   # 마지막 hidden state 반환


def sigmoid(x):
    return 1 / (1 + np.exp(-x))

# -------------------------
# 6. 전체 파이프라인 실행
# -------------------------
# 예시 데이터 (교수님이 말한 단순 데이터)
texts = [
    "This movie is great",
    "I hated this film",
    "What a wonderful story",
    "Terrible acting and bad plot",
]
labels = [1, 0, 1, 0]

texts = [clean_text(t) for t in texts]
tokenized = [tokenize(t) for t in texts]

# GloVe 로딩
glove = load_glove("glove.6B.100d.txt", dim=100)

# 문장 → 벡터 변환
X = np.array([
    sentence_to_vectors(tokens, glove, dim=100, max_len=30)
    for tokens in tokenized
])

# LSTM 인코딩
print("🔹 LSTM 인코딩 중...")
lstm = NumpyLSTM(input_dim=100, hidden_dim=64)

X_encoded = np.array([lstm.forward(sentence) for sentence in X])

# Train/Test split
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded, labels, test_size=0.25, random_state=42
)

# -------------------------
# 7. Scikit-learn 분류기 학습
# -------------------------
clf = LogisticRegression()
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
print("\n📌 분류 성능:\n")
print(classification_report(y_test, pred))


🔹 GloVe 임베딩 로딩 중...


400000it [00:06, 61563.62it/s]

로드 완료! | 단어 수: 400000
🔹 LSTM 인코딩 중...

📌 분류 성능:

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       1.0
           1       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [4]:
"""
모델이 잘못된 게 아니라 데이터가 너무 적어서 생기는 현상
1) 데이터 4개 → train 3개, test 1개

test가 1개면 정밀도/재현률 계산이 불가능

스케일에 민감한 Logistic Regression이 3개 데이터로는 학습이 불가

2) LSTM 파라미터는 랜덤 초기화

LSTM은 backpropagation으로 학습해야 의미 있는 representation을 만든다.
하지만 지금은 “학습 없는 LSTM 인코더” → 사실상 랜덤 feature extractor.
"""

SyntaxError: invalid syntax (3943797197.py, line 1)

In [7]:
# GloVe + NumPy LSTM + SVM 파이프라인 (Jupyter에서 실행)
# 전제: glove.6B.100d.txt 가 현재 작업 디렉토리에 있음
# 주의: 대량의 데이터면 메모리/속도 고려해야 함

import re
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, f1_score, accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

# --------------------------
# 1) 전처리 함수
# --------------------------
def clean_text(text):
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    text = re.sub(r"\s+", " ", text).strip()
    return text

def tokenize(text):
    return text.split()

# --------------------------
# 2) GloVe 로딩 (이미 다운된 파일 사용)
# --------------------------
def load_glove(path, dim=100):
    glove = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in tqdm(f, desc="Loading GloVe"):
            parts = line.rstrip().split(" ")
            word = parts[0]
            vec = np.asarray(parts[1:], dtype=np.float32)
            if vec.shape[0] == dim:
                glove[word] = vec
    return glove

# 예: glove = load_glove("glove.6B.100d.txt", dim=100)
# EMBDIM = 100

# --------------------------
# 3) 문장 → GloVe 시퀀스 변환
# --------------------------
def sentence_to_seq(tokens, glove, dim=100, max_len=30):
    seq = []
    for t in tokens[:max_len]:
        if t in glove:
            seq.append(glove[t])
        else:
            seq.append(np.zeros(dim, dtype=np.float32))
    # padding
    while len(seq) < max_len:
        seq.append(np.zeros(dim, dtype=np.float32))
    return np.array(seq)  # shape: (max_len, dim)

# --------------------------
# 4) 간단한 NumPy LSTM (forward only)
#    - 학습하지 않고 feature extractor로 사용
# --------------------------
class NumPyLSTMEncoder:
    def __init__(self, input_dim, hidden_dim, seed=42):
        np.random.seed(seed)
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        # 작은 값으로 초기화
        r = 0.1
        self.W_f = np.random.randn(hidden_dim, input_dim) * r
        self.U_f = np.random.randn(hidden_dim, hidden_dim) * r
        self.b_f = np.zeros((hidden_dim,1))

        self.W_i = np.random.randn(hidden_dim, input_dim) * r
        self.U_i = np.random.randn(hidden_dim, hidden_dim) * r
        self.b_i = np.zeros((hidden_dim,1))

        self.W_c = np.random.randn(hidden_dim, input_dim) * r
        self.U_c = np.random.randn(hidden_dim, hidden_dim) * r
        self.b_c = np.zeros((hidden_dim,1))

        self.W_o = np.random.randn(hidden_dim, input_dim) * r
        self.U_o = np.random.randn(hidden_dim, hidden_dim) * r
        self.b_o = np.zeros((hidden_dim,1))

    @staticmethod
    def _sigmoid(x):
        return 1.0 / (1.0 + np.exp(-x))

    def forward(self, seq):  # seq: (T, input_dim)
        h = np.zeros((self.hidden_dim,1))
        c = np.zeros((self.hidden_dim,1))
        for t in range(seq.shape[0]):
            x = seq[t].reshape(-1,1)
            f = self._sigmoid(self.W_f @ x + self.U_f @ h + self.b_f)
            i = self._sigmoid(self.W_i @ x + self.U_i @ h + self.b_i)
            o = self._sigmoid(self.W_o @ x + self.U_o @ h + self.b_o)
            c_hat = np.tanh(self.W_c @ x + self.U_c @ h + self.b_c)
            c = f * c + i * c_hat
            h = o * np.tanh(c)
        return h.flatten()  # (hidden_dim,)

# --------------------------
# 5) 보조 피처: 평균 임베딩, TF-IDF 가중 평균
# --------------------------
def mean_glove_vector(tokens, glove, dim=100):
    vecs = [glove[t] for t in tokens if t in glove]
    if len(vecs) == 0:
        return np.zeros(dim, dtype=np.float32)
    return np.mean(vecs, axis=0)

def tfidf_weighted_avg(texts, glove, dim=100, tfidf=None):
    # tfidf: fitted TfidfVectorizer (if None, will fit inside)
    if tfidf is None:
        tfidf = TfidfVectorizer()
        tfidf.fit(texts)
    X_tfidf = tfidf.transform(texts)  # sparse
    feature_names = tfidf.get_feature_names_out()
    # build word->colidx mapping
    col_idx = {w:i for i,w in enumerate(feature_names)}
    res = []
    for doc_idx, txt in enumerate(texts):
        tokens = txt.split()
        weight_sum = 0.0
        vec = np.zeros(dim, dtype=np.float32)
        for w in tokens:
            if w in col_idx and w in glove:
                w_idx = col_idx[w]
                weight = X_tfidf[doc_idx, w_idx]
                vec += glove[w] * weight
                weight_sum += weight
        if weight_sum > 0:
            vec = vec / weight_sum
        res.append(vec)
    return np.array(res), tfidf

# --------------------------
# 6) 전체 파이프라인 함수
# --------------------------
def build_feature_matrix(texts, glove, lstm_encoder, tfidf=None, dim=100, max_len=30):
    texts_clean = [clean_text(t) for t in texts]
    tokens_list = [tokenize(t) for t in texts_clean]

    # mean glove
    mean_vectors = np.array([mean_glove_vector(tokens, glove, dim) for tokens in tokens_list])

    # tfidf weighted avg (if tfidf provided, reuse)
    tfidf_weighted, tfidf = tfidf_weighted_avg(texts_clean, glove, dim, tfidf=tfidf)

    # LSTM last hidden
    seqs = [sentence_to_seq(tokens, glove, dim=dim, max_len=max_len) for tokens in tokens_list]
    lstm_features = np.array([lstm_encoder.forward(seq) for seq in seqs])  # (N, hidden_dim)

    # length feature
    lengths = np.array([[len(tokens)] for tokens in tokens_list], dtype=np.float32)

    # concatenate features: mean_glove | tfidf_weighted | lstm_hidden | length
    X = np.hstack([mean_vectors, tfidf_weighted, lstm_features, lengths])
    return X, tfidf

# --------------------------
# 7) 예시: 실제 데이터 불러와 실행
# --------------------------
# 파일 이름은 네 환경에 맞게 수정
train_df = pd.read_csv("train_ready.csv")
dev_df   = pd.read_csv("dev_ready.csv")
test_df  = pd.read_csv("test_ready.csv")

# combine train+dev for tfidf stability (optional)
all_train_texts = (train_df["text"].fillna("").tolist() + dev_df["text"].fillna("").tolist())
# load glove
EMB_DIM = 100
glove = load_glove("glove.6B.100d.txt", dim=EMB_DIM)

# LSTM encoder
lstm_encoder = NumPyLSTMEncoder(input_dim=EMB_DIM, hidden_dim=64, seed=123)

# Build features for train
train_texts = train_df["text"].fillna("").tolist()
dev_texts = dev_df["text"].fillna("").tolist()
test_texts = test_df["text"].fillna("").tolist()

# fit TF-IDF on train+dev for more stable idf
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(ngram_range=(1,2), min_df=1)
tfidf.fit(all_train_texts)

X_train, _ = build_feature_matrix(train_texts, glove, lstm_encoder, tfidf=tfidf, dim=EMB_DIM, max_len=30)
X_dev, _   = build_feature_matrix(dev_texts, glove, lstm_encoder, tfidf=tfidf, dim=EMB_DIM, max_len=30)
X_test, _  = build_feature_matrix(test_texts, glove, lstm_encoder, tfidf=tfidf, dim=EMB_DIM, max_len=30)

y_train = train_df["label"].astype(int).values
y_dev   = dev_df["label"].astype(int).values
y_test  = test_df["label"].astype(int).values

# --------------------------
# 8) 스케일링 / 차원축소 (선택)
# --------------------------
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_dev_s = scaler.transform(X_dev)
X_test_s = scaler.transform(X_test)

# (선택) PCA로 차원 줄이기 — 안정성 필요하면 사용
# pca = PCA(n_components=0.95)
# X_train_s = pca.fit_transform(X_train_s)
# X_dev_s = pca.transform(X_dev_s)
# X_test_s = pca.transform(X_test_s)

# --------------------------
# 9) SVM 학습 및 평가
# --------------------------
svm = SVC(kernel="linear", probability=True, class_weight="balanced")
svm.fit(X_train_s, y_train)

def evaluate_model(model, X, y, name=""):
    preds = model.predict(X)
    probs = None
    try:
        probs = model.predict_proba(X)[:,1]
    except:
        pass
    print(f"=== {name} ===")
    print("Accuracy:", accuracy_score(y, preds))
    print("F1:", f1_score(y, preds, zero_division=0))
    print(classification_report(y, preds, zero_division=0))
    print("Confusion matrix:\n", confusion_matrix(y, preds))

evaluate_model(svm, X_train_s, y_train, "Train")
evaluate_model(svm, X_dev_s, y_dev, "Dev")
evaluate_model(svm, X_test_s, y_test, "Test")

# --------------------------
# 10) 저장 (모델, scaler, tfidf)
# --------------------------
import joblib
joblib.dump(svm, "svm_glove_lstm_svm.pkl")
joblib.dump(scaler, "scaler_glove_lstm.pkl")
joblib.dump(tfidf, "tfidf_glove.pkl")
print("저장 완료: svm_glove_lstm_svm.pkl, scaler_glove_lstm.pkl, tfidf_glove.pkl")


Loading GloVe: 400000it [00:06, 60248.29it/s]


=== Train ===
Accuracy: 1.0
F1: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       228
           1       1.00      1.00      1.00        53

    accuracy                           1.00       281
   macro avg       1.00      1.00      1.00       281
weighted avg       1.00      1.00      1.00       281

Confusion matrix:
 [[228   0]
 [  0  53]]
=== Dev ===
Accuracy: 0.8
F1: 0.5333333333333333
              precision    recall  f1-score   support

           0       0.89      0.86      0.87        28
           1       0.50      0.57      0.53         7

    accuracy                           0.80        35
   macro avg       0.69      0.71      0.70        35
weighted avg       0.81      0.80      0.80        35

Confusion matrix:
 [[24  4]
 [ 3  4]]
=== Test ===
Accuracy: 0.6666666666666666
F1: 0.25
              precision    recall  f1-score   support

           0       0.85      0.73      0.79        30
           1       0.

In [None]:
"""
전체 결과를 보면 훈련은 완전 과적합(accuracy 1.0), Dev·Test는 중간 정도 성능(0.8 → 0.66)
즉, 모델 구조 자체는 정상적으로 동작했지만 데이터 적음 + LSTM 단순화 + SVM 조합의 한계 + 과적합 때문에 이런 결과가 나옴
"""

In [15]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
import re

train_df = pd.read_csv("ENNI_train.csv")
dev_df   = pd.read_csv("ENNI_dev.csv")
test_df  = pd.read_csv("ENNI_test.csv")

# 텍스트 파일(.cha → txt 전처리 완료되어 있다고 가정)
train_files = train_df["filename"].tolist()
dev_files   = dev_df["filename"].tolist()
test_files  = test_df["filename"].tolist()

train_y = train_df["group"].tolist()
dev_y   = dev_df["group"].tolist()
test_y  = test_df["group"].tolist()

def load_cha_text(filepath):
    lines = []
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if line.startswith("*CHI"):  # 아동 발화만 사용 (기본 ENNI 관례)
                clean = re.sub(r"[^\w\s']", " ", line)
                clean = re.sub(r"\s+", " ", clean).strip()
                lines.append(clean.lower())
    return " ".join(lines)

def load_glove(glove_path="glove.6B.300d.txt"):
    glove = {}
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vec = np.array(parts[1:], dtype="float32")
            glove[word] = vec
    print("GloVe loaded:", len(glove))
    return glove

glove = load_glove()

class NumPyLSTM:
    def __init__(self, input_dim=300, hidden_dim=128):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        
        h = hidden_dim
        d = input_dim
        
        # Xavier 초기화
        self.Wf = np.random.randn(h, h + d) / np.sqrt(h + d)
        self.Wi = np.random.randn(h, h + d) / np.sqrt(h + d)
        self.Wc = np.random.randn(h, h + d) / np.sqrt(h + d)
        self.Wo = np.random.randn(h, h + d) / np.sqrt(h + d)
        
    def step(self, x, h_prev, c_prev):
        z = np.concatenate([h_prev, x])  # [hidden_dim + input_dim]

        f = self.sigmoid(self.Wf @ z)
        i = self.sigmoid(self.Wi @ z)
        c_hat = np.tanh(self.Wc @ z)
        c = f * c_prev + i * c_hat
        o = self.sigmoid(self.Wo @ z)
        h = o * np.tanh(c)

        return h, c
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def encode(self, seq):
        h = np.zeros(self.hidden_dim)
        c = np.zeros(self.hidden_dim)
        for x in seq:
            h, c = self.step(x, h, c)
        return h

lstm = NumPyLSTM()

def sentence_to_glove_seq(text, glove):
    seq = []
    for w in text.split():
        if w in glove:
            seq.append(glove[w])
    if len(seq) == 0:  # 빈 문장 방지
        seq.append(np.zeros(300))
    return seq

def encode_file(filepath, glove, lstm):
    text = load_cha_text(filepath)
    seq = sentence_to_glove_seq(text, glove)
    return lstm.encode(seq)

def encode_dataset(file_list, glove, lstm):
    vectors = []
    for f in file_list:
        vec = encode_file(f, glove, lstm)
        vectors.append(vec)
    return np.array(vectors)

train_vec = encode_dataset(train_files, glove, lstm)
dev_vec   = encode_dataset(dev_files, glove, lstm)
test_vec  = encode_dataset(test_files, glove, lstm)

def train_and_evaluate(train_x, train_y, dev_x, dev_y, test_x, test_y):

    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    dev_x   = scaler.transform(dev_x)
    test_x  = scaler.transform(test_x)

    # SVM + Grid Search
    params = {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
        "gamma": ["scale", "auto"]
    }

    svm = SVC()
    clf = GridSearchCV(svm, params, cv=3, scoring="accuracy", n_jobs=-1)
    clf.fit(train_x, train_y)

    print("Best Params:", clf.best_params_)

    print("\n=== Dev 성능 ===")
    pred_dev = clf.predict(dev_x)
    print("Accuracy:", accuracy_score(dev_y, pred_dev))
    print(classification_report(dev_y, pred_dev))

    print("\n=== Test 성능 ===")
    pred_test = clf.predict(test_x)
    print("Accuracy:", accuracy_score(test_y, pred_test))
    print(classification_report(test_y, pred_test))

    return clf, scaler

clf, scaler = train_and_evaluate(train_vec, train_y, dev_vec, dev_y, test_vec, test_y)


GloVe loaded: 400000
Best Params: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}

=== Dev 성능 ===
Accuracy: 0.8
              precision    recall  f1-score   support

         SLI       0.00      0.00      0.00         7
          TD       0.80      1.00      0.89        28

    accuracy                           0.80        35
   macro avg       0.40      0.50      0.44        35
weighted avg       0.64      0.80      0.71        35


=== Test 성능 ===
Accuracy: 0.8333333333333334
              precision    recall  f1-score   support

         SLI       0.00      0.00      0.00         6
          TD       0.83      1.00      0.91        30

    accuracy                           0.83        36
   macro avg       0.42      0.50      0.45        36
weighted avg       0.69      0.83      0.76        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
"""모델이 SLI를 단 1개도 예측하지 않음

→ Dev/Test에서 SLI precision, recall, F1 = 0

왜 이렇게 됐는가?
1) 데이터 라벨 비율이 극단적으로 불균형

ENNI는 보통 TD가 훨씬 많고 SLI가 적음.
→ 단순 SVM은 “TD만 찍어도 정확도는 높음” 따라서 그 방향으로 학습함.

2) GloVe + NumPy LSTM 인코더의 표현력이 낮음

아동 발화는 짧고 문법이 깨져 있음

GloVe는 adult corpus 기반

NumPy LSTM은 학습되지 않은 랜덤 가중치(=feature extractor quality 낮음)

→ 모델이 SLI의 특징을 전혀 분리 못함.

3) 선형/비선형 SVM은 imbalance 보정 없으면 collapse 발생
"""

In [17]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix

import re

train_df = pd.read_csv("ENNI_train.csv")
dev_df   = pd.read_csv("ENNI_dev.csv")
test_df  = pd.read_csv("ENNI_test.csv")

# 텍스트 파일(.cha → txt 전처리 완료되어 있다고 가정)
train_files = train_df["filename"].tolist()
dev_files   = dev_df["filename"].tolist()
test_files  = test_df["filename"].tolist()

train_y = train_df["group"].tolist()
dev_y   = dev_df["group"].tolist()
test_y  = test_df["group"].tolist()

def load_cha_text(filepath):
    lines = []
    with open(filepath, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            if line.startswith("*CHI"):  # 아동 발화만 사용 (기본 ENNI 관례)
                clean = re.sub(r"[^\w\s']", " ", line)
                clean = re.sub(r"\s+", " ", clean).strip()
                lines.append(clean.lower())
    return " ".join(lines)

def load_glove(glove_path="glove.6B.300d.txt"):
    glove = {}
    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            word = parts[0]
            vec = np.array(parts[1:], dtype="float32")
            glove[word] = vec
    print("GloVe loaded:", len(glove))
    return glove

glove = load_glove()

class NumPyLSTM:
    def __init__(self, input_dim=300, hidden_dim=128):
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        
        h = hidden_dim
        d = input_dim
        
        # Xavier 초기화
        self.Wf = np.random.randn(h, h + d) / np.sqrt(h + d)
        self.Wi = np.random.randn(h, h + d) / np.sqrt(h + d)
        self.Wc = np.random.randn(h, h + d) / np.sqrt(h + d)
        self.Wo = np.random.randn(h, h + d) / np.sqrt(h + d)
        
    def step(self, x, h_prev, c_prev):
        z = np.concatenate([h_prev, x])  # [hidden_dim + input_dim]

        f = self.sigmoid(self.Wf @ z)
        i = self.sigmoid(self.Wi @ z)
        c_hat = np.tanh(self.Wc @ z)
        c = f * c_prev + i * c_hat
        o = self.sigmoid(self.Wo @ z)
        h = o * np.tanh(c)

        return h, c
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def encode(self, seq):
        h = np.zeros(self.hidden_dim)
        c = np.zeros(self.hidden_dim)
        for x in seq:
            h, c = self.step(x, h, c)
        return h

lstm = NumPyLSTM()

def sentence_to_glove_seq(text, glove):
    seq = []
    for w in text.split():
        if w in glove:
            seq.append(glove[w])
    if len(seq) == 0:  # 빈 문장 방지
        seq.append(np.zeros(300))
    return seq

def encode_file(filepath, glove, lstm):
    text = load_cha_text(filepath)
    seq = sentence_to_glove_seq(text, glove)
    return lstm.encode(seq)

def encode_dataset(file_list, glove, lstm):
    vectors = []
    for f in file_list:
        vec = encode_file(f, glove, lstm)
        vectors.append(vec)
    return np.array(vectors)

train_vec = encode_dataset(train_files, glove, lstm)
dev_vec   = encode_dataset(dev_files, glove, lstm)
test_vec  = encode_dataset(test_files, glove, lstm)

def train_and_eval(train_x, train_y, dev_x, dev_y, test_x, test_y):

    # 🔹 GridSearchCV 파라미터에 class_weight='balanced' 추가
    params = {
        "C": [0.1, 1, 10],
        "kernel": ["linear", "rbf"],
        "gamma": ["scale", "auto"],
        "class_weight": ["balanced"]   # ⭐ 핵심 추가
    }

    svm = SVC()
    clf = GridSearchCV(
        svm,
        params,
        cv=3,
        scoring='f1_macro',    # 불균형일 때 macro f1이 더 적절
        n_jobs=-1
    )

    clf.fit(train_x, train_y)

    print("Best Params:", clf.best_params_)
    best_model = clf.best_estimator_

    print("\n=== Dev 성능 ===")
    dev_pred = best_model.predict(dev_x)
    print(classification_report(dev_y, dev_pred))
    print(confusion_matrix(dev_y, dev_pred))

    print("\n=== Test 성능 ===")
    test_pred = best_model.predict(test_x)
    print(classification_report(test_y, test_pred))
    print(confusion_matrix(test_y, test_pred))

    return best_model
clf = train_and_eval(train_vec, train_y, dev_vec, dev_y, test_vec, test_y)


GloVe loaded: 400000
Best Params: {'C': 0.1, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}

=== Dev 성능 ===
              precision    recall  f1-score   support

         SLI       0.27      0.57      0.36         7
          TD       0.85      0.61      0.71        28

    accuracy                           0.60        35
   macro avg       0.56      0.59      0.54        35
weighted avg       0.73      0.60      0.64        35

[[ 4  3]
 [11 17]]

=== Test 성능 ===
              precision    recall  f1-score   support

         SLI       0.24      0.67      0.35         6
          TD       0.89      0.57      0.69        30

    accuracy                           0.58        36
   macro avg       0.57      0.62      0.52        36
weighted avg       0.78      0.58      0.64        36

[[ 4  2]
 [13 17]]


In [None]:
"""
LI recall이 0% → 67%로 폭증

Dev: SLI recall 57%

Test: SLI recall 67%

즉, **SLI를 실제로 '잡기 시작'**했어.
이건 불균형 데이터 개선에서 반드시 필요한 핵심 변화임.

❌ 하지만 TD 성능이 떨어짐

TD recall이 100 → 57로 감소

Accuracy도 0.8 → 0.58로 내려감

즉, 모델이 SLI를 잡는 대신 TD를 SLI로 더 오판하는 방향
"""

In [18]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# =========================
# 1. GloVe 로드
# =========================
def load_glove(glove_path):
    embeddings = {}
    with open(glove_path, "r", encoding="utf8") as f:
        for line in tqdm(f, desc="Loading GloVe"):
            values = line.rstrip().split(" ")
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings


# =========================
# 2. 전처리 및 문장 → 임베딩 변환
# =========================
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return text.split()

def sentence_to_embedding(sentence, glove, dim=300):
    tokens = preprocess(sentence)
    vecs = [glove[t] for t in tokens if t in glove]
    if len(vecs) == 0:
        return np.zeros((1, dim))
    return np.array(vecs)


# =========================
# 3. NumPy LSTM Encoder
# =========================
class NumpyLSTM:
    def __init__(self, input_dim, hidden_dim):
        self.hidden_dim = hidden_dim

        self.Wf = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bf = np.zeros((hidden_dim, 1))

        self.Wi = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bi = np.zeros((hidden_dim, 1))

        self.Wc = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bc = np.zeros((hidden_dim, 1))

        self.Wo = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bo = np.zeros((hidden_dim, 1))

    def step(self, x, h_prev, c_prev):
        concat = np.vstack((h_prev, x))

        f = self.sigmoid(self.Wf @ concat + self.bf)
        i = self.sigmoid(self.Wi @ concat + self.bi)
        c_hat = np.tanh(self.Wc @ concat + self.bc)
        c = f * c_prev + i * c_hat
        o = self.sigmoid(self.Wo @ concat + self.bo)
        h = o * np.tanh(c)
        return h, c

    def encode(self, sequence):
        h = np.zeros((self.hidden_dim, 1))
        c = np.zeros((self.hidden_dim, 1))
        for word_vec in sequence:
            x = word_vec.reshape(-1, 1)
            h, c = self.step(x, h, c)
        return h.reshape(-1)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))


# =========================
# 4. Dataset → 벡터로 변환
# =========================
def encode_dataset(texts, glove, lstm):
    vectors = []
    for sent in texts:
        seq = sentence_to_embedding(sent, glove)
        h = lstm.encode(seq)
        vectors.append(h)
    return np.array(vectors)


# =========================
# 5. SVM 학습 (rbf 고정 + balanced)
# =========================
def train_svm_balanced(train_x, train_y, dev_x, dev_y, test_x, test_y):

    print("스케일러 적용 중...")
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    dev_x = scaler.transform(dev_x)
    test_x = scaler.transform(test_x)

    # GridSearch: rbf 커널 고정
    param_grid = {
        "C": [0.01, 0.1, 1, 10],
        "gamma": ["scale", "auto"]
    }

    svm = SVC(
        kernel="rbf",
        class_weight="balanced"
    )

    grid = GridSearchCV(
        svm,
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=2
    )
    grid.fit(train_x, train_y)

    print("\nBest Params:", grid.best_params_)
    best_svm = grid.best_estimator_

    # -------------------
    print("\n=== Dev 성능 ===")
    pred_dev = best_svm.predict(dev_x)
    print(classification_report(dev_y, pred_dev))
    print(confusion_matrix(dev_y, pred_dev))

    # -------------------
    print("\n=== Test 성능 ===")
    pred_test = best_svm.predict(test_x)
    print(classification_report(test_y, pred_test))
    print(confusion_matrix(test_y, pred_test))

    return best_svm, scaler, grid.best_params__


# =========================
# 6. 실제 실행
# =========================

glove_path = "glove.6B.300d.txt"
glove = load_glove(glove_path)
print("GloVe loaded:", len(glove))

# LSTM 인코더
lstm = NumpyLSTM(input_dim=300, hidden_dim=128)

# 텍스트 → 벡터
train_vec = encode_dataset(train_texts, glove, lstm)
dev_vec   = encode_dataset(dev_texts, glove, lstm)
test_vec  = encode_dataset(test_texts, glove, lstm)

# SVM 학습 실행
svm_model, scaler, params = train_svm_balanced(
    train_vec, train_y,
    dev_vec, dev_y,
    test_vec, test_y
)


Loading GloVe: 400000it [00:17, 22491.44it/s]


GloVe loaded: 400000
스케일러 적용 중...
Fitting 5 folds for each of 8 candidates, totalling 40 fits

Best Params: {'C': 0.01, 'gamma': 'scale'}

=== Dev 성능 ===
              precision    recall  f1-score   support

         SLI       0.20      1.00      0.33         7
          TD       0.00      0.00      0.00        28

    accuracy                           0.20        35
   macro avg       0.10      0.50      0.17        35
weighted avg       0.04      0.20      0.07        35

[[ 7  0]
 [28  0]]

=== Test 성능 ===
              precision    recall  f1-score   support

         SLI       0.17      1.00      0.29         6
          TD       0.00      0.00      0.00        30

    accuracy                           0.17        36
   macro avg       0.08      0.50      0.14        36
weighted avg       0.03      0.17      0.05        36

[[ 6  0]
 [30  0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


AttributeError: 'GridSearchCV' object has no attribute 'best_params__'

[CV] END .................................C=0.01, gamma=auto; total time=   2.5s
[CV] END .................................C=0.01, gamma=auto; total time=   0.0s
[CV] END .................................C=0.01, gamma=auto; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END .................................C=0.1, gamma=scale; total time=   0.0s
[CV] END ..................................C=0.1, gamma=auto; total time=   0.0s
[CV] END ..................................C=0.1, gamma=auto; total time=   0.0s
[CV] END ..................................C=0.1, gamma=auto; total time=   0.0s
[CV] END ..................................C=0.1, gamma=auto; total time=   0.0s
[CV] END ...................

In [19]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

# =========================
# 1. GloVe 로드
# =========================
def load_glove(glove_path):
    embeddings = {}
    with open(glove_path, "r", encoding="utf8") as f:
        for line in tqdm(f, desc="Loading GloVe"):
            values = line.rstrip().split(" ")
            word = values[0]
            vector = np.asarray(values[1:], dtype='float32')
            embeddings[word] = vector
    return embeddings


# =========================
# 2. 전처리 및 문장 → 임베딩 변환
# =========================
def preprocess(text):
    text = text.lower()
    text = re.sub(r"[^a-z0-9\s]", " ", text)
    return text.split()

def sentence_to_embedding(sentence, glove, dim=300):
    tokens = preprocess(sentence)
    vecs = [glove[t] for t in tokens if t in glove]
    if len(vecs) == 0:
        return np.zeros((1, dim))
    return np.array(vecs)


# =========================
# 3. NumPy LSTM Encoder
# =========================
class NumpyLSTM:
    def __init__(self, input_dim, hidden_dim):
        self.hidden_dim = hidden_dim

        self.Wf = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bf = np.zeros((hidden_dim, 1))

        self.Wi = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bi = np.zeros((hidden_dim, 1))

        self.Wc = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bc = np.zeros((hidden_dim, 1))

        self.Wo = np.random.randn(hidden_dim, input_dim + hidden_dim) * 0.1
        self.bo = np.zeros((hidden_dim, 1))

    def step(self, x, h_prev, c_prev):
        concat = np.vstack((h_prev, x))

        f = self.sigmoid(self.Wf @ concat + self.bf)
        i = self.sigmoid(self.Wi @ concat + self.bi)
        c_hat = np.tanh(self.Wc @ concat + self.bc)
        c = f * c_prev + i * c_hat
        o = self.sigmoid(self.Wo @ concat + self.bo)
        h = o * np.tanh(c)
        return h, c

    def encode(self, sequence):
        h = np.zeros((self.hidden_dim, 1))
        c = np.zeros((self.hidden_dim, 1))
        for word_vec in sequence:
            x = word_vec.reshape(-1, 1)
            h, c = self.step(x, h, c)
        return h.reshape(-1)

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))


# =========================
# 4. Dataset → 벡터로 변환
# =========================
def encode_dataset(texts, glove, lstm):
    vectors = []
    for sent in texts:
        seq = sentence_to_embedding(sent, glove)
        h = lstm.encode(seq)
        vectors.append(h)
    return np.array(vectors)

# --------------------------------------------
# SVM 학습 함수 (개선된 version a)
# --------------------------------------------
def train_svm_balanced(train_x, train_y, dev_x, dev_y, test_x, test_y):
    print("스케일러 적용 중...")
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    dev_x   = scaler.transform(dev_x)
    test_x  = scaler.transform(test_x)

    # GridSearch 범위 확장 (collapse 방지)
    param_grid = {
        'C': [0.1, 1, 3, 10],
        'kernel': ['linear'],
        'gamma': ['scale'],
        'class_weight': ['balanced']
    }

    svm = SVC()

    grid = GridSearchCV(
        svm,
        param_grid,
        cv=5,
        scoring='f1_macro',
        verbose=1,
        n_jobs=-1
    )

    grid.fit(train_x, train_y)

    print("\nBest Params:", grid.best_params_)

    best_svm = grid.best_estimator_

    # ----------------------------
    # Dev 성능
    # ----------------------------
    print("\n=== Dev 성능 ===")
    pred_dev = best_svm.predict(dev_x)
    print(classification_report(dev_y, pred_dev))
    print(confusion_matrix(dev_y, pred_dev))

    # ----------------------------
    # Test 성능
    # ----------------------------
    print("\n=== Test 성능 ===")
    pred_test = best_svm.predict(test_x)
    print(classification_report(test_y, pred_test))
    print(confusion_matrix(test_y, pred_test))

    return best_svm, scaler, grid.best_params_

# =========================
# 6. 실제 실행
# =========================

glove_path = "glove.6B.300d.txt"
glove = load_glove(glove_path)
print("GloVe loaded:", len(glove))

# LSTM 인코더
lstm = NumpyLSTM(input_dim=300, hidden_dim=128)

# 텍스트 → 벡터
train_vec = encode_dataset(train_texts, glove, lstm)
dev_vec   = encode_dataset(dev_texts, glove, lstm)
test_vec  = encode_dataset(test_texts, glove, lstm)

# SVM 학습 실행
svm_model, scaler, params = train_svm_balanced(
    train_vec, train_y,
    dev_vec, dev_y,
    test_vec, test_y
)


Loading GloVe: 400000it [00:17, 22343.44it/s]


GloVe loaded: 400000
스케일러 적용 중...
Fitting 5 folds for each of 4 candidates, totalling 20 fits

Best Params: {'C': 10, 'class_weight': 'balanced', 'gamma': 'scale', 'kernel': 'linear'}

=== Dev 성능 ===
              precision    recall  f1-score   support

         SLI       0.33      0.43      0.38         7
          TD       0.85      0.79      0.81        28

    accuracy                           0.71        35
   macro avg       0.59      0.61      0.59        35
weighted avg       0.74      0.71      0.73        35

[[ 3  4]
 [ 6 22]]

=== Test 성능 ===
              precision    recall  f1-score   support

         SLI       0.10      0.17      0.12         6
          TD       0.81      0.70      0.75        30

    accuracy                           0.61        36
   macro avg       0.45      0.43      0.44        36
weighted avg       0.69      0.61      0.65        36

[[ 1  5]
 [ 9 21]]


In [None]:
"""
지금 결과에서 문제점 요약
✔ TD(다수 클래스)는 잘 맞춘다

Dev F1: 0.81

Test F1: 0.75

❗ SLI(소수 클래스) Recall이 낮다

Dev Recall: 0.43

Test Recall: 0.17

즉, SLI를 더 많이 잡아내도록 모델 편향을 보정해야 함.
"""

In [23]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm


# ================================
# 1. GloVe 불러오기
# ================================
def load_glove(path):
    glove = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=400000):
            values = line.split()
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
            glove[word] = vec
    print("GloVe loaded:", len(glove))
    return glove


# ================================
# 2. 간단한 NumPy LSTM 구현
# ================================
class NumpyLSTM:
    def __init__(self, input_dim, hidden_dim):
        self.hidden_dim = hidden_dim
        self.W = np.random.randn(input_dim + hidden_dim, 4 * hidden_dim) * 0.1
        self.b = np.zeros((4 * hidden_dim,))

    def forward(self, x):
        h = np.zeros((self.hidden_dim,))
        c = np.zeros((self.hidden_dim,))
        for t in range(x.shape[0]):
            xt = x[t]
            concat = np.concatenate([h, xt])
            gates = concat @ self.W + self.b

            i = 1 / (1 + np.exp(-gates[:self.hidden_dim]))         # input gate
            f = 1 / (1 + np.exp(-gates[self.hidden_dim:2*self.hidden_dim]))  # forget gate
            o = 1 / (1 + np.exp(-gates[2*self.hidden_dim:3*self.hidden_dim])) # output gate
            g = np.tanh(gates[3*self.hidden_dim:])                          # candidate

            c = f * c + i * g
            h = o * np.tanh(c)

        return h


# ================================
# 3. 텍스트 → GloVe → LSTM 인코딩
# ================================
def encode_sentence(text, glove, lstm, max_len=50):
    tokens = text.lower().split()
    vecs = []

    for tok in tokens[:max_len]:
        if tok in glove:
            vecs.append(glove[tok])
        else:
            vecs.append(np.zeros((300,)))

    if len(vecs) == 0:
        vecs.append(np.zeros((300,)))

    return lstm.forward(np.array(vecs))


def encode_dataset(texts, glove, lstm):
    return np.array([encode_sentence(t, glove, lstm) for t in texts])


# ================================
# 4. SVM(RBF 중심) + class_weight=balanced 탐색
# ================================
def train_svm_rbf(train_x, train_y, dev_x, dev_y, test_x, test_y):

    print("스케일러 적용 중...")
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    dev_x   = scaler.transform(dev_x)
    test_x  = scaler.transform(test_x)

    param_grid = {
        "C":     [0.1, 1, 10, 50],
        "gamma": ["scale", 0.01, 0.001],
        "kernel": ["rbf"],
        "class_weight": ["balanced"]
    }

    svm = SVC()

    grid = GridSearchCV(
        svm,
        param_grid,
        cv=5,
        n_jobs=-1
    )
    grid.fit(train_x, train_y)

    best_svm = grid.best_estimator_

    print("\nBest Params:", grid.best_params_)

    print("\n=== Dev 성능 ===")
    pred_dev = best_svm.predict(dev_x)
    print(classification_report(dev_y, pred_dev))
    print(confusion_matrix(dev_y, pred_dev))

    print("\n=== Test 성능 ===")
    pred_test = best_svm.predict(test_x)
    print(classification_report(test_y, pred_test))
    print(confusion_matrix(test_y, pred_test))

    return best_svm, scaler, grid.best_params_



# ================================
# 5. 실행 파트
# ================================
# 예시: 실제 사용 시 아래 변수만 데이터셋에 맞게 교체
glove_path = "glove.6B.300d.txt"

glove = load_glove(glove_path)
lstm = NumpyLSTM(input_dim=300, hidden_dim=128)

# 여러분의 분할된 텍스트/레이블 사용
# train_texts, train_y
# dev_texts, dev_y
# test_texts, test_y

train_vec = encode_dataset(train_texts, glove, lstm)
dev_vec   = encode_dataset(dev_texts, glove, lstm)
test_vec  = encode_dataset(test_texts, glove, lstm)

svm_model, scaler, best_params = train_svm_rbf(
    train_vec, train_y,
    dev_vec, dev_y,
    test_vec, test_y
)


100%|████████████████████████████████| 400000/400000 [00:17<00:00, 22526.14it/s]


GloVe loaded: 400000
스케일러 적용 중...

Best Params: {'C': 50, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

=== Dev 성능 ===
              precision    recall  f1-score   support

         SLI       0.00      0.00      0.00         7
          TD       0.78      0.89      0.83        28

    accuracy                           0.71        35
   macro avg       0.39      0.45      0.42        35
weighted avg       0.62      0.71      0.67        35

[[ 0  7]
 [ 3 25]]

=== Test 성능 ===
              precision    recall  f1-score   support

         SLI       0.43      0.50      0.46         6
          TD       0.90      0.87      0.88        30

    accuracy                           0.81        36
   macro avg       0.66      0.68      0.67        36
weighted avg       0.82      0.81      0.81        36

[[ 3  3]
 [ 4 26]]


In [None]:
"""
성능 해석(중요)

✔ TD는 dev/test 둘 다 0.85~0.93의 매우 좋은 성능
✔ SLI는 test에서 1/6 = 17%, dev는 0%

즉, 모델이 SLI를 거의 구별 못함 → 원인:

1) 클래스 불균형 (SLI가 너무 적음)

각각 train/dev/test에서 SLI가 6~7개 수준

SVM + 작은 샘플 → 거의 불가능

2) 문장 길이 짧거나 단서가 적으면 LSTM 인코더가 SLI를 못 잡음

Word embedding 계열에서는 특히 문제됨.
"""

In [24]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from tqdm import tqdm


# ================================
# 1. GloVe 불러오기
# ================================
def load_glove(path):
    glove = {}
    with open(path, 'r', encoding='utf-8') as f:
        for line in tqdm(f, total=400000):
            values = line.split()
            word = values[0]
            vec = np.asarray(values[1:], dtype='float32')
            glove[word] = vec
    print("GloVe loaded:", len(glove))
    return glove


# ================================
# 2. 간단한 NumPy LSTM 구현
# ================================
class NumpyLSTM:
    def __init__(self, input_dim, hidden_dim):
        self.hidden_dim = hidden_dim
        self.W = np.random.randn(input_dim + hidden_dim, 4 * hidden_dim) * 0.1
        self.b = np.zeros((4 * hidden_dim,))

    def forward(self, x):
        h = np.zeros((self.hidden_dim,))
        c = np.zeros((self.hidden_dim,))
        for t in range(x.shape[0]):
            xt = x[t]
            concat = np.concatenate([h, xt])
            gates = concat @ self.W + self.b

            i = 1 / (1 + np.exp(-gates[:self.hidden_dim]))         # input gate
            f = 1 / (1 + np.exp(-gates[self.hidden_dim:2*self.hidden_dim]))  # forget gate
            o = 1 / (1 + np.exp(-gates[2*self.hidden_dim:3*self.hidden_dim])) # output gate
            g = np.tanh(gates[3*self.hidden_dim:])                          # candidate

            c = f * c + i * g
            h = o * np.tanh(c)

        return h


# ================================
# 3. 텍스트 → GloVe → LSTM 인코딩
# ================================
def encode_sentence(text, glove, lstm, max_len=50):
    tokens = text.lower().split()
    vecs = []

    for tok in tokens[:max_len]:
        if tok in glove:
            vecs.append(glove[tok])
        else:
            vecs.append(np.zeros((300,)))

    if len(vecs) == 0:
        vecs.append(np.zeros((300,)))

    return lstm.forward(np.array(vecs))


def encode_dataset(texts, glove, lstm):
    return np.array([encode_sentence(t, glove, lstm) for t in texts])


# ================================
# 4. SVM(RBF 중심) + class_weight=balanced 탐색
# ================================

def train_svm_smote(train_x, train_y, dev_x, dev_y, test_x, test_y):

    print("스케일러 적용 중...")
    scaler = StandardScaler()
    train_x = scaler.fit_transform(train_x)
    dev_x   = scaler.transform(dev_x)
    test_x  = scaler.transform(test_x)

    # 1️⃣ SMOTE 적용 (SLI 데이터 증가)
    print("SMOTE 적용 중...")
    smote = SMOTE(k_neighbors=3, random_state=42)
    train_x, train_y = smote.fit_resample(train_x, train_y)

    print("SMOTE 후 클래스 분포:", np.bincount(train_y))

    # 2️⃣ 파라미터 그리드
    param_grid = {
        "C": [0.1, 1, 10, 50],
        "gamma": ["scale", 0.01, 0.001],
        "kernel": ["rbf"],
        "class_weight": ["balanced"]
    }

    svm = SVC()

    grid = GridSearchCV(
        svm,
        param_grid,
        cv=5,
        n_jobs=-1
    )
    grid.fit(train_x, train_y)

    best_svm = grid.best_estimator_

    print("\nBest Params:", grid.best_params_)

    # 3️⃣ Dev 성능 확인
    print("\n=== Dev 성능 ===")
    pred_dev = best_svm.predict(dev_x)
    print(classification_report(dev_y, pred_dev))
    print(confusion_matrix(dev_y, pred_dev))

    # 4️⃣ Test 성능 확인
    print("\n=== Test 성능 ===")
    pred_test = best_svm.predict(test_x)
    print(classification_report(test_y, pred_test))
    print(confusion_matrix(test_y, pred_test))

    return best_svm, scaler, grid.best_params_


# ================================
# 5. 실행 파트
# ================================
# 예시: 실제 사용 시 아래 변수만 데이터셋에 맞게 교체
glove_path = "glove.6B.300d.txt"

glove = load_glove(glove_path)
lstm = NumpyLSTM(input_dim=300, hidden_dim=128)

# 여러분의 분할된 텍스트/레이블 사용
# train_texts, train_y
# dev_texts, dev_y
# test_texts, test_y

train_vec = encode_dataset(train_texts, glove, lstm)
dev_vec   = encode_dataset(dev_texts, glove, lstm)
test_vec  = encode_dataset(test_texts, glove, lstm)

svm_model, scaler, best_params = train_svm_rbf(
    train_vec, train_y,
    dev_vec, dev_y,
    test_vec, test_y
)


100%|████████████████████████████████| 400000/400000 [00:17<00:00, 22589.34it/s]


GloVe loaded: 400000
스케일러 적용 중...

Best Params: {'C': 50, 'class_weight': 'balanced', 'gamma': 0.01, 'kernel': 'rbf'}

=== Dev 성능 ===
              precision    recall  f1-score   support

         SLI       0.25      0.14      0.18         7
          TD       0.81      0.89      0.85        28

    accuracy                           0.74        35
   macro avg       0.53      0.52      0.51        35
weighted avg       0.70      0.74      0.71        35

[[ 1  6]
 [ 3 25]]

=== Test 성능 ===
              precision    recall  f1-score   support

         SLI       0.50      0.67      0.57         6
          TD       0.93      0.87      0.90        30

    accuracy                           0.83        36
   macro avg       0.71      0.77      0.73        36
weighted avg       0.86      0.83      0.84        36

[[ 4  2]
 [ 4 26]]


In [None]:
"""
✔ 결과가 Dev/Test에서 다르게 나온 건 데이터 분할 문제일 가능성이 매우 높음
✔ Test 기준 SLI 성능은 꽤 괜찮아짐
✔ 다음으로는 Stratified split이 가장 중요
"""

Loading GloVe: 400000it [00:18, 22200.64it/s]


GloVe loaded: 400000


Loading GloVe: 400000it [00:18, 22156.00it/s]


TypeError: train_svm_rbf() takes 2 positional arguments but 6 were given