In [9]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # 반드시 임포트 되어야 tokenizer가 작동함
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# CSV 로드
df = pd.read_csv("./seller.csv")

# 입력 텍스트 구성
df["text"] = df["회사명"].astype(str) + " " + df["주소"].astype(str)
texts = df["text"].tolist()
labels = df["라벨"].tolist()

# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# BERT 사전처리기 및 인코더 로드
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

# BERT 임베딩 함수 정의
def get_bert_embeddings(text_list):
    inputs = bert_preprocess(tf.constant(text_list))
    outputs = bert_encoder(inputs)
    return outputs["pooled_output"].numpy()

# 임베딩 생성
X_train_embed = get_bert_embeddings(X_train)
X_val_embed = get_bert_embeddings(X_val)

# ✅ 라벨을 np.array로 변환
y_train = np.array(y_train)
y_val = np.array(y_val)

# 간단한 MLP 분류기 모델 구성
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

# 모델 컴파일 및 학습
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.fit(X_train_embed, y_train, validation_data=(X_val_embed, y_val), epochs=5, batch_size=8)

Epoch 1/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4816 - loss: 0.7144 - val_accuracy: 0.8077 - val_loss: 0.6281
Epoch 2/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 669us/step - accuracy: 0.7949 - loss: 0.6062 - val_accuracy: 0.8269 - val_loss: 0.5546
Epoch 3/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 679us/step - accuracy: 0.8555 - loss: 0.5438 - val_accuracy: 0.9519 - val_loss: 0.4902
Epoch 4/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 689us/step - accuracy: 0.9046 - loss: 0.4860 - val_accuracy: 0.9712 - val_loss: 0.4378
Epoch 5/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 716us/step - accuracy: 0.9361 - loss: 0.4372 - val_accuracy: 0.9712 - val_loss: 0.3930


<keras.src.callbacks.history.History at 0x399b70c90>

In [10]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text  # 반드시 있어야 tokenizer 작동
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 데이터 로드 및 전처리
df = pd.read_csv("seller.csv")
df["text"] = df["회사명"].astype(str) + " " + df["주소"].astype(str)
texts = df["text"].tolist()
labels = df["라벨"].tolist()

# 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# BERT 로드
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

# BERT 임베딩 함수
def get_bert_embeddings(text_list):
    inputs = bert_preprocess(tf.constant(text_list))
    outputs = bert_encoder(inputs)
    return outputs["pooled_output"].numpy()

# 임베딩 생성
X_train_embed = get_bert_embeddings(X_train)
X_val_embed = get_bert_embeddings(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

# 모델 구성
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation="sigmoid")
])
model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="binary_crossentropy",
              metrics=["accuracy"])

# 학습
model.fit(X_train_embed, y_train, validation_data=(X_val_embed, y_val), epochs=5, batch_size=8)

# 모델 저장
model.save("foreign_company_classifier.h5")

# 예측 함수
def predict_label(texts):
    inputs = bert_preprocess(tf.constant(texts))
    embeddings = bert_encoder(inputs)["pooled_output"]
    return model.predict(embeddings).flatten()

# 예측 및 평가
y_pred = predict_label(X_val)
y_pred_label = (y_pred > 0.5).astype(int)
report = classification_report(y_val, y_pred_label)
print(report)

# 파일로 저장
with open("classification_report.txt", "w") as f:
    f.write(report)

Epoch 1/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4669 - loss: 0.8027 - val_accuracy: 0.8654 - val_loss: 0.6577
Epoch 2/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 707us/step - accuracy: 0.6669 - loss: 0.6503 - val_accuracy: 0.9519 - val_loss: 0.5805
Epoch 3/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 692us/step - accuracy: 0.8476 - loss: 0.5748 - val_accuracy: 0.9663 - val_loss: 0.5156
Epoch 4/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 693us/step - accuracy: 0.9218 - loss: 0.5150 - val_accuracy: 0.9760 - val_loss: 0.4604
Epoch 5/5
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 676us/step - accuracy: 0.9463 - loss: 0.4640 - val_accuracy: 0.9663 - val_loss: 0.4139




[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       118
           1       1.00      0.92      0.96        90

    accuracy                           0.97       208
   macro avg       0.97      0.96      0.97       208
weighted avg       0.97      0.97      0.97       208



In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 1. 데이터 로드 및 조합 확장
df = pd.read_csv("seller.csv")

def generate_augmented_text(row):
    company = str(row["회사명"])
    ceo = str(row["법정대표자"])
    address = str(row["주소"])
    return [
        f"{company} {address}",
        f"{company} {ceo}",
        f"{ceo} {address}",
        f"{company}",
        f"{ceo}",
        f"{address}"
    ]

augmented_texts = []
augmented_labels = []

for _, row in df.iterrows():
    texts = generate_augmented_text(row)
    augmented_texts.extend(texts)
    augmented_labels.extend([row["라벨"]] * len(texts))

# 2. 학습/검증 분리
X_train, X_val, y_train, y_val = train_test_split(
    augmented_texts, augmented_labels, test_size=0.2, random_state=42
)

# 3. BERT 사전처리 및 인코더 로드
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

def get_bert_embeddings(text_list):
    inputs = bert_preprocess(tf.constant(text_list))
    outputs = bert_encoder(inputs)
    return outputs["pooled_output"].numpy()

X_train_embed = get_bert_embeddings(X_train)
X_val_embed = get_bert_embeddings(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

# 4. 분류기 학습
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="binary_crossentropy",
              metrics=["accuracy"])

model.fit(X_train_embed, y_train, validation_data=(X_val_embed, y_val), epochs=5, batch_size=16)

# 5. 모델 저장
model.save("foreign_company_classifier.h5")

# 6. 예측 및 평가
def predict_label(texts):
    inputs = bert_preprocess(tf.constant(texts))
    embeddings = bert_encoder(inputs)["pooled_output"]
    return model.predict(embeddings).flatten()


y_pred = predict_label(X_val)
y_pred_label = (y_pred > 0.5).astype(int)
report = classification_report(y_val, y_pred_label)
print(report)

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 체크포인트 파일명
files = [
    "X_train_embed.npy", "X_train_label.npy",
    "X_val_embed.npy", "X_val_label.npy"
]
checkpoint_exists = all(os.path.exists(f) for f in files)

if not checkpoint_exists:
    print("✅ 체크포인트 없음: 생성 중...")

    df = pd.read_csv("seller.csv")

    def generate_augmented_text(row):
        company = str(row["회사명"])
        ceo = str(row["법정대표자"])
        address = str(row["주소"])
        return [
            f"{company} {address}",
            f"{company} {ceo}",
            f"{ceo} {address}",
            f"{company}",
            f"{ceo}",
            f"{address}"
        ]

    augmented_texts = []
    augmented_labels = []

    for _, row in df.iterrows():
        texts = generate_augmented_text(row)
        augmented_texts.extend(texts)
        augmented_labels.extend([row["라벨"]] * len(texts))

    X_train, X_val, y_train, y_val = train_test_split(
        augmented_texts, augmented_labels, test_size=0.2, random_state=42
    )

    bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
    bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

    def get_bert_embeddings(text_list):
        inputs = bert_preprocess(tf.constant(text_list))
        outputs = bert_encoder(inputs)
        return outputs["pooled_output"].numpy()

    def save_embeddings(texts, labels, prefix):
        X = get_bert_embeddings(texts)
        y = np.array(labels)
        np.save(f"{prefix}_embed.npy", X)
        np.save(f"{prefix}_label.npy", y)

    save_embeddings(X_train, y_train, "X_train")
    save_embeddings(X_val, y_val, "X_val")
    print("✅ 체크포인트 저장 완료.")
else:
    print("✅ 체크포인트 로드 중...")

# 불러오기
X_train_embed = np.load("X_train_embed.npy")
y_train = np.load("X_train_label.npy")
X_val_embed = np.load("X_val_embed.npy")
y_val = np.load("X_val_label.npy")

# 모델 구성 및 학습
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="binary_crossentropy",
              metrics=["accuracy"])

print("🚀 모델 학습 시작...")
model.fit(X_train_embed, y_train,
          validation_data=(X_val_embed, y_val),
          epochs=5,
          batch_size=16)

# 저장
model.save("foreign_company_classifier.h5")
print("✅ 모델 저장 완료: foreign_company_classifier.h5")

# 평가
y_pred = model.predict(X_val_embed).flatten()
y_pred_label = (y_pred > 0.5).astype(int)

print("📊 성능 리포트:")
print(classification_report(y_val, y_pred_label))

✅ 체크포인트 없음: 생성 중...


### Check point 저장 추가

In [11]:
import os
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

# 체크포인트 파일명
files = [
    "X_train_embed.npy", "X_train_label.npy",
    "X_val_embed.npy", "X_val_label.npy"
]
checkpoint_exists = all(os.path.exists(f) for f in files)

if not checkpoint_exists:
    print("✅ 체크포인트 없음: 생성 중...")

    df = pd.read_csv("Balanced_Seller_Dataset.csv")

    def generate_augmented_text(row):
        company = str(row["회사명"])
        ceo = str(row["법정대표자"])
        address = str(row["주소"])
        return [
            f"{company} {address}",
            f"{company} {ceo}",
            f"{ceo} {address}",
            f"{company}",
            f"{ceo}",
            f"{address}"
        ]

    augmented_texts = []
    augmented_labels = []

    for _, row in df.iterrows():
        texts = generate_augmented_text(row)
        augmented_texts.extend(texts)
        augmented_labels.extend([row["라벨"]] * len(texts))

    X_train, X_val, y_train, y_val = train_test_split(
        augmented_texts, augmented_labels, test_size=0.2, random_state=42
    )

    bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
    bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

    def get_bert_embeddings(text_list, batch_size=128):
        all_embeddings = []
        for i in range(0, len(text_list), batch_size):
            batch = text_list[i:i+batch_size]
            inputs = bert_preprocess(tf.constant(batch))
            outputs = bert_encoder(inputs)
            all_embeddings.append(outputs["pooled_output"].numpy())
            gc.collect()  # 메모리 회수
        return np.concatenate(all_embeddings, axis=0)

    def save_embeddings(texts, labels, prefix):
        X = get_bert_embeddings(texts)
        y = np.array(labels)
        np.save(f"{prefix}_embed.npy", X)
        np.save(f"{prefix}_label.npy", y)

    save_embeddings(X_train, y_train, "X_train")
    save_embeddings(X_val, y_val, "X_val")
    print("✅ 체크포인트 저장 완료.")
else:
    print("✅ 체크포인트 로드 중...")

# 불러오기
X_train_embed = np.load("X_train_embed.npy")
y_train = np.load("X_train_label.npy")
X_val_embed = np.load("X_val_embed.npy")
y_val = np.load("X_val_label.npy")

print(y_train[:5])
print(y_val[:5])

# 라벨 컬럼명 제거
if isinstance(y_train[0], str) and y_train[0] == '라벨':
    y_train = y_train[1:]
if isinstance(y_val[0], str) and y_val[0] == '라벨':
    y_val = y_val[1:]

X_train_embed = X_train_embed.astype('float32')
X_val_embed = X_val_embed.astype('float32')
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')

# 모델 구성
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# 체크포인트 콜백 추가
os.makedirs("checkpoint", exist_ok=True)
checkpoint_cb = ModelCheckpoint(
    filepath="checkpoint/bert_epoch{epoch:02d}_valacc{val_accuracy:.2f}.h5",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1
)

# 학습 로그 저장 콜백
log_cb = CSVLogger("train_log.csv")

# 모델 학습
print("🚀 모델 학습 시작...")
model.fit(
    X_train_embed, y_train,
    validation_data=(X_val_embed, y_val),
    epochs=5,
    batch_size=16,
    callbacks=[checkpoint_cb, log_cb]
)

# 최종 모델 저장
model.save("foreign_company_classifier.h5")
print("✅ 최종 모델 저장 완료: foreign_company_classifier.h5")

# 평가
y_pred = model.predict(X_val_embed).flatten()
y_pred_label = (y_pred > 0.5).astype(int)

print("📊 성능 리포트:")
print(classification_report(y_val, y_pred_label))

✅ 체크포인트 없음: 생성 중...
✅ 체크포인트 저장 완료.
[0 0 1 1 1]
[1 1 0 1 0]
🚀 모델 학습 시작...
Epoch 1/5
[1m339/467[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 446us/step - accuracy: 0.5328 - loss: 0.7005
Epoch 1: val_accuracy improved from -inf to 0.87366, saving model to checkpoint/bert_epoch01_valacc0.87.h5




[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 738us/step - accuracy: 0.5696 - loss: 0.6850 - val_accuracy: 0.8737 - val_loss: 0.5345
Epoch 2/5
[1m381/467[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 397us/step - accuracy: 0.8905 - loss: 0.5063
Epoch 2: val_accuracy improved from 0.87366 to 0.92666, saving model to checkpoint/bert_epoch02_valacc0.93.h5




[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 548us/step - accuracy: 0.8934 - loss: 0.5005 - val_accuracy: 0.9267 - val_loss: 0.4081
Epoch 3/5
[1m383/467[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 394us/step - accuracy: 0.9374 - loss: 0.3915
Epoch 3: val_accuracy improved from 0.92666 to 0.95343, saving model to checkpoint/bert_epoch03_valacc0.95.h5




[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 542us/step - accuracy: 0.9375 - loss: 0.3879 - val_accuracy: 0.9534 - val_loss: 0.3250
Epoch 4/5
[1m456/467[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 559us/step - accuracy: 0.9495 - loss: 0.3154
Epoch 4: val_accuracy improved from 0.95343 to 0.96681, saving model to checkpoint/bert_epoch04_valacc0.97.h5




[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 704us/step - accuracy: 0.9497 - loss: 0.3150 - val_accuracy: 0.9668 - val_loss: 0.2653
Epoch 5/5
[1m373/467[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 404us/step - accuracy: 0.9630 - loss: 0.2606
Epoch 5: val_accuracy improved from 0.96681 to 0.97270, saving model to checkpoint/bert_epoch05_valacc0.97.h5




[1m467/467[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 564us/step - accuracy: 0.9638 - loss: 0.2586 - val_accuracy: 0.9727 - val_loss: 0.2225




✅ 최종 모델 저장 완료: foreign_company_classifier.h5
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 451us/step
📊 성능 리포트:
              precision    recall  f1-score   support

         0.0       0.97      0.98      0.97       938
         1.0       0.98      0.97      0.97       930

    accuracy                           0.97      1868
   macro avg       0.97      0.97      0.97      1868
weighted avg       0.97      0.97      0.97      1868



In [14]:
import os
import gc
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.utils import class_weight
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger

# 데이터 로드
df = pd.read_csv("Balanced_Seller_Dataset.csv")  # ← 병합된 최종 CSV 사용

# 증강 함수
def generate_augmented_text(row):
    company = str(row["회사명"])
    ceo = str(row["법정대표자"])
    address = str(row["주소"])
    return [
        f"This company, {company}, is located at {address}.",
        f"{ceo} is the CEO of {company}.",
        f"{company} is operated by {ceo} and based at {address}.",
        f"Seller: {company}, CEO: {ceo}.",
        f"{company} sells from {address}.",
        f"{address} is the registered address of {company}."
    ]

# 증강 적용
texts, labels = [], []
for _, row in df.iterrows():
    aug = generate_augmented_text(row)
    texts.extend(aug)
    labels.extend([row["라벨"]] * len(aug))

# Train/Val Split
X_train, X_val, y_train, y_val = train_test_split(texts, labels, test_size=0.2, random_state=42)

# BERT 모델
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

def get_bert_embeddings(text_list, batch_size=128):
    all_embeddings = []
    for i in range(0, len(text_list), batch_size):
        batch = text_list[i:i+batch_size]
        inputs = bert_preprocess(tf.constant(batch))
        outputs = bert_encoder(inputs)
        all_embeddings.append(outputs["pooled_output"].numpy())
        gc.collect()
    return np.concatenate(all_embeddings, axis=0)

# 임베딩 생성
X_train_embed = get_bert_embeddings(X_train)
X_val_embed = get_bert_embeddings(X_val)
y_train = np.array(y_train).astype("float32")
y_val = np.array(y_val).astype("float32")

# 클래스 가중치
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weights = dict(enumerate(weights))

# 심층 모델 정의
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.3),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="binary_crossentropy",
    metrics=["accuracy"]
)

# 콜백 설정
os.makedirs("checkpoint", exist_ok=True)
checkpoint_cb = ModelCheckpoint(
    filepath="checkpoint/bert_epoch{epoch:02d}_valacc{val_accuracy:.2f}.h5",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max",
    verbose=1
)
log_cb = CSVLogger("train_log.csv")

# 학습 시작
model.fit(
    X_train_embed, y_train,
    validation_data=(X_val_embed, y_val),
    epochs=5,
    batch_size=16,
    callbacks=[checkpoint_cb, log_cb],
    class_weight=class_weights
)

# 저장
model.save("foreign_company_classifier_v2.h5")

# 평가
y_pred = model.predict(X_val_embed).flatten()
y_pred_label = (y_pred > 0.5).astype(int)
print(classification_report(y_val, y_pred_label))

Epoch 1/5
[1m533/546[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.7683 - loss: 0.5013
Epoch 1: val_accuracy improved from -inf to 0.99450, saving model to checkpoint/bert_epoch01_valacc0.99.h5




[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.7711 - loss: 0.4974 - val_accuracy: 0.9945 - val_loss: 0.1022
Epoch 2/5
[1m514/546[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step - accuracy: 0.9860 - loss: 0.0913
Epoch 2: val_accuracy improved from 0.99450 to 0.99541, saving model to checkpoint/bert_epoch02_valacc1.00.h5




[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9863 - loss: 0.0899 - val_accuracy: 0.9954 - val_loss: 0.0385
Epoch 3/5
[1m507/546[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step - accuracy: 0.9959 - loss: 0.0296
Epoch 3: val_accuracy did not improve from 0.99541
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9959 - loss: 0.0292 - val_accuracy: 0.9950 - val_loss: 0.0167
Epoch 4/5
[1m516/546[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step - accuracy: 0.9985 - loss: 0.0158
Epoch 4: val_accuracy did not improve from 0.99541
[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.9985 - loss: 0.0158 - val_accuracy: 0.9945 - val_loss: 0.0191
Epoch 5/5
[1m539/546[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 1ms/step - accuracy: 0.9970 - loss: 0.0114
Epoch 5: val_accuracy improved from 0.99541 to 0.99679, saving model to checkpoint/bert_



[1m546/546[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.9970 - loss: 0.0114 - val_accuracy: 0.9968 - val_loss: 0.0104




[1m69/69[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 602us/step
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       921
         1.0       1.00      1.00      1.00      1260

    accuracy                           1.00      2181
   macro avg       1.00      1.00      1.00      2181
weighted avg       1.00      1.00      1.00      2181



In [12]:
import tensorflow as tf

print("🧠 Available devices:")
print(tf.config.list_physical_devices('GPU'))

🧠 Available devices:
[]


In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# ✅ 1. CSV 병합 (깨진 줄 무시)
df_main = pd.read_csv("seller_eng.csv")
df_new = pd.read_csv("seller_eng.csv", on_bad_lines="skip")  # <- 핵심
df = pd.concat([df_main, df_new], ignore_index=True).drop_duplicates()

# ✅ 2. 텍스트 증강
def generate_augmented_text(row):
    company = str(row["회사명"])
    ceo = str(row["법정대표자"])
    address = str(row["주소"])
    return [
        f"{company} {address}",
        f"{company} {ceo}",
        f"{ceo} {address}",
        f"{company}",
        f"{ceo}",
        f"{address}"
    ]

augmented_texts, augmented_labels = [], []
for _, row in df.iterrows():
    texts = generate_augmented_text(row)
    augmented_texts.extend(texts)
    augmented_labels.extend([row["라벨"]] * len(texts))

# ✅ 3. 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(
    augmented_texts, augmented_labels, test_size=0.2, random_state=42
)

# ✅ 4. BERT 로드 및 임베딩 생성
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

def get_bert_embeddings(text_list):
    inputs = bert_preprocess(tf.constant(text_list))
    outputs = bert_encoder(inputs)
    return outputs["pooled_output"].numpy()

print("📦 BERT 임베딩 생성 중...")
X_train_embed = get_bert_embeddings(X_train)
X_val_embed = get_bert_embeddings(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

# ✅ 5. 모델 구성 및 학습
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape=(768,)),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(1, activation="sigmoid")
])

model.compile(optimizer=tf.keras.optimizers.Adam(1e-4),
              loss="binary_crossentropy",
              metrics=["accuracy"])

print("🚀 모델 학습 시작...")
model.fit(X_train_embed, y_train,
          validation_data=(X_val_embed, y_val),
          epochs=5,
          batch_size=16)

# ✅ 6. 모델 저장
model.save("foreign_company_classifier.h5")
print("✅ 모델 저장 완료")

# ✅ 7. 평가
y_pred = model.predict(X_val_embed).flatten()
y_pred_label = (y_pred > 0.5).astype(int)
print("📊 평가 결과:")
print(classification_report(y_val, y_pred_label))

📦 BERT 임베딩 생성 중...


: 

In [None]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# 🔸 1. 기존 모델 로드
MODEL_PATH = "foreign_company_classifier.h5"
if not os.path.exists(MODEL_PATH):
    raise FileNotFoundError(f"{MODEL_PATH} not found. Run initial training first.")

print("✅ 기존 모델 로드 중...")
model = tf.keras.models.load_model(MODEL_PATH)

# 🔸 2. 추가 데이터 로드
print("📦 추가 CSV 로딩 중...")
df_additional = pd.read_csv("seller_eng.csv", on_bad_lines="skip").drop_duplicates()

def generate_augmented_text(row):
    company = str(row["회사명"])
    ceo = str(row["법정대표자"])
    address = str(row["주소"])
    return [
        f"{company} {address}",
        f"{company} {ceo}",
        f"{ceo} {address}",
        f"{company}",
        f"{ceo}",
        f"{address}"
    ]

# 🔸 3. 텍스트 증강
aug_texts, aug_labels = [], []
for _, row in df_additional.iterrows():
    texts = generate_augmented_text(row)
    aug_texts.extend(texts)
    aug_labels.extend([row["라벨"]] * len(texts))

# 🔸 4. 학습/검증 분할
X_train, X_val, y_train, y_val = train_test_split(
    aug_texts, aug_labels, test_size=0.2, random_state=42
)

# 🔸 5. 임베딩 생성
print("🔄 BERT 전처리 및 임베딩 중...")
bert_preprocess = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3")
bert_encoder = hub.load("https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/4")

def get_bert_embeddings(text_list):
    inputs = bert_preprocess(tf.constant(text_list))
    outputs = bert_encoder(inputs)
    return outputs["pooled_output"].numpy()

X_train_embed = get_bert_embeddings(X_train)
X_val_embed = get_bert_embeddings(X_val)
y_train = np.array(y_train)
y_val = np.array(y_val)

# 🔸 6. 모델 계속 학습
print("🚀 모델 추가 학습 중...")
model.fit(
    X_train_embed, y_train,
    validation_data=(X_val_embed, y_val),
    epochs=3,
    batch_size=16
)

# 🔸 7. 저장
model.save("foreign_company_classifier.h5")
print("✅ 모델 재저장 완료")

# 🔸 8. 성능 평가
y_pred = model.predict(X_val_embed).flatten()
y_pred_label = (y_pred > 0.5).astype(int)

print("📊 추가 학습 평가 결과:")
print(classification_report(y_val, y_pred_label))

### 추가학습