In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import pandas as pd

In [2]:

# Check if train_data is available; otherwise, train data must be simulated
try:
    # Load the train data uploaded earlier
    train_file_path = '/content/drive/MyDrive/modu/DLTON/custom_dataset/final_data_add_v1_241208.csv'
    train_data = pd.read_csv(train_file_path)
except FileNotFoundError:
    raise ValueError("Train data is not available. Please upload labeled training data.")

# Prepare training data (assuming 'text' and 'class' columns exist)
train_texts = train_data['conversation']
train_labels = train_data['class']

# Vectorize text using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(train_texts)
y = train_labels

# Split into train and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_val)
evaluation_report = classification_report(y_val, y_pred, output_dict=True)


In [3]:

# Check if train_data is available; otherwise, train data must be simulated
try:
    # Load the train data uploaded earlier
    test_file_path = '/content/drive/MyDrive/modu/DLTON/custom_dataset/test_cleansed_241208.csv'
    test_data = pd.read_csv(test_file_path)
except FileNotFoundError:
    raise ValueError("Train data is not available. Please upload labeled training data.")

# Apply classification to test data
test_texts = test_data['text']
test_tfidf = vectorizer.transform(test_texts)

# Predict probabilities for each class
test_probabilities = model.predict_proba(test_tfidf)

test_data['class'] = model.predict(test_tfidf)

# Map to numeric labels
category_mapping = {
    "협박 대화": "00",
    "갈취 대화": "01",
    "직장 내 괴롭힘 대화": "02",
    "기타 괴롭힘 대화": "03",
    "일반 대화": "04"
}
test_data['category_label'] = test_data['class'].map(category_mapping)

# Save the results
output_file_path = '/content/drive/MyDrive/modu/DLTON/custom_dataset/test_tfidf_classification.csv'
test_data.to_csv(output_file_path, index=False, encoding='utf-8-sig')

evaluation_report, output_file_path

({'갈취 대화': {'precision': 0.9667994687915007,
   'recall': 0.9578947368421052,
   'f1-score': 0.9623265036351619,
   'support': 760.0},
  '기타 괴롭힘 대화': {'precision': 0.9627192982456141,
   'recall': 0.9766407119021134,
   'f1-score': 0.969630038652678,
   'support': 899.0},
  '일반 대화': {'precision': 0.9895561357702349,
   'recall': 0.9768041237113402,
   'f1-score': 0.9831387808041504,
   'support': 776.0},
  '직장 내 괴롭힘 대화': {'precision': 0.9678217821782178,
   'recall': 0.9923857868020305,
   'f1-score': 0.9799498746867168,
   'support': 788.0},
  '협박 대화': {'precision': 0.9764216366158114,
   'recall': 0.9552238805970149,
   'f1-score': 0.9657064471879286,
   'support': 737.0},
  'accuracy': 0.9722222222222222,
  'macro avg': {'precision': 0.972663664320276,
   'recall': 0.9717898479709209,
   'f1-score': 0.9721503289933272,
   'support': 3960.0},
  'weighted avg': {'precision': 0.9723267972990752,
   'recall': 0.9722222222222222,
   'f1-score': 0.9721988369308427,
   'support': 3960.0}},

In [4]:
# Create Submission.csv
submission_data = pd.DataFrame({
    'idx': test_data['idx'],  # Assuming `idx` column exists in test data
    'target': test_probabilities.argmax(axis=1)  # Predicted class index
})

# Save Submission.csv
submission_file_path = '/content/drive/MyDrive/modu/DLTON/custom_dataset/test_tfidf_Submission.csv'
submission_data.to_csv(submission_file_path, index=False, encoding='utf-8-sig')

evaluation_report, output_file_path, submission_file_path

({'갈취 대화': {'precision': 0.9667994687915007,
   'recall': 0.9578947368421052,
   'f1-score': 0.9623265036351619,
   'support': 760.0},
  '기타 괴롭힘 대화': {'precision': 0.9627192982456141,
   'recall': 0.9766407119021134,
   'f1-score': 0.969630038652678,
   'support': 899.0},
  '일반 대화': {'precision': 0.9895561357702349,
   'recall': 0.9768041237113402,
   'f1-score': 0.9831387808041504,
   'support': 776.0},
  '직장 내 괴롭힘 대화': {'precision': 0.9678217821782178,
   'recall': 0.9923857868020305,
   'f1-score': 0.9799498746867168,
   'support': 788.0},
  '협박 대화': {'precision': 0.9764216366158114,
   'recall': 0.9552238805970149,
   'f1-score': 0.9657064471879286,
   'support': 737.0},
  'accuracy': 0.9722222222222222,
  'macro avg': {'precision': 0.972663664320276,
   'recall': 0.9717898479709209,
   'f1-score': 0.9721503289933272,
   'support': 3960.0},
  'weighted avg': {'precision': 0.9723267972990752,
   'recall': 0.9722222222222222,
   'f1-score': 0.9721988369308427,
   'support': 3960.0}},