<a href="https://colab.research.google.com/github/hyunbini/Project/blob/main/SW_Ensemble_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Install Libraries and Prepare the Environment

In [None]:
pip install scikit-learn xgboost catboost pandas

In [None]:
# Prepare the Colab environment: install Java (openjdk-11-jdk), curl and git via apt,
# then the pinned JPype1 and konlpy Python packages.
!apt-get -qq update
!apt-get -y -qq install openjdk-11-jdk curl git
import os
# Export JAVA_HOME and add its bin/ directory to PATH for this kernel process.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# NOTE: this appends to PATH each time the cell runs, so re-running it adds the entry again.
os.environ["PATH"] += ":" + os.path.join(os.environ["JAVA_HOME"], "bin")

!pip -q install JPype1==1.5.0 konlpy==0.6.0

In [None]:
# Delete any existing checkout so the clone below starts from a clean folder
!rm -rf Mecab-ko-for-Google-Colab

# Clone the repository again
!git clone https://github.com/SOMJANG/Mecab-ko-for-Google-Colab.git

In [None]:
cd Mecab-ko-for-Google-Colab/

In [None]:
# Run the installation script shipped in the cloned repository.
# The relative script path relies on the working directory set by the preceding `cd` cell.
!bash install_mecab-ko_on_colab_light_220429.sh

In [None]:
# Install py-hanspell from GitHub; its spell checker is used by the typo-count preprocessing step
!pip install git+https://github.com/ssut/py-hanspell.git

Import Libraries

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from konlpy.tag import Mecab
from tqdm import tqdm
from collections import Counter
from hanspell import spell_checker
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files used later are read from and
# written to /content/drive/MyDrive/Data.
drive.mount('/content/drive')

In [None]:
# Shared Mecab analyzer instance; the functions below call mecab.pos() / mecab.morphs() on it.
mecab = Mecab()

Preprocessing Functions

In [None]:
# Strip HTML-style tags, URLs and e-mail-like tokens, then normalize whitespace
def clean_text(text):
    """Return ``text`` with markup-like noise removed.

    Each of the following is replaced by a single space, in this order:
    anything enclosed in angle brackets, substrings starting with ``http``
    or ``www`` up to the next whitespace, and whitespace-delimited tokens
    containing ``@``. Afterwards every run of whitespace is collapsed to one
    space and leading/trailing whitespace is trimmed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    noise_patterns = (
        r'<[^>]+>',          # angle-bracket tags
        r'http\S+|www\S+',   # URLs
        r'\S+@\S+',          # e-mail-like tokens
    )
    for pattern in noise_patterns:
        text = re.sub(pattern, ' ', text)
    collapsed = re.sub(r'\s+', ' ', text)
    return collapsed.strip()

# Count the typos reported by the spell checker
def count_typos(text):
    """Return the error count that ``spell_checker.check`` reports for ``text``.

    This is a best-effort feature: if the spell checker raises any ordinary
    exception, the text is treated as having no typos and 0 is returned.

    Args:
        text: The string to spell-check.

    Returns:
        The ``errors`` attribute of the check result, or 0 when the check fails.
    """
    try:
        return spell_checker.check(text).errors
    except Exception:
        # Deliberately narrower than a bare ``except``: KeyboardInterrupt and
        # SystemExit must propagate so a long row-by-row run can be stopped.
        return 0

# Detect emoji or any other non-word, non-whitespace character
def has_emoji_or_special(text):
    """Tell whether ``text`` contains an emoji or a special character.

    A character counts as "special" when it is neither a word character,
    whitespace, nor Hangul, so ordinary punctuation such as ``!`` also
    makes this return True.

    Args:
        text: The string to inspect.

    Returns:
        True if either pattern matches anywhere in ``text``, else False.
    """
    emoji_re = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # regional indicator symbols (flags)
        "]+",
        flags=re.UNICODE,
    )
    if emoji_re.search(text) is not None:
        return True
    symbol_re = re.compile(r'[^\w\sㄱ-ㅎ가-힣]')
    return symbol_re.search(text) is not None

# Share of tokens tagged as adverbs
def adverb_ratio(text):
    """Return the fraction of Mecab tokens in ``text`` whose POS tag is ``'MAG'``.

    ``'MAG'`` is presumably the general-adverb tag of the Mecab tag set —
    confirm against the tagger documentation.

    Args:
        text: The string to analyze with the shared ``mecab`` instance.

    Returns:
        Count of ``'MAG'`` tokens divided by the total token count, or 0 when
        the analyzer yields no tokens.
    """
    tagged = mecab.pos(text)
    if not tagged:
        return 0
    adverb_count = sum(1 for _, tag in tagged if tag == 'MAG')
    return adverb_count / len(tagged)
# Zipf's-law score: correlation between log-rank and log-frequency of morphemes
def zipf_score(text):
    """Return the Pearson correlation of log(rank) vs. log(frequency) for ``text``.

    Morphemes from the shared ``mecab`` instance that are longer than one
    character are counted, their frequencies are sorted in descending order
    and paired with ranks 1..k.

    Args:
        text: The string to analyze.

    Returns:
        The correlation coefficient, or 0 when it is undefined: fewer than two
        distinct morphemes, or all morphemes sharing the same frequency.
    """
    words = [w for w in mecab.morphs(text) if len(w) > 1]
    freqs_sorted = np.array(sorted(Counter(words).values(), reverse=True))
    if len(freqs_sorted) < 2:
        return 0
    # The array is sorted, so equal first and last entries mean every frequency
    # is identical. The log-frequencies then have zero variance and np.corrcoef
    # would return NaN (with a RuntimeWarning) instead of a usable score.
    if freqs_sorted[0] == freqs_sorted[-1]:
        return 0
    ranks = np.arange(1, len(freqs_sorted) + 1)
    return np.corrcoef(np.log(ranks), np.log(freqs_sorted))[0, 1]

In [None]:
def preprocess_dataframe(df, text_col='full_text'):
    """Add a cleaned-text column and four derived feature columns to ``df``.

    The frame is modified in place and also returned. Every step runs through
    tqdm's ``progress_apply`` so a progress bar is shown per column.

    Args:
        df: DataFrame holding the raw text.
        text_col: Name of the column with the raw text.

    Returns:
        The same ``df`` object with ``text_clean``, ``typo_count``,
        ``has_emoji_special``, ``adv_ratio`` and ``zipf_corr`` columns added.
    """
    tqdm.pandas()  # registers progress_apply on pandas objects
    df['text_clean'] = df[text_col].progress_apply(clean_text)

    # (output column, feature function) pairs, all computed from the cleaned text
    derived_features = (
        ('typo_count', count_typos),
        ('has_emoji_special', has_emoji_or_special),
        ('adv_ratio', adverb_ratio),
        ('zipf_corr', zipf_score),
    )
    for column_name, feature_fn in derived_features:
        df[column_name] = df['text_clean'].progress_apply(feature_fn)
    return df

In [None]:
def mecab_tokenizer(text):
    """Split ``text`` into morphemes with the shared ``mecab`` analyzer.

    Defined as a named function so it can be passed as the ``tokenizer``
    callable of a vectorizer.
    """
    morphemes = mecab.morphs(text)
    return morphemes

In [None]:
# TF-IDF vectorization built on the Mecab morpheme tokenizer
def tokenize(df, text_col, refit=False):
  """Convert a text column into a TF-IDF feature matrix.

  The first call fits a TfidfVectorizer (Mecab morphemes, unigrams and
  bigrams, at most 10000 features) on ``df[text_col]`` and remembers it on the
  function object. Later calls reuse that fitted vectorizer and only
  transform, so every matrix shares the vocabulary and column order of the
  data seen first. Fitting a fresh vectorizer per call would give
  validation/test matrices whose columns do not line up with the features the
  models were trained on.

  Args:
    df: Frame (or mapping) holding the text.
    text_col: Name of the column with the documents to vectorize.
    refit: When True, discard the remembered vectorizer and fit a new one on
      this data. Use it when starting over with a different training set.

  Returns:
    The TF-IDF matrix produced by the vectorizer for ``df[text_col]``.
  """
  fitted = getattr(tokenize, "fitted_vectorizer", None)
  if refit or fitted is None:
    vectorizer = TfidfVectorizer(
        tokenizer=mecab_tokenizer,
        ngram_range=(1, 2),
        max_features=10000
    )
    vector_data = vectorizer.fit_transform(df[text_col])
    # Remember the fitted vocabulary/IDF weights for subsequent calls.
    tokenize.fitted_vectorizer = vectorizer
    return vector_data
  return fitted.transform(df[text_col])

Data I/O Function

In [None]:
def load_data(train_path='/content/drive/MyDrive/Data/train.csv', test_size=0.2, random_state=42):
  """Load the training CSV and return a vectorized train/validation split.

  Args:
    train_path: CSV file with a ``full_text`` column and a ``generated`` label
      column. Defaults to the original Google Drive location.
    test_size: Fraction of rows held out for validation.
    random_state: Seed passed to ``train_test_split`` for a reproducible split.

  Returns:
    ``(X_train, y_train, X_val, y_val, label_encoder)`` where the ``X`` parts
    are slices of the matrix returned by ``tokenize`` and ``label_encoder`` is
    the fitted ``LabelEncoder`` for the ``generated`` column.
  """
  # Load the data
  train_df = pd.read_csv(train_path, encoding='utf-8-sig')
  # NOTE(review): the columns added here (text_clean, typo_count, ...) are not
  # used below — the matrix is built from the raw 'full_text' column only.
  train_df = preprocess_dataframe(train_df, text_col='full_text')

  # Encode the labels as integers
  label_encoder = LabelEncoder()
  y = label_encoder.fit_transform(train_df['generated'])
  X = tokenize(train_df,'full_text')

  # Split into training and validation parts
  X_train, X_val, y_train, y_val = train_test_split(
      X, y, test_size=test_size, random_state=random_state
  )
  return X_train, y_train, X_val, y_val, label_encoder

Modeling Functions

In [None]:
def train_and_evaluate_model(X_train, y_train, X_val, y_val):
  """Fit three classifiers and report soft-voting metrics on the validation split.

  XGBoost, CatBoost and a random forest are each fitted on the training data.
  Their validation class probabilities are averaged with equal weight; the
  arg-max of the average is the ensemble prediction. A classification report,
  the accuracy and the ROC-AUC (computed from column 1 of the averaged
  probabilities) are printed.

  Args:
    X_train, y_train: Training features and encoded labels.
    X_val, y_val: Validation features and encoded labels.

  Returns:
    ``(xgb_model, cat_model, rf_model)``, the three fitted estimators.
  """
  # Build the individual models, then fit them in a fixed order
  xgb_model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
  cat_model = CatBoostClassifier(verbose=0)
  rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
  fitted_models = (xgb_model, cat_model, rf_model)
  for estimator in fitted_models:
    estimator.fit(X_train, y_train)

  # Soft voting on the validation set: equal-weight mean of the probabilities
  xgb_probs, cat_probs, rf_probs = [
      estimator.predict_proba(X_val) for estimator in fitted_models
  ]
  avg_probs = (xgb_probs + cat_probs + rf_probs) / 3
  ensemble_preds = np.argmax(avg_probs, axis=1)

  # Print the evaluation scores
  print(classification_report(y_val, ensemble_preds))
  accuracy = accuracy_score(y_val, ensemble_preds)
  print("Accuracy:", accuracy)
  positive_scores = avg_probs[:, 1]
  print("ROC-AUC:", roc_auc_score(y_val, positive_scores))
  return fitted_models

In [None]:
# Predict labels for the test set and store them in the submission file
def test(xgb_model,cat_model,rf_model,label_encoder):
  """Run the soft-voting ensemble on the test CSV and fill in the submission.

  Reads ``test.csv``, vectorizes its ``paragraph_text`` column with
  ``tokenize``, averages the three models' class probabilities with equal
  weight, maps the arg-max class indices back to the original labels, and
  writes them into the ``generated`` column of ``submission.csv``. That file
  is read and then overwritten at the same path.

  NOTE(review): the models can only score a matrix in the feature space they
  were trained on — verify that ``tokenize`` yields the training vocabulary
  for this frame.

  Args:
    xgb_model, cat_model, rf_model: Fitted classifiers exposing ``predict_proba``.
    label_encoder: Fitted encoder used to recover the original label values.
  """
  test_df = pd.read_csv('/content/drive/MyDrive/Data/test.csv', encoding='utf-8-sig')
  X_test_final = tokenize(test_df,'paragraph_text')

  # Equal-weight soft voting over the three models
  xgb_test_probs, cat_test_probs, rf_test_probs = [
      model.predict_proba(X_test_final)
      for model in (xgb_model, cat_model, rf_model)
  ]
  avg_test_probs = (xgb_test_probs + cat_test_probs + rf_test_probs) / 3
  predicted_indices = np.argmax(avg_test_probs, axis=1)
  test_labels = label_encoder.inverse_transform(predicted_indices)

  # Fill the submission template and write it back to the same file
  submission = pd.read_csv('/content/drive/MyDrive/Data/submission.csv')
  submission['generated'] = test_labels
  submission.to_csv('/content/drive/MyDrive/Data/submission.csv', index=False)

  print("Finish")

Main Function

In [None]:
def main():
  """Run the full pipeline: load data, train and evaluate the ensemble, predict the test set."""
  X_train, y_train, X_val, y_val, label_encoder = load_data()
  xgb_model, cat_model, rf_model = train_and_evaluate_model(X_train, y_train, X_val, y_val)
  # Pass the models in the order test() declares them: xgb, cat, rf, then the encoder.
  test(xgb_model, cat_model, rf_model, label_encoder)

In [None]:
# Entry-point guard. The special names take double underscores (__name__ / "__main__");
# the single-underscore spelling raises NameError before main() is reached.
if __name__ == "__main__":
  main()