# Train Model


In [None]:
import pandas as pd
import numpy as np
import re
import ast
from collections import Counter
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor, LGBMClassifier
from sklearn.metrics import mean_absolute_error, r2_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from pythainlp import word_tokenize
import gdown

In [None]:
# Google Drive source and local file name for the dataset that is scored
# later in this script.
url = "https://drive.google.com/uc?id=1TWaXhd9-3PqjusF3lgyA_UKI2qwOE7mU"
output = "gdf_public_impact.csv"

# Download dataset from Google Drive
gdown.download(url, output, quiet=False)

# Load the CSV file
cleaned_df = pd.read_csv(output)

# Load the sentence-embedding model once at module level; create_features()
# looks for this `emb_model` global before loading its own copy.
print("Loading embedding model...")
emb_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')
print("Embedding model loaded.")


In [None]:
# Download the labeled training data and load it into a DataFrame. The cells
# below read its `comment`, `ai_score` and `ai_urgency` columns.
# Note: `url` and `output` are rebound here, replacing the earlier values.
url = "https://drive.google.com/uc?id=1dgpJ9eOMrluSrpQl5tvEKxC8ioa2zH2S"
output = "llm_score.csv"
gdown.download(url, output, quiet=False)
df_train = pd.read_csv(output)

In [None]:
# Extract distinctive risk-related keywords from labeled data
def extract_distinctive_keywords(df, top_n=50, min_count=5, min_ratio=2.0,
                                 tokenizer=None):
    """Return the tokens most over-represented in high-risk comments.

    Rows are split into a high-risk group (``ai_score >= 8`` or ``ai_urgency``
    equal to "high", ignoring case and surrounding whitespace) and a normal
    group (everything else). Every token is scored with the smoothed ratio
    ``P(token | high-risk) / P(token | normal)`` and the highest-scoring
    tokens are returned.

    Args:
        df: DataFrame with ``comment``, ``ai_score`` and ``ai_urgency``
            columns.
        top_n: Maximum number of keywords to return.
        min_count: Tokens whose combined count over both groups is below this
            value are ignored as noise.
        min_ratio: A token is kept only if its ratio is strictly greater than
            this value.
        tokenizer: Optional callable taking one string and returning an
            iterable of tokens. When omitted, PyThaiNLP ``word_tokenize`` is
            used with ``engine="newmm"`` and ``keep_whitespace=False``.

    Returns:
        List of at most ``top_n`` tokens, ordered by descending ratio. Tokens
        with equal ratios are ordered alphabetically so the result is
        reproducible from run to run.
    """
    print(f"Analyzing {len(df)} records to identify distinctive risk keywords...")

    def _default_tokenize(text):
        # PyThaiNLP tokenizer (newmm engine), dropping whitespace tokens.
        return word_tokenize(text, engine="newmm", keep_whitespace=False)

    tokenize = tokenizer if tokenizer is not None else _default_tokenize

    # 1. Split into two groups: high-risk vs. others.
    #    The label is stripped before comparing so that values such as
    #    " High " are recognised, matching how the urgency labels are
    #    normalised when the classification target is built.
    #    Adjust this rule according to your actual dataset.
    urgency = df["ai_urgency"].str.strip().str.lower()
    mask_high = (df["ai_score"] >= 8) | (urgency == "high")

    df_high = df[mask_high]
    df_normal = df[~mask_high]

    print(f"   High-urgency rows: {len(df_high)}")
    print(f"   Normal rows: {len(df_normal)}")

    # 2. Tokenize one group's comments and count token frequencies.
    def count_tokens(text_series):
        text_blob = " ".join(text_series.dropna().astype(str).tolist())
        # Tokens of length 1 are dropped as uninformative.
        return Counter(t for t in tokenize(text_blob) if len(t) > 1)

    # 3. Count both groups.
    print("   Tokenizing and counting...")
    cnt_high = count_tokens(df_high["comment"])
    cnt_normal = count_tokens(df_normal["comment"])

    total_high = sum(cnt_high.values())
    total_normal = sum(cnt_normal.values())

    # 4. Distinctiveness score:
    #    ratio = P(word | high urgency) / P(word | normal)
    #    A higher ratio means the word is much more common in high-risk text.
    word_scores = {}
    for word in cnt_high.keys() | cnt_normal.keys():
        # Skip rare words (noise).
        if cnt_high[word] + cnt_normal[word] < min_count:
            continue

        # Add-1 smoothing avoids division by zero for one-sided words.
        p_high = (cnt_high[word] + 1) / (total_high + 1)
        p_normal = (cnt_normal[word] + 1) / (total_normal + 1)
        risk_ratio = p_high / p_normal

        # Keep only strongly distinctive words.
        if risk_ratio > min_ratio:
            word_scores[word] = risk_ratio

    # 5. Highest ratio first. The word itself is the secondary key because
    #    set iteration order is not stable across interpreter runs, so ties
    #    would otherwise come out in a different order each time.
    sorted_keywords = sorted(
        word_scores.items(), key=lambda item: (-item[1], item[0])
    )
    final_keywords = [word for word, _ in sorted_keywords[:top_n]]

    print(f"Distinctive keywords identified: {len(final_keywords)}")
    print(f"Top 10 example keywords: {final_keywords[:10]}")

    return final_keywords


# Build the keyword list from the labeled training data. The same list is
# passed to create_features() for both the training set and the data that is
# scored later.
risk_keywords = extract_distinctive_keywords(df_train, top_n=50)


In [None]:
def create_features(df, keywords, is_training=True):
    """Build the model feature matrix for the comments in ``df``.

    Each row consists of the sentence embedding of the comment followed by
    one extra column: the number of entries of ``keywords`` that occur in the
    comment as substrings.

    Args:
        df: DataFrame with a ``comment`` column. Missing comments are treated
            as empty strings; other values are converted with ``str``.
        keywords: Iterable of keyword strings to look for.
        is_training: Not used by the function body; features are built the
            same way for training and inference. Kept so existing call sites
            continue to work.

    Returns:
        The embeddings and the keyword-count column stacked horizontally
        with ``np.hstack`` (assumes ``emb_model.encode`` returns one row per
        input text).
    """
    # Declared up front so the lazy load below binds the module-level name.
    global emb_model

    print(f"Generating features for {len(df)} rows...")

    # A. Text embeddings (semantic representation)
    # Note: for large datasets (e.g., 100k rows) this step may take
    # 15-30 minutes on CPU.
    # Load the embedding model only if no earlier cell has done so.
    if 'emb_model' not in globals():
        print("Loading embedding model...")
        from sentence_transformers import SentenceTransformer
        emb_model = SentenceTransformer('paraphrase-multilingual-MiniLM-L12-v2')

    # Normalise the text once so the embedding and the keyword count are
    # computed from exactly the same strings.
    texts = df['comment'].fillna('').astype(str).tolist()

    embeddings = emb_model.encode(
        texts,
        batch_size=64,
        show_progress_bar=True
    )

    # B. Keyword-based feature: number of risky keywords found per comment.
    # Materialise the keywords so a one-shot iterable is not exhausted after
    # the first comment.
    keywords = tuple(keywords)
    keyword_counts = np.array(
        [sum(1 for w in keywords if w in text) for text in texts]
    ).reshape(-1, 1)

    # Combine all features (embedding vectors + keyword count).
    return np.hstack([embeddings, keyword_counts])


# Prepare training features
print("Preparing training data...")
X_train_full = create_features(df_train, risk_keywords, is_training=True)

# Regression target.
# NOTE: rows with a missing `ai_score` are trained with a score of 0.
y_score = df_train['ai_score'].fillna(0).values

# Classification target: urgency label -> integer class id.
# Labels are stripped and lower-cased before mapping; missing or unmapped
# labels fall back to class 0 ('low').
urgency_map = {'low': 0, 'medium': 1, 'high': 2}
y_class = (
    df_train['ai_urgency']
    .str.strip()
    .str.lower()
    .map(urgency_map)
    .fillna(0)
    .astype(int)
    .values
)

# Split the features and BOTH targets in a single call. train_test_split
# cuts every array it is given on the same row indices, so the regression
# and classification targets stay aligned with X_train / X_test by
# construction instead of relying on two calls sharing a random_state.
(
    X_train, X_test,
    y_train_r, y_test_r,
    y_train_c, y_test_c,
) = train_test_split(
    X_train_full, y_score, y_class, test_size=0.2, random_state=42
)


In [None]:
# Fit the score regressor on the training split.
print("Training Regression Model...")
reg_params = dict(n_estimators=500, learning_rate=0.05, random_state=42)
reg_model = LGBMRegressor(**reg_params)
reg_model.fit(X_train, y_train_r)

# # Evaluate
# y_pred_r = reg_model.predict(X_test)
# y_pred_r = np.clip(y_pred_r, 0, 10) # force predictions into the 0-10 range

# print(f"MAE: {mean_absolute_error(y_test_r, y_pred_r):.2f}")
# print(f"R2 Score: {r2_score(y_test_r, y_pred_r):.4f}")

# # Plot
# plt.figure(figsize=(5, 5))
# plt.scatter(y_test_r, y_pred_r, alpha=0.5, color='royalblue')
# plt.plot([0, 10], [0, 10], 'r--')
# plt.title('Actual vs Predicted Score')
# plt.show()

In [None]:
# Fit the urgency classifier on the same training rows as the regressor.
print("Training Classification Model...")
cls_params = dict(
    n_estimators=500,
    learning_rate=0.05,
    class_weight='balanced',
    random_state=42,
)
cls_model = LGBMClassifier(**cls_params)
cls_model.fit(X_train, y_train_c)

# # Evaluate
# y_pred_c = cls_model.predict(X_test)

# print(classification_report(y_test_c, y_pred_c, target_names=['Low', 'Medium', 'High']))

# # Confusion Matrix
# cm = confusion_matrix(y_test_c, y_pred_c)
# plt.figure(figsize=(5, 4))
# sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=['Low', 'Med', 'High'], yticklabels=['Low', 'Med', 'High'])
# plt.title('Confusion Matrix')
# plt.show()

In [None]:
# Apply the trained models to the large dataset:
# take a bounded sample, build its features, then predict class and score.
print("Loading and sampling data...")

# 1. Sampling
SAMPLE_SIZE = 200_000  # Upper bound on the number of rows that get scored

if len(cleaned_df) <= SAMPLE_SIZE:
    print("   Dataset is smaller than sample target. Using full dataset.")
    clean_df = cleaned_df.copy()
else:
    print(f"   Data is larger than {SAMPLE_SIZE}. Sampling now...")
    # A fixed random_state keeps the sample reproducible.
    sampled_rows = cleaned_df.sample(n=SAMPLE_SIZE, random_state=42)
    clean_df = sampled_rows.reset_index(drop=True)

print(f"   Ready to process: {len(clean_df):,} rows")

# 2. Feature generation (returns the feature matrix only)
print("   Generating features (embedding computation may take time)...")
X_clean = create_features(clean_df, risk_keywords, is_training=False)

# 3. Prediction
print("   Predicting...")

# A. Urgency class id and its readable label (Low/Medium/High)
class_map = {0: 'Low', 1: 'Medium', 2: 'High'}
clean_df['predicted_class_id'] = cls_model.predict(X_clean)
clean_df['predicted_urgency'] = clean_df['predicted_class_id'].map(class_map)

# B. Urgency score: rounded to one decimal, then limited to the 0-10 range
rounded_scores = np.round(reg_model.predict(X_clean), 1)
clean_df['predicted_score'] = np.clip(rounded_scores, 0, 10)


In [None]:
# Analysis plots: three side-by-side views of the model predictions
# (score distribution, class counts, score per class).
print("\nGenerating analysis plots...")

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# Plot 1: Score distribution
sns.histplot(
    data=clean_df,
    x='predicted_score',
    bins=20,
    kde=True,
    color='skyblue',
    ax=axes[0]
)
# The en dash in the title was stored as mis-decoded bytes and rendered as
# garbage in the figure; it is written correctly here.
axes[0].set_title('Distribution of Predicted Scores (0–10)')
axes[0].set_xlabel('Predicted Score')
axes[0].set_ylabel('Count')

# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is
# passed without `hue` (plots 2 and 3) -- confirm the installed seaborn
# version before changing these calls.

# Plot 2: Urgency class counts
sns.countplot(
    data=clean_df,
    x='predicted_urgency',
    order=['Low', 'Medium', 'High'],
    palette='viridis',
    ax=axes[1]
)
axes[1].set_title('Count of Predicted Urgency Classes')
axes[1].set_xlabel('Urgency Class')
axes[1].set_ylabel('Count')

# Plot 3: Consistency check (score vs class)
sns.boxplot(
    data=clean_df,
    x='predicted_urgency',
    y='predicted_score',
    order=['Low', 'Medium', 'High'],
    palette='viridis',
    ax=axes[2]
)
axes[2].set_title('Consistency Check: Score vs Urgency Class')
axes[2].set_ylabel('Predicted Score')

plt.tight_layout()
plt.show()

In [None]:
# Hybrid scoring: blend the text-model score with the reopen and
# public-impact metadata columns.
print("\nCalculating final hybrid score...")

# 1. Missing metadata counts as zero.
for meta_col in ('count_reopen_log', 'public_impact'):
    clean_df[meta_col] = clean_df[meta_col].fillna(0)

# 2. Rescale 'count_reopen_log' so that a value of 4 maps to 10, capped at 10.
#    Assumption: log >= 4 (roughly 50+ reopen events) is treated as the
#    maximum score.
reopen_scaled = clean_df['count_reopen_log'] / 4.0 * 10
clean_df['score_reopen_norm'] = reopen_scaled.clip(upper=10)

# 3. Rescale 'public_impact' (0-100) onto 0-10, capped at 10.
impact_scaled = clean_df['public_impact'] / 100 * 10
clean_df['score_impact_norm'] = impact_scaled.clip(upper=10)

# 4. Weighted sum: AI text (60%) + reopen (10%) + impact (30%).
w_ai, w_reopen, w_impact = 0.6, 0.1, 0.3

ai_part = clean_df['predicted_score'] * w_ai
reopen_part = clean_df['score_reopen_norm'] * w_reopen
impact_part = clean_df['score_impact_norm'] * w_impact

clean_df['final_hybrid_score'] = (ai_part + reopen_part + impact_part).round(2)

# 5. Re-classify into final priority levels
def categorize_hybrid(score):
    """Map a hybrid score to its priority label.

    Thresholds are inclusive lower bounds: 8.0 -> 'Critical', 6.0 -> 'High',
    4.0 -> 'Medium'. Anything that reaches none of them (including NaN,
    which fails every comparison) is 'Low'.
    """
    # Ordered from the highest threshold down; the first one reached wins.
    bands = ((8.0, 'Critical'), (6.0, 'High'), (4.0, 'Medium'))
    for lower_bound, label in bands:
        if score >= lower_bound:
            return label
    return 'Low'

# Label every row by passing its hybrid score through categorize_hybrid().
clean_df['final_priority_class'] = clean_df['final_hybrid_score'].apply(categorize_hybrid)

In [None]:
# Visualization: evaluation of the hybrid scoring
# (score shift, relation to the reopen column, final class counts).
print("Generating hybrid analysis plots...")

fig, axes = plt.subplots(1, 3, figsize=(18, 6))

# One colour per priority class, shared by plots 2 and 3 so the two
# figures cannot drift apart.
priority_palette = {
    'Low': 'green',
    'Medium': 'orange',
    'High': 'red',
    'Critical': 'darkred',
}

# Plot 1: Comparison between AI-only and hybrid scores
sns.kdeplot(clean_df['predicted_score'], color='skyblue',
            label='AI Text Only', fill=True, ax=axes[0])
sns.kdeplot(clean_df['final_hybrid_score'], color='orange',
            label='Hybrid Score', fill=True, alpha=0.5, ax=axes[0])
axes[0].legend()
axes[0].set_title('Impact of Metadata on Scores')
# The en dash in the label was stored as mis-decoded bytes and rendered as
# garbage in the figure; it is written correctly here.
axes[0].set_xlabel('Score (0–10)')

# Plot 2: Relationship between reopen count and hybrid score.
# At most 2000 rows are drawn (fixed seed) to keep the scatter readable.
sample_n = min(2000, len(clean_df))
plot_data = clean_df.sample(n=sample_n, random_state=42)

sns.scatterplot(
    data=plot_data,
    x='final_hybrid_score',
    y='count_reopen_log',
    hue='final_priority_class',
    palette=priority_palette,
    alpha=0.6,
    ax=axes[1]
)
axes[1].set_title('Hybrid Score vs. Reopen Log')

# Plot 3: Distribution of final priority classes
# NOTE(review): seaborn >= 0.13 emits a FutureWarning when `palette` is
# passed without `hue` -- confirm the installed seaborn version before
# changing this call.
sns.countplot(
    x='final_priority_class',
    data=clean_df,
    order=['Low', 'Medium', 'High', 'Critical'],
    palette=priority_palette,
    ax=axes[2]
)
axes[2].set_title('Final Priority Distribution')

plt.tight_layout()
plt.show()

In [None]:
# Find the rows whose score rose the most once metadata was blended in.
# score_lift = hybrid score minus the text-only predicted score.
clean_df['score_lift'] = clean_df['final_hybrid_score'].sub(clean_df['predicted_score'])

ranked_by_lift = clean_df.sort_values(by='score_lift', ascending=False)
hidden_gems = ranked_by_lift.head(10)

print("\nTop 10 Cases Boosted by Metadata:")
cols_show = ['comment', 'predicted_score', 'count_reopen_log',
             'public_impact', 'final_hybrid_score', 'final_priority_class']
print(hidden_gems[cols_show])

# clean_df.to_csv('final_scored_hybrid.csv', index=False)


In [None]:
# File Save: write the scored DataFrame to data/result.csv.
import os

folder = "data"
file_to_save = "result.csv"

# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (no-op when it is already there).
os.makedirs(folder, exist_ok=True)
clean_df.to_csv(os.path.join(folder, file_to_save), index=False)

# The leading emoji was stored as mis-decoded bytes and printed as garbage;
# it is written correctly here.
print(f"\n💾 Saved to {folder}/{file_to_save}")