In [33]:
import pandas as pd
import numpy as np
import json
import time
import lightgbm as lgb
import matplotlib.pyplot as plt
import optuna

from sklearn.model_selection import KFold
from collections import Counter
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

from utils.system import *
from metric import get_metric

#### Prep Data

In [34]:
# Read in data
# Every table is loaded from the directory returned by get_data() (star-imported from utils.system).
# NOTE(review): `article` and `readability` are loaded here but are not referenced by any
# later cell visible in this notebook — confirm whether they should be merged or removed.
article = pd.read_csv(get_data() / 'human_annotations_all_8000_overall.csv')
art_emb = pd.read_parquet(get_data() / 'bert_article_emb.parquet.brotli')
sentence_emb = pd.read_parquet(get_data() / 'bert_sentence_cosine.parquet.brotli')  
sent = pd.read_parquet(get_data() / 'bert_sentiment.parquet.brotli')
art_cos = pd.read_parquet(get_data() / 'bert_art_cosine.parquet.brotli')  
emotion = pd.read_parquet(get_data() / 'bert_emotion.parquet.brotli')
topic = pd.read_parquet(get_data() / 'lda_topic.parquet.brotli')
n_gram = pd.read_parquet(get_data() / 'n_gram.parquet.brotli')
lex_div = pd.read_parquet(get_data() / 'lexical_div.parquet.brotli')
readability = pd.read_parquet(get_data() / 'readability.parquet.brotli')
time_data = pd.read_parquet(get_data() / 'time.parquet.brotli')
# Word lexicon; later trimmed to its first rows and turned into per-word count features
lexicon = pd.read_parquet(get_data() / 'bert_word_cosine.parquet.brotli')

In [35]:
# Merge all data together: start from the article embeddings and inner-join each
# remaining feature table on `id`, so only ids present in every table are kept.
merged_emb = art_emb
for _feature_table in (sentence_emb, sent, art_cos, emotion, lex_div, topic, n_gram, time_data):
    merged_emb = merged_emb.merge(_feature_table, on='id', how='inner')

In [36]:
# Stack article embeddings + sentence min/max embeddings together into one flat list per row.
# The three columns are walked in parallel with zip instead of DataFrame.apply(axis=1),
# which has to build a Series object for every row; the per-row lists are the same.
# NOTE(review): `comb_emb` is dropped again in the flatten cell without being read by
# any cell visible in this notebook — confirm it is still needed.
merged_emb['comb_emb'] = pd.Series(
    [
        [*art_vec, *min_vec, *max_vec]
        for art_vec, min_vec, max_vec in zip(
            merged_emb['bert_emb_art'], merged_emb['bert_emb_min'], merged_emb['bert_emb_max']
        )
    ],
    index=merged_emb.index,
    dtype=object,
)

In [37]:
# Keep only the first 800 lexicon rows and discard the previous index
lexicon = lexicon.head(800).reset_index(level=0, drop=True)

#### Word Count Feature

In [38]:
# Count whitespace-separated tokens in each cleaned article
merged_emb['word_count'] = merged_emb['cleaned_article'].map(str.split).map(len)

#### Undersample

In [39]:
# Undersample: draw the same number of rows from each class so both labels are
# equally represented. Every draw is seeded (42, like the other splits in this
# notebook) so the sampled subset — and every metric computed from it — is the
# same after a kernel restart.
undersample = merged_emb.sort_values('overall_label')
df_class_0 = undersample[undersample['overall_label'] == 0]
df_class_1 = undersample[undersample['overall_label'] == 1]
n_samples = min(len(df_class_0), len(df_class_1))
df_class_0_under = df_class_0.sample(n_samples, random_state=42)
df_class_1_under = df_class_1.sample(n_samples, random_state=42)
merged_undersample = pd.concat([df_class_0_under, df_class_1_under], axis=0)
# Shuffle so the two classes are interleaved, then renumber the rows 0..n-1;
# the column-wise concats further down align on this index.
merged_undersample = merged_undersample.sample(frac=1, random_state=42).reset_index(drop=True)

#### TF-IDF Vectorize

In [40]:
# Vectorize Text: TF-IDF over unigrams and bigrams, vocabulary capped at 10,000 terms.
# NOTE(review): the vectorizer is fitted on every row before the train/val/test split
# further down, so the vocabulary and IDF weights are learned from rows that later end
# up in the validation and test sets. Fitting on the training rows only would avoid that.
tfidf_vectorizer = TfidfVectorizer(max_features=10000, ngram_range=(1, 2))
tfidf_features = tfidf_vectorizer.fit_transform(merged_undersample['cleaned_article'])
tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())
# Remove the term 'count' when it is part of the vocabulary (presumably to avoid a
# clash with an existing column — confirm). errors='ignore' keeps the cell from
# raising KeyError on a corpus where 'count' did not make it into the vocabulary.
tfidf_df = tfidf_df.drop(columns='count', errors='ignore')

In [41]:
# Append the TF-IDF columns next to the existing columns (rows are matched on the index).
# This rebinds `merged_undersample` to a widened copy of itself, so running the cell a
# second time appends the same TF-IDF columns again.
merged_undersample = pd.concat([merged_undersample, tfidf_df], axis=1)

#### Convert Lexicon Dictionary to Numerical Format

In [42]:
def create_count_features(article, lexicon):
    """Count how often each lexicon word appears in an article.

    Args:
        article: Text to analyse; it is split on whitespace.
        lexicon: Iterable of words to look up.

    Returns:
        A list with one integer per lexicon word, in lexicon order. Words that
        do not occur in the article get a count of 0.
    """
    tokens = article.split()
    tally = Counter(tokens)
    return [tally[term] for term in lexicon]

In [43]:
# Lexicon words as a plain list; position i of every count vector refers to lexicon_list[i]
lexicon_list = lexicon['word'].tolist()
# For count features: one list of per-word counts for each cleaned article
merged_undersample['count_features'] = merged_undersample['cleaned_article'].apply(
    create_count_features, args=(lexicon_list,)
)

#### Flatten word embeddings into columns

In [44]:
# Function to flatten an array-containing column
def flatten_column(dataframe, column_name):
    """Expand a column of equal-length sequences into one column per element.

    Args:
        dataframe: DataFrame holding the column to expand.
        column_name: Name of a column whose cells are sequences of the same length.

    Returns:
        A DataFrame with columns ``{column_name}_0 .. {column_name}_{k-1}`` that
        carries the same index as ``dataframe``.
    """
    # Stack the per-row sequences into a 2-D (rows x elements) array
    flattened_features = np.stack(dataframe[column_name].values)
    feature_names = [f'{column_name}_{i}' for i in range(flattened_features.shape[1])]
    # Reuse the caller's index: pd.concat(axis=1) aligns on index labels, so a fresh
    # 0..n-1 index would misalign rows whenever `dataframe` is not numbered 0..n-1.
    return pd.DataFrame(flattened_features, columns=feature_names, index=dataframe.index)

In [45]:
# Expand every array-valued column into its own block of per-element columns
flattened_emb_all = flatten_column(merged_undersample, 'bert_emb_art')
flattened_emb_max = flatten_column(merged_undersample, 'bert_emb_max')
flattened_emb_min = flatten_column(merged_undersample, 'bert_emb_min')
flattened_lexicon = flatten_column(merged_undersample, 'count_features')

# Place the expanded blocks in front of the remaining columns; the raw text columns
# and the original array-valued columns are dropped from the remainder
merged_undersample = pd.concat(
    [
        flattened_emb_all,
        flattened_emb_max,
        flattened_emb_min,
        flattened_lexicon,
        merged_undersample.drop(
            columns=[
                'comb_emb', 'text', 'cleaned_article',
                'bert_emb_art', 'bert_emb_max', 'bert_emb_min', 'count_features',
            ]
        ),
    ],
    axis=1,
)

#### Train/Val/Test Split

In [46]:
# Separate the target column from the feature columns
y = merged_undersample['overall_label']
X = merged_undersample.drop(columns='overall_label')

# Hold out 20% of the rows as the test set; the other 80% go on to the next split
X_train_test, X_test, y_train_test, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Divide that 80% into 70% training and 30% validation
X_train, X_val, y_train, y_val = train_test_split(
    X_train_test, y_train_test, test_size=0.3, random_state=42
)

In [47]:
# Create the LightGBM dataset
# `sent_score` and `emotion_num` are declared categorical for both datasets.
# free_raw_data=False asks LightGBM to keep the raw feature data after the Dataset is
# built; these Dataset objects are reused by every Optuna trial and by the final fit.
dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)
dval = lgb.Dataset(X_val, label=y_val, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)

#### Optuna Hyperparameter Optimization

In [48]:
# LightGBM settings shared by every training run in this notebook. They are unpacked
# after the tuned values wherever the two dicts are combined, so these keys win.
fixed_params = {
    # Per the LightGBM parameter docs, pre-filtering must be off when min_data_in_leaf
    # is varied across trainings that reuse one Dataset
    'feature_pre_filter': False,
    # GPU training on a hard-coded platform/device pair (machine specific)
    'device_type': 'gpu',
    'gpu_platform_id': 1,
    'gpu_device_id': 0,
    'gpu_use_dp': True,
    # Binary classification, evaluated with log loss
    'objective': 'binary',
    'metric': 'binary_logloss',
    'verbose': -1,
    'boosting': 'gbdt',
}

In [49]:
def objective(trial):
    """Optuna objective: fit LightGBM with sampled hyperparameters and score it.

    Relies on the notebook-level `fixed_params`, `dtrain`, `dval`, `X_val`, `y_val`
    and `X_test`.

    Args:
        trial: The Optuna trial used to sample the hyperparameters.

    Returns:
        The 'F1 Score (Positive)' value reported by `get_metric` on the
        validation set, using a 0.5 decision threshold.
    """
    # Tunable hyperparameters. The sampled values are passed to LightGBM unrounded:
    # rounding to 3 decimals could turn a small learning_rate into 0.0, and it made
    # the values actually trained differ from what `study.best_params` reports.
    param_tuning = {
        "max_depth": trial.suggest_int("max_depth", 1, 50),
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 0.1),
        "num_leaves": trial.suggest_int("num_leaves", 20, 300),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.1, 1.0),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 1.0),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 20, 200),
        "lambda_l1": trial.suggest_float("lambda_l1", 0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0, 5.0),
    }

    # Combine fixed and tunable hyperparameters (fixed values win on a key clash)
    params = {**param_tuning, **fixed_params}

    # Train the model, stopping once the validation loss has not improved for 25 rounds
    clf = lgb.train(params, dtrain, valid_sets=[dtrain, dval], num_boost_round=1000, callbacks=[lgb.early_stopping(25)])

    # Keep this trial's test-set scores for later inspection only; they must not
    # influence which trial is selected.
    trial.set_user_attr('predictions', clf.predict(X_test))

    # Score the trial on the validation set so the test set stays untouched by
    # hyperparameter selection and the final test metric is not selection-biased.
    y_val_score = clf.predict(X_val)
    y_val_binary = (y_val_score >= 0.5).astype(int)
    f1 = get_metric(y_val, y_val_binary)
    metric = f1.loc[f1['Metric'] == 'F1 Score (Positive)']['Value'].values[0]
    return metric

In [None]:
# Seed the sampler so the sequence of suggested hyperparameters is repeatable across
# kernel restarts. TPESampler is what create_study uses by default for a
# single-objective study, so only the seeding changes.
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=10)

# Get the best parameters
best_params = study.best_params
print("-" * 60)
print("Best parameters:", best_params)

[I 2024-01-09 14:21:50,218] A new study created in memory with name: no-name-ec228034-e7bb-4384-a16b-c31f8e36d904


Training until validation scores don't improve for 25 rounds


In [None]:
# Take the best-scoring trial and threshold the predictions it stored.
# Trials without a value (failed or pruned) are skipped: their `value` is None,
# which cannot be compared while sorting.
top_trial = sorted(
    (t for t in study.trials if t.value is not None),
    key=lambda t: t.value,
    reverse=True,
)[:1]
top_pred = [trial.user_attrs['predictions'] for trial in top_trial]
# Element-wise mean over the selected trials (a single trial here, so a no-op average)
avg_pred = np.mean(top_pred, axis=0)
y_pred_binary = (avg_pred >= 0.50).astype(int)

#### Train Best Model

In [None]:
# Define parameters
# fixed_params is unpacked last, so its keys override any same-named key in best_params
final_params = {**best_params, **fixed_params}

# Train the model
# Up to 1000 boosting rounds, with early stopping after 25 rounds without improvement
lgb_early_stop = lgb.early_stopping(25)
clf = lgb.train(final_params, dtrain, valid_sets=[dtrain, dval], num_boost_round=1000, callbacks=[lgb_early_stop])

In [None]:
# Score the test rows with the model trained above and turn the scores into 0/1 labels at a 0.5 threshold
y_pred = clf.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

In [None]:
# Compare the thresholded test predictions with the true test labels;
# the bare `metric` expression makes the notebook display the result
metric = get_metric(y_test, y_pred_binary)
metric

#### Train Stack Model

In [None]:
# Score the training rows with the current model and threshold at 0.5
y_train_pred = clf.predict(X_train)
y_train_pred_binary = (y_train_pred >= 0.5).astype(int)

# Same for the validation rows
y_val_pred = clf.predict(X_val)
y_val_pred_binary = (y_val_pred >= 0.5).astype(int)

In [None]:
# Create the LightGBM dataset
# NOTE(review): the labels passed here are the previous model's thresholded predictions
# (y_train_pred_binary / y_val_pred_binary), not the annotated y_train / y_val — confirm
# this is the intended "stacking" setup.
# NOTE(review): this rebinds the notebook-level `dtrain` and `dval`, so re-running any
# earlier cell that trains on them after this cell would train on the predicted labels.
dtrain = lgb.Dataset(X_train, label=y_train_pred_binary, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)
dval = lgb.Dataset(X_val, label=y_val_pred_binary, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)

In [None]:
# Define parameters
# Same parameter set as the previous training cell: tuned values plus the fixed settings
final_params = {**best_params, **fixed_params}

# Train the model
# Rebinds `clf`: from here on `clf` is the model fitted on whatever `dtrain`/`dval` currently hold
lgb_early_stop = lgb.early_stopping(25)
clf = lgb.train(final_params, dtrain, valid_sets=[dtrain, dval], num_boost_round=1000, callbacks=[lgb_early_stop])

In [None]:
# Score the test rows with the current `clf` and threshold at 0.5 (overwrites the earlier y_pred / y_pred_binary)
y_pred = clf.predict(X_test)
y_pred_binary = (y_pred >= 0.5).astype(int)

In [None]:
# Evaluate the current thresholded test predictions against the true test labels and display the result
metric = get_metric(y_test, y_pred_binary)
metric

#### Plot Gain/Split

In [None]:
# Collect per-feature importance from the current booster, by gain and by split count
feature_names = clf.feature_name()
gain = clf.feature_importance(importance_type='gain')
split = clf.feature_importance(importance_type='split')
importance = pd.DataFrame({'Feature Name': feature_names, 'Gain': gain, 'Split': split})

# Two orderings of the same table, each from the largest value to the smallest
importance_gain = importance.sort_values('Gain', ascending=False)
importance_split = importance.sort_values('Split', ascending=False)

In [None]:
# Horizontal bar chart of the 100 features with the largest gain
fig, ax = plt.subplots(figsize=(20, 20))
ax.barh(importance_gain['Feature Name'].head(100), importance_gain['Gain'].head(100), color='skyblue')
ax.set_xlabel('Gain')
ax.set_ylabel('Feature')
ax.set_title('Gain')
plt.show()

In [None]:
# Horizontal bar chart of the 100 features with the largest split count
fig, ax = plt.subplots(figsize=(20, 20))
ax.barh(importance_split['Feature Name'].head(100), importance_split['Split'].head(100), color='lightgreen')
ax.set_xlabel('Split')
ax.set_ylabel('Feature')
ax.set_title('Split')
plt.show()

#### 5-Fold Cross-Validation

In [None]:
# Define parameters
final_params = {**best_params, **fixed_params}

# CV: 5 shuffled folds with a fixed seed
n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
collect_metric = []

# Splitting the data into features and label
features = merged_undersample.drop('overall_label', axis=1)
label = merged_undersample['overall_label']

start_time = time.time()

# KFold only needs the number of rows, so split on `features` directly instead of
# rebinding the notebook-level `X` (the feature frame) to a bare range.
for fold, (train_idx, val_idx) in enumerate(kf.split(features)):
    print(f"Running Fold {fold + 1}/{n_splits}")

    # train_idx / val_idx are row positions, so both the features and the labels are
    # selected positionally; label-based `label[idx]` only matches on a 0..n-1 index.
    features_train, features_val = features.iloc[train_idx], features.iloc[val_idx]
    label_train, label_val = label.iloc[train_idx], label.iloc[val_idx]

    # Split train into train/validation: 10% of the fold's training rows are held out
    # for early stopping. Seeded so each fold's inner split is repeatable.
    features_train_nested, features_val_nested, label_train_nested, label_val_nested = train_test_split(
        features_train, label_train, test_size=0.1, random_state=42
    )

    # Setup LGB Dataset
    dtrain_nested = lgb.Dataset(features_train_nested, label=label_train_nested, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)
    dval_nested = lgb.Dataset(features_val_nested, label=label_val_nested, categorical_feature=['sent_score', 'emotion_num'], free_raw_data=False)

    # Train the model, stopping after 25 rounds without improvement on the inner validation set
    lgb_early_stop = lgb.early_stopping(25)
    clf = lgb.train(final_params, dtrain_nested, valid_sets=[dval_nested], num_boost_round=1000, callbacks=[lgb_early_stop])

    # Predict on the fold's held-out rows and score at a 0.5 threshold
    label_pred = clf.predict(features_val)
    label_pred_binary = (label_pred >= 0.5).astype(int)
    metric = get_metric(label_val, label_pred_binary)
    collect_metric.append(metric)

total_time = time.time() - start_time
print(f"Total Time: {total_time} seconds")

In [None]:
# Stack the per-fold metric tables; rows that share an index label belong to the same metric
comb_metric = pd.concat(collect_metric, axis=0)
numeric_cols = comb_metric.select_dtypes(include='number')
# Average every numeric column across folds, one output row per index label
final_metric = numeric_cols.groupby(level=0).mean()
# Re-attach the metric names, taken from the first fold's table
final_metric['Metric'] = collect_metric[0]['Metric']
final_metric