In [1]:
# Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, roc_auc_score, f1_score
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt



In [2]:
# Load and preprocess training data
def load_and_preprocess(path):
    df = pd.read_json(path, lines=True)
    df['text'] = df['text'].apply(lambda tokens: ' '.join(map(str, tokens)))
    return df

df_domain1 = load_and_preprocess('domain1_train_data.json')
df_domain2 = load_and_preprocess('domain2_train_data.json')
df_combined = pd.concat([df_domain1, df_domain2], ignore_index=True)

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df_combined['text'])
y = df_combined['label'].values



In [None]:
# Prepare K-Fold Cross-Validation
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Metrics storage
accuracies, f1_scores, roc_auc_scores = [], [], []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Train RandomForest and predict probabilities
    clf_rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
    rf_probs_train = clf_rf.predict_proba(X_train)[:, 1]
    rf_probs_test = clf_rf.predict_proba(X_test)[:, 1]

    # Augment training and test sets with RF probabilities
    X_train_aug = np.hstack((X_train.toarray(), rf_probs_train.reshape(-1, 1)))
    X_test_aug = np.hstack((X_test.toarray(), rf_probs_test.reshape(-1, 1)))

    # Train SVM on augmented training set and predict
    clf_svm = SVC(kernel='linear', C=1, probability=True, random_state=42).fit(X_train_aug, y_train)
    svm_preds = clf_svm.predict(X_test_aug)

    # Calculate metrics
    accuracies.append(accuracy_score(y_test, svm_preds))
    f1_scores.append(f1_score(y_test, svm_preds))
    roc_auc_scores.append(roc_auc_score(y_test, clf_svm.decision_function(X_test_aug)))
    
    print(accuracies, f1_scores, roc_auc_scores)

# Print average metrics
print(f"Average Accuracy: {np.mean(accuracies):.4f}")
print(f"Average F1 Score: {np.mean(f1_scores):.4f}")
print(f"Average ROC AUC: {np.mean(roc_auc_scores):.4f}")



[0.8308333333333333] [0.5044751830756713] [0.8784346042575821]
[0.8308333333333333, 0.84] [0.5044751830756713, 0.5199999999999999] [0.8784346042575821, 0.8863227179199401]
[0.8308333333333333, 0.84, 0.8405555555555555] [0.5044751830756713, 0.5199999999999999, 0.5400641025641026] [0.8784346042575821, 0.8863227179199401, 0.8794472461912438]


In [None]:
# Train models on the entire dataset for final prediction
clf_rf_final = RandomForestClassifier(n_estimators=100, random_state=42).fit(X, y)
rf_probs = clf_rf_final.predict_proba(X)[:, 1]

# Augment the entire dataset with RF probabilities for SVM training
X_aug = np.hstack((X.toarray(), rf_probs.reshape(-1, 1)))
clf_svm_final = SVC(kernel='linear', C=1, probability=True, random_state=42).fit(X_aug, y)



In [None]:
# Load, preprocess, and predict on test data
df_test = load_and_preprocess('test_data.json')
X_test = vectorizer.transform(df_test['text'])
rf_probs_test = clf_rf_final.predict_proba(X_test)[:, 1]
X_test_aug = np.hstack((X_test.toarray(), rf_probs_test.reshape(-1, 1)))

# Final predictions
test_predictions = clf_svm_final.predict(X_test_aug)

# Save predictions to CSV
output_df = pd.DataFrame({'id': df_test.index, 'class': test_predictions})
output_csv_path = 'final_augmented_predictions.csv'
output_df.to_csv(output_csv_path, index=False)

print(f"Final augmented predictions saved to {output_csv_path}")