In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the TF-IDF feature tables and the submission template
train_df = pd.read_csv('train_tfidf_features.csv')
test_df = pd.read_csv('test_tfidf_features.csv')
submission_df = pd.read_csv('sample_submission.csv')

# Separate the feature matrix from the target column
X = train_df.drop('label', axis=1)
y = train_df['label']

# Hold out 20% of the rows for validation. Stratifying on y keeps the class
# proportions the same in both splits, so the validation metrics are not
# skewed by an unlucky draw when the classes are imbalanced.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Standardize features. The scaler is fitted on the training split only, so
# no validation or test statistics leak into the scaling parameters.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Select the training feature columns, in training order, before scaling the
# test set, so an extra column or a different column order in the test file
# cannot misalign features. A feature missing from the test file raises a
# KeyError here instead of being silently ignored.
test_scaled = scaler.transform(test_df[X.columns])


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Random forest baseline: 100 trees, seeded so the run is repeatable.
# fit() returns the fitted estimator, so construction and training are chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Validation outputs: hard labels feed the classification report, and the
# second predict_proba column (presumably the positive class - confirm the
# label encoding) feeds ROC AUC.
y_pred_rf = rf_model.predict(X_val_scaled)
y_proba_rf = rf_model.predict_proba(X_val_scaled)[:, 1]

# Compute both metrics first, then emit the same three lines of output.
rf_report = classification_report(y_val, y_pred_rf)
rf_auc = roc_auc_score(y_val, y_proba_rf)
print("Random Forest Performance:", rf_report, f"ROC AUC: {rf_auc}", sep="\n")


In [None]:
import xgboost as xgb

# Gradient-boosted trees with log-loss as the evaluation metric, seeded so the
# run is repeatable.
# NOTE(review): `use_label_encoder` is reported as deprecated in recent XGBoost
# releases - confirm against the installed version before removing it.
xgb_params = dict(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model = xgb.XGBClassifier(**xgb_params)
xgb_model.fit(X_train_scaled, y_train)

# Validation outputs: hard labels feed the classification report, and the
# second predict_proba column (presumably the positive class - confirm the
# label encoding) feeds ROC AUC.
y_pred_xgb = xgb_model.predict(X_val_scaled)
y_proba_xgb = xgb_model.predict_proba(X_val_scaled)[:, 1]

# Compute both metrics first, then emit the same three lines of output.
xgb_report = classification_report(y_val, y_pred_xgb)
xgb_auc = roc_auc_score(y_val, y_proba_xgb)
print("XGBoost Performance:", xgb_report, f"ROC AUC: {xgb_auc}", sep="\n")


In [None]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

# Base learners for the stack: fresh, unfitted estimators configured with the
# same hyperparameters as the standalone random forest and XGBoost models.
rf_base = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_base = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
base_models = [('rf', rf_base), ('xgb', xgb_base)]

# Logistic regression (default settings) combines the base learners' outputs.
meta_model = LogisticRegression()

# Train the ensemble with cv=5 on the scaled training split.
# fit() returns the fitted estimator, so construction and training are chained.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
).fit(X_train_scaled, y_train)

# Validation outputs: hard labels feed the classification report, and the
# second predict_proba column (presumably the positive class - confirm the
# label encoding) feeds ROC AUC.
y_pred_stack = stacking_model.predict(X_val_scaled)
y_proba_stack = stacking_model.predict_proba(X_val_scaled)[:, 1]

# Compute both metrics first, then emit the same three lines of output.
stack_report = classification_report(y_val, y_pred_stack)
stack_auc = roc_auc_score(y_val, y_proba_stack)
print("Stacking Performance:", stack_report, f"ROC AUC: {stack_auc}", sep="\n")


In [None]:
# Score the scaled test features with the stacked ensemble, the model this
# notebook uses for the final submission.
test_predictions = stacking_model.predict(test_scaled)

# Store the predictions in the template's 'label' column, then write the file
# without the DataFrame index.
submission_path = 'submission.csv'
submission_df['label'] = test_predictions
submission_df.to_csv(submission_path, index=False)
