In [3]:
# k-NN implementation method
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score

# Load the TF-IDF feature tables and the submission template.
train_df = pd.read_csv('train_tfidf_features.csv')
test_df = pd.read_csv('test_tfidf_features.csv')
submission_df = pd.read_csv('sample_submission.csv')

# Separate the predictors from the target column ('label').
X = train_df.drop('label', axis=1)
y = train_df['label']

# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features. The scaler statistics are learned from the training
# split only, so the validation rows do not leak into the preprocessing.
# NOTE(review): assumes test_df has exactly the feature columns of X — confirm.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_df_scaled = scaler.transform(test_df)

# Initialize and train the k-NN model on the scaled training split.
knn_model = KNeighborsClassifier(n_neighbors=5)
knn_model.fit(X_train_scaled, y_train)

# Predict and evaluate on the validation set.
y_pred_knn = knn_model.predict(X_val_scaled)
# Column 1 of predict_proba is the score for the second entry of knn_model.classes_.
y_proba_knn = knn_model.predict_proba(X_val_scaled)[:, 1]

print("k-NN Performance:")
print(classification_report(y_val, y_pred_knn))
print(f"ROC AUC: {roc_auc_score(y_val, y_proba_knn)}")

# Refit on the full training data before predicting on the test set.
# k-NN is distance-based, so the rows it is fitted on must live in the same
# (standardized) feature space as the rows it predicts. Previously the model was
# refit on the raw, unscaled X while test_df_scaled was standardized, which made
# the neighbour distances meaningless. Reusing the already-fitted scaler keeps
# the final model consistent with both test_df_scaled and the validated setup.
X_scaled = scaler.transform(X)
knn_model.fit(X_scaled, y)

# Predict on the (scaled) test set.
test_predictions = knn_model.predict(test_df_scaled)

# Write the predictions into the submission template and save it.
submission_df['label'] = test_predictions
submission_df.to_csv('submission8.csv', index=False)


k-NN Performance:
              precision    recall  f1-score   support

           0       0.71      0.34      0.46      2153
           1       0.41      0.77      0.53      1284

    accuracy                           0.50      3437
   macro avg       0.56      0.55      0.50      3437
weighted avg       0.60      0.50      0.49      3437

ROC AUC: 0.5801278155670635


