In [9]:
# features engineering method

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the data
train_df = pd.read_csv('train_tfidf_features.csv')
test_df = pd.read_csv('test_tfidf_features.csv')
submission_df = pd.read_csv('sample_submission.csv')

# Inspect the columns to find the correct text column name
print(train_df.columns)
print(test_df.columns)

# Replace 'text' with the actual column name for text data
text_column_name = 'id'  # Update this with the actual name

# Convert the text column to strings
train_df[text_column_name] = train_df[text_column_name].astype(str)
test_df[text_column_name] = test_df[text_column_name].astype(str)

# Extract text features
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(train_df[text_column_name])
test_tfidf = vectorizer.transform(test_df[text_column_name])

# Combine TF-IDF features with other features
X_other = train_df.drop(['label', text_column_name], axis=1)
X = pd.concat([pd.DataFrame(X_tfidf.toarray()), X_other.reset_index(drop=True)], axis=1)

# Repeat for test data
X_other_test = test_df.drop(text_column_name, axis=1)
test_combined = pd.concat([pd.DataFrame(test_tfidf.toarray()), X_other_test.reset_index(drop=True)], axis=1)

# Convert all column names to strings
X.columns = X.columns.astype(str)
test_combined.columns = test_combined.columns.astype(str)

# Split data
y = train_df['label']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
test_scaled = scaler.transform(test_combined)

# PCA
pca = PCA(n_components=100)
X_train_pca = pca.fit_transform(X_train_scaled)
X_val_pca = pca.transform(X_val_scaled)
test_pca = pca.transform(test_scaled)

# k-NN model
knn_model = KNeighborsClassifier()
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11, 13, 15],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}

# Grid search
grid_search = GridSearchCV(knn_model, param_grid, cv=3, scoring='roc_auc', verbose=2, n_jobs=-1)
grid_search.fit(X_train_pca, y_train)
best_params = grid_search.best_params_
print(f"Best parameters: {best_params}")

# Train with best parameters
best_knn_model = KNeighborsClassifier(**best_params)
best_knn_model.fit(X_train_pca, y_train)

# Predict and evaluate
y_pred_knn = best_knn_model.predict(X_val_pca)
y_proba_knn = best_knn_model.predict_proba(X_val_pca)[:, 1]

print("k-NN Performance:")
print(classification_report(y_val, y_pred_knn))
print(f"ROC AUC: {roc_auc_score(y_val, y_proba_knn)}")

# Train on full data
best_knn_model.fit(pca.transform(X), y)

# Predict on test set
test_predictions = best_knn_model.predict(test_pca)

# Prepare submission
submission_df['label'] = test_predictions
submission_df.to_csv('submission9.csv', index=False)


Index(['id', 'label', '0', '1', '2', '3', '4', '5', '6', '7',
       ...
       '4990', '4991', '4992', '4993', '4994', '4995', '4996', '4997', '4998',
       '4999'],
      dtype='object', length=5002)
Index(['id', '0', '1', '2', '3', '4', '5', '6', '7', '8',
       ...
       '4990', '4991', '4992', '4993', '4994', '4995', '4996', '4997', '4998',
       '4999'],
      dtype='object', length=5001)
Fitting 3 folds for each of 42 candidates, totalling 126 fits
Best parameters: {'metric': 'euclidean', 'n_neighbors': 13, 'weights': 'distance'}
k-NN Performance:
              precision    recall  f1-score   support

           0       0.69      0.79      0.73      2153
           1       0.53      0.40      0.45      1284

    accuracy                           0.64      3437
   macro avg       0.61      0.59      0.59      3437
weighted avg       0.63      0.64      0.63      3437

ROC AUC: 0.6392024531444207


