In [6]:
from pathlib import Path

import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# --- Configuration: everything a reader might want to tune ---
DATA_DIR = Path('../anaconda3/just-ai-249-machine-learning-lab-competition')
N_FEATURES_TO_SELECT = 10   # number of features RFE keeps
RFE_STEP = 40               # number of features RFE removes per iteration
SVM_C = 40                  # regularisation strength of the final RBF SVM
# NOTE(review): gamma=25 on standardised features gives a very narrow RBF
# kernel and is likely to overfit -- judge it by the cross-validated score
# printed below, not by the training-set accuracy.
SVM_GAMMA = 25
CV_FOLDS = 5

# Load the data
df_train = pd.read_csv(DATA_DIR / 'spambase_train.csv')
df_test = pd.read_csv(DATA_DIR / 'spambase_test.csv')

# Preprocessing: separate the target and drop the row identifier,
# which carries no predictive information.
X_train = df_train.drop(columns=['spam', 'ID'])
y_train = df_train["spam"]
X_test = df_test.drop(columns=['ID'])

# Normalize the data (statistics are learned from the training set only)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Feature selection: recursive feature elimination ranked by a linear SVM
estimator = SVC(kernel='linear')
selector = RFE(estimator, n_features_to_select=N_FEATURES_TO_SELECT, step=RFE_STEP)
X_train_selected = selector.fit_transform(X_train_s, y_train)
X_test_selected = selector.transform(X_test_s)

# Train the final RBF SVM on the full training set
best_svm = SVC(kernel='rbf', C=SVM_C, gamma=SVM_GAMMA)
best_svm.fit(X_train_selected, y_train)

# Evaluate the model on the training set.
# This is an optimistic number: the model has already seen these rows.
y_pred_train = best_svm.predict(X_train_selected)

# Compute accuracy
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy on training set: {accuracy_train}")

# Generate classification report
report_train = classification_report(y_train, y_pred_train)
print(report_train)

# Honest generalisation estimate: cross-validate the *whole* procedure.
# Scaling and feature selection sit inside the pipeline so they are re-fit on
# each training fold and the held-out fold never influences them.
cv_pipeline = make_pipeline(
    StandardScaler(),
    RFE(SVC(kernel='linear'), n_features_to_select=N_FEATURES_TO_SELECT, step=RFE_STEP),
    SVC(kernel='rbf', C=SVM_C, gamma=SVM_GAMMA),
)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=CV_FOLDS, scoring='accuracy')
print(f"{CV_FOLDS}-fold cross-validated accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Make predictions on the test set
y_pred_test = best_svm.predict(X_test_selected)

# Pair every test-row ID with its predicted label
submission_dataframe = pd.DataFrame({'ID': df_test['ID'], 'spam': y_pred_test})

# Save the submission dataframe to a CSV file
# (overwrites any existing submission.csv in the working directory)
submission_dataframe.to_csv('submission.csv', index=False)


Accuracy on training set: 0.9211180124223602
              precision    recall  f1-score   support

           0       0.94      0.93      0.93      1951
           1       0.89      0.91      0.90      1269

    accuracy                           0.92      3220
   macro avg       0.92      0.92      0.92      3220
weighted avg       0.92      0.92      0.92      3220



In [3]:
from pathlib import Path

import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Location of the competition CSV files
data_dir = Path('../anaconda3/just-ai-249-machine-learning-lab-competition')

# Load the data
df_train = pd.read_csv(data_dir / 'spambase_train.csv')
df_test = pd.read_csv(data_dir / 'spambase_test.csv')

# Preprocessing: separate the target and drop the row identifier,
# which carries no predictive information.
X_train = df_train.drop(columns=['spam', 'ID'])
y_train = df_train["spam"]
X_test = df_test.drop(columns=['ID'])

# Normalize the data (statistics are learned from the training set only)
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# Feature selection with manually adjusted parameters
n_features_to_select = 15  # try different values, e.g., 10, 15, 20
step = 20                  # features removed per RFE iteration; try e.g. 10, 20, 30
estimator = SVC(kernel='linear')
selector = RFE(estimator, n_features_to_select=n_features_to_select, step=step)
X_train_selected = selector.fit_transform(X_train_s, y_train)
X_test_selected = selector.transform(X_test_s)

# Train SVM with manually adjusted parameters
C = 40                    # try different values, e.g., 0.1, 1, 10, 40
gamma = 10                # try different values, e.g., 0.01, 0.1, 1, 10
best_svm = SVC(kernel='rbf', C=C, gamma=gamma)
best_svm.fit(X_train_selected, y_train)

# Evaluate the model on the training set.
# This is an optimistic number: the model has already seen these rows, so it
# must not be used to choose between hyperparameter settings.
y_pred_train = best_svm.predict(X_train_selected)

# Compute accuracy
accuracy_train = accuracy_score(y_train, y_pred_train)
print(f"Accuracy on training set: {accuracy_train}")

# Generate classification report
report_train = classification_report(y_train, y_pred_train)
print(report_train)

# Honest generalisation estimate: cross-validate the *whole* procedure.
# Scaling and feature selection sit inside the pipeline so they are re-fit on
# each training fold and the held-out fold never influences them. Compare
# hyperparameter settings using this score.
cv_folds = 5
cv_pipeline = make_pipeline(
    StandardScaler(),
    RFE(SVC(kernel='linear'), n_features_to_select=n_features_to_select, step=step),
    SVC(kernel='rbf', C=C, gamma=gamma),
)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train, cv=cv_folds, scoring='accuracy')
print(f"{cv_folds}-fold cross-validated accuracy: {cv_scores.mean():.4f} (+/- {cv_scores.std():.4f})")

# Make predictions on the test set
y_pred_test = best_svm.predict(X_test_selected)

# Pair every test-row ID with its predicted label
submission_dataframe = pd.DataFrame({'ID': df_test['ID'], 'spam': y_pred_test})

# Save the submission dataframe to a CSV file
# (overwrites any existing submission.csv in the working directory)
submission_dataframe.to_csv('submission.csv', index=False)


Accuracy on training set: 0.9496894409937888
              precision    recall  f1-score   support

           0       0.94      0.98      0.96      1951
           1       0.97      0.90      0.93      1269

    accuracy                           0.95      3220
   macro avg       0.95      0.94      0.95      3220
weighted avg       0.95      0.95      0.95      3220

