In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, precision_score, recall_score, f1_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
import warnings
# Silence every Python warning for the rest of the session. Note that this also
# hides scikit-learn deprecation and convergence warnings.
warnings.filterwarnings("ignore")
# Seed passed to the cross-validation splitter and the random forests below.
RANDOM_STATE = 42
# Default missing-value marker for load_and_preprocess, which converts any
# entry greater than SENTINEL / 10 to NaN before imputing.
SENTINEL = 1e99

In [2]:
# ========================================
# 1. Load datasets and preprocess
# ========================================
def load_and_preprocess(train_data_file, train_label_file, test_data_file, sentinel=SENTINEL):
    """
    Load one train/test dataset and mean-impute its missing entries.

    Entries greater than ``sentinel / 10`` are treated as missing. Every
    missing entry (train and test) is filled with the mean of its column
    computed on the training set only, so no test-set statistics are used.
    A column with no observed training value has no mean; it is filled with
    0.0 rather than being left as NaN.

    Parameters
    ----------
    train_data_file, train_label_file, test_data_file :
        Inputs accepted by ``np.loadtxt`` (default whitespace delimiter).
    sentinel : float
        Marker value used for missing entries in the data files.

    Returns
    -------
    X : ndarray
        Imputed training features, shape (n_train, n_features).
    y : ndarray of int
        Training labels as loaded from ``train_label_file``.
    test_df : pandas.DataFrame
        Imputed test features.

    Raises
    ------
    ValueError
        If the number of labels differs from the number of training rows, or
        if train and test do not have the same number of columns.
    """
    # ndmin=2 keeps the (n_samples, n_features) layout even when a file holds
    # a single row; without it loadtxt returns a 1-D array that would be
    # turned into a one-column frame of the wrong shape.
    train_X = np.loadtxt(train_data_file, ndmin=2)
    train_y = np.loadtxt(train_label_file, ndmin=1).astype(int)
    test_X = np.loadtxt(test_data_file, ndmin=2)

    if train_y.shape[0] != train_X.shape[0]:
        raise ValueError(
            f"{train_y.shape[0]} labels for {train_X.shape[0]} training rows"
        )
    if test_X.shape[1] != train_X.shape[1]:
        raise ValueError(
            f"train has {train_X.shape[1]} columns but test has {test_X.shape[1]}"
        )

    # Replace sentinel with NaN
    threshold = sentinel / 10
    train_df = pd.DataFrame(np.where(train_X > threshold, np.nan, train_X))
    test_df = pd.DataFrame(np.where(test_X > threshold, np.nan, test_X))

    # Impute with column means from the training set; a column that is entirely
    # missing in train has a NaN mean, so fall back to 0.0 for it.
    col_means = train_df.mean().fillna(0.0)
    train_df = train_df.fillna(col_means)
    test_df = test_df.fillna(col_means)

    X = train_df.values
    y = train_y
    return X, y, test_df

In [3]:
# ========================================
# 2. Load and preprocess dataset 1
# ========================================
# Dataset 1: imputed training features, integer labels and imputed test frame.
X, y, test_df = load_and_preprocess(
    train_data_file="./classification/TrainData1.txt",
    train_label_file="./classification/TrainLabel1.txt",
    test_data_file="./classification/TestData1.txt",
)

In [4]:
# ===============================
# 3. Define PCA + SVM pipeline
# ===============================

# Standardise -> PCA -> linear-kernel SVM, chained so that scaling and PCA are
# re-fitted on whatever data the pipeline is fitted on.
pipe_svm = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        # a float n_components keeps enough components for 95% of the variance
        ("pca", PCA(n_components=0.95)),
        ("svm", SVC(kernel="linear")),
    ]
)

In [5]:
# ===============================
# 4. Define PCA + Logistic Regression pipeline
# ===============================

# Standardise -> PCA -> logistic regression.
pipe_lr = Pipeline([
    ("scaler", StandardScaler()),
    ("pca", PCA(n_components=0.95)),  # keep 95% variance
    # `multi_class` is intentionally not passed: "auto" was already the default,
    # and the parameter is deprecated since scikit-learn 1.5, so naming it only
    # adds a deprecation warning now and breaks once the parameter is removed.
    ("logreg", LogisticRegression(max_iter=500))
])


In [6]:
# ===============================
# 5. Metrics
# ===============================

# Report name -> scikit-learn scorer string; precision, recall and F1 use the
# macro-averaged scorers.
scoring_metrics = dict(
    accuracy="accuracy",
    precision="precision_macro",
    recall="recall_macro",
    f1="f1_macro",
)

# Shuffled, stratified 5-fold splitter with a fixed seed.
cv = StratifiedKFold(shuffle=True, n_splits=5, random_state=RANDOM_STATE)


In [7]:
# ===============================
# 5. Compute metrics
# ===============================

# Score every metric from a single cross-validation run per pipeline.
# cross_validate fits each fold once and evaluates all scorers on it, instead
# of refitting the whole pipeline once per metric as cross_val_score would.
svm_cv = cross_validate(pipe_svm, X, y, cv=cv, scoring=scoring_metrics)
svm_results = {m: svm_cv[f"test_{m}"] for m in scoring_metrics}

lr_cv = cross_validate(pipe_lr, X, y, cv=cv, scoring=scoring_metrics)
lr_results = {m: lr_cv[f"test_{m}"] for m in scoring_metrics}

In [8]:
# ===============================
# 6. Print results
# ===============================

def print_metrics(name, results):
    """
    Print a per-metric cross-validation summary for one model.

    Parameters
    ----------
    name : str
        Model label, shown twice in the header line.
    results : dict
        Maps a metric name to an array of per-fold scores (anything exposing
        ``.mean()`` and ``.std()``).
    """
    print(f"\n=== {name} (PCA + {name}) ===")
    for metric, fold_scores in results.items():
        label = metric.capitalize()
        avg = fold_scores.mean()
        spread = fold_scores.std()
        print(f"{label:<10}: mean={avg:.4f}, std={spread:.4f}, folds={fold_scores}")

# Report the SVM pipeline first, then the logistic-regression pipeline.
for _label, _scores in (("SVM", svm_results), ("Logistic Regression", lr_results)):
    print_metrics(_label, _scores)


=== SVM (PCA + SVM) ===
Accuracy  : mean=0.9733, std=0.0249, folds=[1.         0.93333333 1.         0.96666667 0.96666667]
Precision : mean=0.8974, std=0.0936, folds=[1.         0.90530303 1.         0.79090909 0.79090909]
Recall    : mean=0.9011, std=0.0895, folds=[1.         0.90530303 1.         0.8        0.8       ]
F1        : mean=0.8992, std=0.0916, folds=[1.         0.90530303 1.         0.79534884 0.79534884]

=== Logistic Regression (PCA + Logistic Regression) ===
Accuracy  : mean=0.9600, std=0.0327, folds=[0.96666667 0.9        1.         0.96666667 0.96666667]
Precision : mean=0.8848, std=0.0925, folds=[0.98913043 0.85326087 1.         0.79090909 0.79090909]
Recall    : mean=0.8594, std=0.0755, folds=[0.875     0.8219697 1.        0.8       0.8      ]
F1        : mean=0.8670, std=0.0788, folds=[0.91111111 0.83333333 1.         0.79534884 0.79534884]


In [9]:
# ===========================================================
# 7. Fit SVM models fully + predict test
# ===========================================================
# Fit the SVM pipeline on all of dataset 1, label the test rows, and write one
# integer label per line.
svm_pred = pipe_svm.fit(X, y).predict(test_df.to_numpy())
np.savetxt("./classification/Dao_Corona_Classification1.txt", svm_pred, fmt="%d")

In [10]:
# ========================================
# 8. Load and preprocess dataset 2
# ========================================
# Dataset 2: imputed training features, integer labels and imputed test frame.
X2, y2, test2_df = load_and_preprocess(
    train_data_file="./classification/TrainData2.txt",
    train_label_file="./classification/TrainLabel2.txt",
    test_data_file="./classification/TestData2.txt",
)

In [11]:
# Cross-validate both PCA pipelines on dataset 2.
# cross_validate fits each fold once and evaluates all scorers on it, instead
# of refitting the whole pipeline once per metric as cross_val_score would.
svm_cv = cross_validate(pipe_svm, X2, y2, cv=cv, scoring=scoring_metrics)
svm_results = {m: svm_cv[f"test_{m}"] for m in scoring_metrics}

lr_cv = cross_validate(pipe_lr, X2, y2, cv=cv, scoring=scoring_metrics)
lr_results = {m: lr_cv[f"test_{m}"] for m in scoring_metrics}

In [12]:
# print_metrics is already defined in the "Print results" cell above; reuse it
# here instead of redefining an identical copy that would silently shadow it.
print_metrics("SVM", svm_results)
print_metrics("Logistic Regression", lr_results)


=== SVM (PCA + SVM) ===
Accuracy  : mean=0.8900, std=0.0735, folds=[0.95 0.9  0.75 0.95 0.9 ]
Precision : mean=0.9121, std=0.0713, folds=[0.95454545 0.93939394 0.77272727 0.96969697 0.92424242]
Recall    : mean=0.8939, std=0.0852, folds=[0.95454545 0.92424242 0.72727273 0.95454545 0.90909091]
F1        : mean=0.8782, std=0.0944, folds=[0.93939394 0.91515152 0.69393939 0.95151515 0.89090909]

=== Logistic Regression (PCA + Logistic Regression) ===
Accuracy  : mean=0.9200, std=0.0400, folds=[0.95 0.9  0.85 0.95 0.95]
Precision : mean=0.9455, std=0.0281, folds=[0.95454545 0.93939394 0.89393939 0.96969697 0.96969697]
Recall    : mean=0.9303, std=0.0353, folds=[0.95454545 0.92424242 0.86363636 0.95454545 0.95454545]
F1        : mean=0.9200, std=0.0410, folds=[0.93939394 0.91515152 0.84242424 0.95151515 0.95151515]


In [13]:
# ===========================================================
# 9. Fit Logistic Regression model fully + predict test
# ===========================================================
# Fit the logistic-regression pipeline on all of dataset 2, label the test
# rows, and write one integer label per line.
lr_pred = pipe_lr.fit(X2, y2).predict(test2_df.to_numpy())
np.savetxt("./classification/Dao_Corona_Classification2.txt", lr_pred, fmt="%d")

In [14]:
# Dataset 3: imputed training features, integer labels and imputed test frame.
X3, y3, test3_df = load_and_preprocess(
    train_data_file="./classification/TrainData3.txt",
    train_label_file="./classification/TrainLabel3.txt",
    test_data_file="./classification/TestData3.txt",
)

# Dataset 4: same loading and imputation steps.
X4, y4, test4_df = load_and_preprocess(
    train_data_file="./classification/TrainData4.txt",
    train_label_file="./classification/TrainLabel4.txt",
    test_data_file="./classification/TestData4.txt",
)

In [15]:
# ==================
# Define models
# ==================
# Candidate classifiers, keyed by the label used in the printed reports.
# The SVM, logistic regression and KNN are wrapped in a pipeline with
# StandardScaler; the random forest is used on the raw features.
models = {
    "SVM_RBF": Pipeline([
        ("scaler", StandardScaler()),
        ("svm", SVC(kernel="rbf", C=1.0, gamma="scale"))
    ]),

    "RandomForest": RandomForestClassifier(
        n_estimators=300,
        max_depth=None,
        min_samples_split=2,
        min_samples_leaf=1,
        n_jobs=-1,
        random_state=RANDOM_STATE
    ),

    "LogisticRegression": Pipeline([
        ("scaler", StandardScaler()),
        # `multi_class` is intentionally not passed: it is deprecated since
        # scikit-learn 1.5, and with the lbfgs solver the multinomial loss is
        # already what the default selects for multiclass targets.
        ("logreg", LogisticRegression(
            max_iter=1000,
            solver="lbfgs"
        ))
    ]),

    "KNN": Pipeline([
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=5))
    ])
}

In [16]:
# Cross-validated scores on dataset 3: results_3[model name][metric] -> fold scores.
results_3 = {}

for name, model in models.items():
    print(f"\n=== Dataset 3: {name} ===")
    # One cross_validate call evaluates every scorer on the same fitted folds;
    # calling cross_val_score once per metric would refit each model four times.
    cv_res = cross_validate(model, X3, y3, cv=cv, scoring=scoring_metrics)

    model_results = {}
    for metric_name in scoring_metrics:
        scores = cv_res[f"test_{metric_name}"]
        model_results[metric_name] = scores
        print(f"{metric_name.capitalize():<10} mean={scores.mean():.4f}, "
              f"std={scores.std():.4f}, folds={scores}")

    results_3[name] = model_results



=== Dataset 3: SVM_RBF ===
Accuracy   mean=0.9144, std=0.0077, folds=[0.91372549 0.90784314 0.90569745 0.91748527 0.92730845]
Precision  mean=0.9149, std=0.0076, folds=[0.91391909 0.90836261 0.90628337 0.91821221 0.92757996]
Recall     mean=0.9138, std=0.0078, folds=[0.91340227 0.9071333  0.90468655 0.91705867 0.92653686]
F1         mean=0.9138, std=0.0079, folds=[0.91335969 0.90717709 0.90442757 0.91711319 0.92672013]

=== Dataset 3: RandomForest ===
Accuracy   mean=0.9639, std=0.0045, folds=[0.95882353 0.95882353 0.96660118 0.96463654 0.97053045]
Precision  mean=0.9650, std=0.0045, folds=[0.95964088 0.96057428 0.96776463 0.96543575 0.97180993]
Recall     mean=0.9636, std=0.0045, folds=[0.95884553 0.95846298 0.96642908 0.96435085 0.97009681]
F1         mean=0.9638, std=0.0045, folds=[0.95895719 0.95873082 0.96633412 0.96435082 0.97046351]

=== Dataset 3: LogisticRegression ===
Accuracy   mean=0.8547, std=0.0137, folds=[0.8372549  0.85882353 0.84675835 0.87819253 0.85265226]
Precision

In [17]:
# Cross-validated scores on dataset 4: results_4[model name][metric] -> fold scores.
results_4 = {}

for name, model in models.items():
    print(f"\n=== Dataset 4: {name} ===")
    cv_res = cross_validate(model, X4, y4, cv=cv, scoring=scoring_metrics)

    # cross_validate stores each scorer's fold scores under "test_<metric>".
    model_results = {
        metric_name: cv_res[f"test_{metric_name}"]
        for metric_name in scoring_metrics
    }
    for metric_name, scores in model_results.items():
        print(f"{metric_name.capitalize():<10} mean={scores.mean():.4f}, "
              f"std={scores.std():.4f}, folds={scores}")

    results_4[name] = model_results


=== Dataset 4: SVM_RBF ===
Accuracy   mean=0.6014, std=0.0226, folds=[0.56696429 0.61160714 0.625      0.62053571 0.58295964]
Precision  mean=0.2991, std=0.0214, folds=[0.30747126 0.27257816 0.32331254 0.31756509 0.27461859]
Recall     mean=0.2790, std=0.0111, folds=[0.27862353 0.26869019 0.29080547 0.29217124 0.26493147]
F1         mean=0.2802, std=0.0142, folds=[0.28462838 0.26329998 0.29448683 0.29522914 0.26346932]

=== Dataset 4: RandomForest ===
Accuracy   mean=0.6792, std=0.0295, folds=[0.63392857 0.70982143 0.69642857 0.70089286 0.65470852]
Precision  mean=0.4120, std=0.0755, folds=[0.34167448 0.35424622 0.35781584 0.52136192 0.48495997]
Recall     mean=0.3569, std=0.0298, folds=[0.30612205 0.35800687 0.3489812  0.3943626  0.37703535]
F1         mean=0.3692, std=0.0396, folds=[0.31370688 0.35490318 0.35014545 0.42494063 0.40224015]

=== Dataset 4: LogisticRegression ===
Accuracy   mean=0.5996, std=0.0167, folds=[0.58035714 0.58482143 0.61160714 0.625      0.59641256]
Precision

In [18]:
# Random forest used for the final dataset 3 and dataset 4 predictions
# (same hyper-parameters as the "RandomForest" entry of `models`).
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)

# ===== Dataset 3: train on the full training set, predict test3, save =====
test3_pred = rf.fit(X3, y3).predict(test3_df.to_numpy())
np.savetxt("./classification/Dao_Corona_Classification3.txt", test3_pred, fmt="%d")

# ===== Dataset 4: fit the same estimator object again on the full training
# set, predict test4, save =====
test4_pred = rf.fit(X4, y4).predict(test4_df.to_numpy())
np.savetxt("./classification/Dao_Corona_Classification4.txt", test4_pred, fmt="%d")