In [24]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, f1_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
from scipy.stats import uniform, loguniform
from sklearn.base import clone
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from scipy.stats import expon, randint

In [78]:
# Load data
data = pd.read_csv("train.csv", index_col=0)
data = data.drop(['index'], axis=1)

# Replace negative values with zero for specified columns
for column in ['AFP (ng/mL)', 'ALT (U/L)', 'AST (U/L)']:
    data[column] = data[column].apply(lambda x: max(x, 0))

# Separate features and target
X = data.drop('Diagnosis', axis=1)
y = data['Diagnosis']

# Identify column types
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include=['object']).columns

# Preprocessing steps
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),  # Apply standard scaling to numerical columns
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)  # Apply one-hot encoding to categorical columns
])

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
# SVM Pipeline
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', SFS(SVC(kernel='rbf', random_state=42), 
                             k_features=(5, 20), forward=True, floating=False, 
                             scoring='f1_macro', cv=5)),
    ('classifier', RandomizedSearchCV(SVC(kernel='rbf', random_state=42),
                                      {'C': expon(scale=1),  # Changed distribution for C
                                       'gamma': ['scale', 'auto'] + list(expon(scale=0.1).rvs(size=20))},  # Increased range and sample size for gamma
                                      n_iter=200,  # Increased iterations
                                      cv=10, scoring='f1_macro', verbose= 1,
                                      random_state=42, n_jobs=-1))
                                      ])

# Step 1: Fit the pipeline
svm_pipeline.fit(X_train, y_train)

# Step 2: Evaluate the model
y_pred_svm = svm_pipeline.predict(X_test)
f1_svm = f1_score(y_test, y_pred_svm, average='macro')
print("F1-Score (Macro):", f1_svm)

# Step 3: Extract best parameters
# The best parameters are associated with the 'classifier' step in the pipeline
best_params_svm = svm_pipeline.named_steps['classifier'].best_params_
print("Best Parameters:", best_params_svm)




Fitting 10 folds for each of 200 candidates, totalling 2000 fits
F1-Score (Macro): 0.7258241111670093
Best Parameters: {'C': 1.46340875581733, 'gamma': 0.10396290991895787}


In [66]:
# Logistic Regression Pipeline
logreg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('feature_selector', SFS(LogisticRegression(random_state=42), 
                             k_features=(5, 20), forward=True, floating=False, 
                             scoring='f1_macro', cv=5)),
    ('classifier', RandomizedSearchCV(LogisticRegression(random_state=42),
                                      {'C': loguniform(1e-4, 1e2), 'penalty': ['l1', 'l2']},
                                      n_iter=50, cv=5, scoring='f1_macro', verbose=1, 
                                      random_state=42, n_jobs=-1))
])

# Step 1: Fit the pipeline
logreg_pipeline.fit(X_train, y_train)

# Step 2: Evaluate the model
y_pred_lr = logreg_pipeline.predict(X_test)
f1_lr = f1_score(y_test, y_pred_lr, average='macro')
print("F1-Score (Macro):", f1_lr)

# Step 3: Extract best parameters
# The best parameters are associated with the 'classifier' step in the pipeline
best_params_lr = logreg_pipeline.named_steps['classifier'].best_params_
print("Best Parameters:", best_params_lr)



Fitting 5 folds for each of 50 candidates, totalling 250 fits
F1-Score (Macro): 0.7113108157868614
Best Parameters: {'C': 0.944351568796268, 'penalty': 'l2'}


90 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
90 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\base.py", line 1351, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\linear_model\_logistic.py", line 1172, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\Kristian Røh

In [80]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import f1_score
from scipy.stats import randint

# Random Forest Pipeline
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Include any preprocessing steps if needed
    ('feature_selector', SFS(SVC(kernel='rbf', random_state=42), 
                             k_features=(5, 20), forward=True, floating=False, 
                             scoring='f1_macro', cv=5)),
    ('classifier', RandomizedSearchCV(RandomForestClassifier(random_state=42),
                                      {
                                          'n_estimators': randint(10, 200),  # Number of trees in the forest
                                          'max_features': ['auto', 'sqrt'],  # Number of features to consider at every split
                                          'max_depth': [None] + list(randint(3, 30).rvs(size=20)),  # Maximum depth of the tree
                                          'min_samples_split': randint(2, 20),  # Minimum number of samples required to split an internal node
                                          'min_samples_leaf': randint(1, 20),  # Minimum number of samples required to be at a leaf node
                                          'bootstrap': [True, False]  # Method of selecting samples for training each tree
                                      },
                                      n_iter=100,  # Number of parameter settings that are sampled
                                      cv= 10,  # Number of folds for cross-validation
                                      scoring='f1_macro',  # Evaluation metric
                                      verbose=1,
                                      random_state=42,
                                      n_jobs=-1)
    )
])

# Step 1: Fit the pipeline
rf_pipeline.fit(X_train, y_train)

# Step 2: Evaluate the model
y_pred_rf = rf_pipeline.predict(X_test)
f1_rf = f1_score(y_test, y_pred_rf, average='macro')
print("F1-Score (Macro):", f1_rf)

# Step 3: Extract best parameters
# The best parameters are associated with the 'classifier' step in the pipeline
best_params_rf = rf_pipeline.named_steps['classifier'].best_params_
print("Best Parameters:", best_params_rf)




Fitting 10 folds for each of 100 candidates, totalling 1000 fits
F1-Score (Macro): 0.7618066559957316
Best Parameters: {'bootstrap': False, 'max_depth': 21, 'max_features': 'sqrt', 'min_samples_leaf': 5, 'min_samples_split': 5, 'n_estimators': 43}


460 fits failed out of a total of 1000.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
381 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sk

In [45]:
# Fit the models
svm_pipeline.fit(X, y)



Fitting 10 folds for each of 200 candidates, totalling 2000 fits


In [23]:
logreg_pipeline.fit(X, y)


STOPPING EARLY DUE TO KEYBOARD INTERRUPT...

AttributeError: SequentialFeatureSelector has not been fitted, yet.

In [71]:
rf_pipeline.fit(X, y)



Fitting 5 folds for each of 100 candidates, totalling 500 fits


230 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
167 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\model_selection\_validation.py", line 890, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\base.py", line 1344, in wrapper
    estimator._validate_params()
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\Kristian Røhne\anaconda3\envs\dat200_env\lib\site-packages\skl

In [72]:
# Evaluate and save results as needed
X_test = pd.read_csv("test.csv").drop(['index'], axis=1)
svm_predictions = svm_pipeline.predict(X_test)
logreg_predictions = logreg_pipeline.predict(X_test)
rf_predictions= rf_pipeline.predict(X_test)

# Save predictions to CSV files
pd.DataFrame({'index': X_test.index, 'Diagnosis': svm_predictions}).to_csv('svm_predictions.csv', index=False)
pd.DataFrame({'index': X_test.index, 'Diagnosis': logreg_predictions}).to_csv('logreg_predictions.csv', index=False)
pd.DataFrame({'index': X_test.index, 'Diagnosis': rf_predictions}).to_csv('rf_predictions.csv', index=False)