In [7]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load Breast Cancer Wisconsin dataset
data = pd.read_csv('train_final2.csv')
X = data.drop(['DiagPeriodL90D'], axis=1)
y = data['DiagPeriodL90D']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Principal Component Analysis (PCA)
pca = PCA(n_components=25)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=25)  # Choose the number of components
X_train_svd = svd.fit_transform(X_train_scaled)
X_test_svd = svd.transform(X_test_scaled)

# Chi-Square (Chi2)
chi2_selector = SelectKBest(chi2, k=25)  # Choose the number of top features (k)
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Train a classifier on each feature selection method
clf_pca = RandomForestClassifier(random_state=42)
clf_pca.fit(X_train_pca, y_train)
y_pred_pca = clf_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f'Accuracy with PCA: {accuracy_pca}')

clf_svd = RandomForestClassifier(random_state=42)
clf_svd.fit(X_train_svd, y_train)
y_pred_svd = clf_svd.predict(X_test_svd)
accuracy_svd = accuracy_score(y_test, y_pred_svd)
print(f'Accuracy with SVD: {accuracy_svd}')

clf_chi2 = RandomForestClassifier(random_state=42)
clf_chi2.fit(X_train_chi2, y_train)
y_pred_chi2 = clf_chi2.predict(X_test_chi2)
accuracy_chi2 = accuracy_score(y_test, y_pred_chi2)
print(f'Accuracy with Chi2: {accuracy_chi2}')


Accuracy with PCA: 0.6835786212238575
Accuracy with SVD: 0.6897753679318358
Accuracy with Chi2: 0.6700232378001549


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD, PCA
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Step 1: Read the dataset
df = pd.read_csv('train_final2.csv')

# Step 2: Identify features related to Fine Needle Aspiration (FNA)
# Assuming 'FNA_features' is a list of features related to FNA
# Step 2: Identify features related to Fine Needle Aspiration (FNA)
FNA_features = df.columns[df.columns != 'DiagPeriodL90D'].tolist()
  # Define your FNA features

# Subset the DataFrame with FNA features
df_fna = df[FNA_features + ['DiagPeriodL90D']]  # Include target variable if needed

# Step 3: Optionally balance the dataset using SMOTE
# Split the data into features (X) and target variable (y)
X = df_fna.drop(['DiagPeriodL90D'], axis=1)
y = df_fna['DiagPeriodL90D']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Apply dimensionality reduction techniques
# SVD
svd = TruncatedSVD(n_components=20)  # Adjust the number of components as needed
X_svd = svd.fit_transform(X_resampled)

# PCA
pca = PCA(n_components=20)  # Adjust the number of components as needed
X_pca = pca.fit_transform(X_resampled)

# Chi-square
selector = SelectKBest(chi2, k=20)  # Adjust k as needed
X_chi2 = selector.fit_transform(X_resampled, y_resampled)

# Step 5: Apply RS-SVM
# Define RS-SVM pipeline
rs_svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# Train RS-SVM on the original dataset
rs_svm_pipeline.fit(X_resampled, y_resampled)

# Get feature importances from RS-SVM
rs_svm_importances = rs_svm_pipeline.named_steps['svc'].coef_

# Optionally, you can use feature importances to select top features.

# Evaluate the accuracy of the RS-SVM model
accuracy_rs_svm = accuracy_score(y_resampled, rs_svm_pipeline.predict(X_resampled))
print("Accuracy of RS-SVM model:", accuracy_rs_svm)


Accuracy of RS-SVM model: 0.6885235732009926


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = pd.read_csv('train_final2.csv')
X = data.drop(['DiagPeriodL90D'], axis=1)
y = data['DiagPeriodL90D']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Principal Component Analysis (PCA)
pca = PCA(n_components= 50)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=48)  # Choose the number of components
X_train_svd = svd.fit_transform(X_train_scaled)
X_test_svd = svd.transform(X_test_scaled)

# Chi-Square (Chi2)
chi2_selector = SelectKBest(chi2, k=49)  # Choose the number of top features (k)
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Initialize classifiers
classifiers = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42),
    'LGBM': LGBMClassifier(random_state=42),
    'XGBoost': XGBClassifier(random_state=42)
}


# Initialize variables to keep track of the best classifier and its accuracy
best_clf_name = None
best_accuracy = 0

# Train models and keep track of the best one
for method, X_train_method, X_test_method in zip(['PCA', 'SVD', 'Chi2'],
                                                 [X_train_pca, X_train_svd, X_train_chi2],
                                                 [X_test_pca, X_test_svd, X_test_chi2]):
    print(f"Results with {method}:")
    for clf_name, clf in classifiers.items():
        clf.fit(X_train_method, y_train)
        y_pred = clf.predict(X_test_method)
        accuracy = accuracy_score(y_test, y_pred)
        print(f'{clf_name} - Accuracy: {accuracy}')

        # Update the best classifier if the current one has higher accuracy
        if accuracy > best_accuracy:
            best_clf_name = clf_name
            best_accuracy = accuracy
    print()

# Load test dataset
test_data = pd.read_csv('test_final2.csv')
# Use the best classifier to make predictions on the test data
best_clf = classifiers[best_clf_name]
best_clf.fit(X_train, y_train)  # Train on the entire training data

# Use the best classifier to predict probabilities on the test data
test_prob_predictions = best_clf.predict_proba(test_data)[:, 1]

# Round off the probabilities to 1 decimal place
rounded_prob_predictions = [round(prob, 1) for prob in test_prob_predictions]

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'patient_id': test_data['patient_id'],
    'DiagPeriodL90D': rounded_prob_predictions
})

# Write the submission DataFrame to a CSV file
# submission_df.to_csv('submission_FS3.csv', index=False)
submission_df.to_csv('submission_FS3_2.csv', index=False)


# Download the submission file
from google.colab import files
# files.download('submission_FS3.csv')
files.download('submission_FS3_2.csv')

Results with PCA:
RandomForest - Accuracy: 0.7099147947327653
GradientBoosting - Accuracy: 0.7327652982184353
[LightGBM] [Info] Number of positive: 6443, number of negative: 3881
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004114 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 12750
[LightGBM] [Info] Number of data points in the train set: 10324, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.624080 -> initscore=0.506901
[LightGBM] [Info] Start training from score 0.506901
LGBM - Accuracy: 0.7331525948876839
XGBoost - Accuracy: 0.710302091402014

Results with SVD:
RandomForest - Accuracy: 0.7013942680092952
GradientBoosting - Accuracy: 0.7300542215336948
[LightGBM] [Info] Number of positive: 6443, number of negative: 3881
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004230 seconds.
You can set `force_col_wise=true` to remo

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- DiagPeriodL90D
