In [10]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load Breast Cancer Wisconsin dataset
data = pd.read_csv('train_final2.csv')
X = data.drop(['DiagPeriodL90D'], axis=1)
y = data['DiagPeriodL90D']


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Principal Component Analysis (PCA)
pca = PCA(n_components=50)  # Choose the number of components
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)

# Singular Value Decomposition (SVD)
svd = TruncatedSVD(n_components=50)  # Choose the number of components
X_train_svd = svd.fit_transform(X_train_scaled)
X_test_svd = svd.transform(X_test_scaled)

# Chi-Square (Chi2)
chi2_selector = SelectKBest(chi2, k=50)  # Choose the number of top features (k)
X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
X_test_chi2 = chi2_selector.transform(X_test)

# Train a classifier on each feature selection method
clf_pca = RandomForestClassifier(random_state=42)
clf_pca.fit(X_train_pca, y_train)
y_pred_pca = clf_pca.predict(X_test_pca)
accuracy_pca = accuracy_score(y_test, y_pred_pca)
print(f'Accuracy with PCA: {accuracy_pca}')

clf_svd = RandomForestClassifier(random_state=42)
clf_svd.fit(X_train_svd, y_train)
y_pred_svd = clf_svd.predict(X_test_svd)
accuracy_svd = accuracy_score(y_test, y_pred_svd)
print(f'Accuracy with SVD: {accuracy_svd}')

clf_chi2 = RandomForestClassifier(random_state=42)
clf_chi2.fit(X_train_chi2, y_train)
y_pred_chi2 = clf_chi2.predict(X_test_chi2)
accuracy_chi2 = accuracy_score(y_test, y_pred_chi2)
print(f'Accuracy with Chi2: {accuracy_chi2}')


Accuracy with PCA: 0.7075910147172735
Accuracy with SVD: 0.7106893880712626
Accuracy with Chi2: 0.726568551510457


In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
data = pd.read_csv('train_final2.csv')
X = data.drop(['DiagPeriodL90D'], axis=1)
y = data['DiagPeriodL90D']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Create a DataFrame to store accuracies
results = pd.DataFrame(columns=['Method', 'n_components', 'Accuracy'])

# Principal Component Analysis (PCA)
for n in range(1, X.shape[1]+1):
    pca = PCA(n_components=n)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)

    clf_pca = RandomForestClassifier(random_state=42)
    clf_pca.fit(X_train_pca, y_train)
    y_pred_pca = clf_pca.predict(X_test_pca)
    accuracy_pca = accuracy_score(y_test, y_pred_pca)

    results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)

# Singular Value Decomposition (SVD)
for n in range(1, X.shape[1]+1):
    svd = TruncatedSVD(n_components=n)
    X_train_svd = svd.fit_transform(X_train_scaled)
    X_test_svd = svd.transform(X_test_scaled)

    clf_svd = RandomForestClassifier(random_state=42)
    clf_svd.fit(X_train_svd, y_train)
    y_pred_svd = clf_svd.predict(X_test_svd)
    accuracy_svd = accuracy_score(y_test, y_pred_svd)

    results = results.append({'Method': 'SVD', 'n_components': n, 'Accuracy': accuracy_svd}, ignore_index=True)

# Chi-Square (Chi2)
for n in range(1, X.shape[1]+1):
    chi2_selector = SelectKBest(chi2, k=n)
    X_train_chi2 = chi2_selector.fit_transform(X_train, y_train)
    X_test_chi2 = chi2_selector.transform(X_test)

    clf_chi2 = RandomForestClassifier(random_state=42)
    clf_chi2.fit(X_train_chi2, y_train)
    y_pred_chi2 = clf_chi2.predict(X_test_chi2)
    accuracy_chi2 = accuracy_score(y_test, y_pred_chi2)

    results = results.append({'Method': 'Chi2', 'n_components': n, 'Accuracy': accuracy_chi2}, ignore_index=True)


pd.set_option('display.max_rows', None)

print(results)


  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results = results.append({'Method': 'PCA', 'n_components': n, 'Accuracy': accuracy_pca}, ignore_index=True)
  results 

    Method n_components  Accuracy
0      PCA            1  0.543377
1      PCA            2  0.578234
2      PCA            3  0.576685
3      PCA            4  0.579009
4      PCA            5  0.586367
5      PCA            6  0.595662
6      PCA            7  0.661890
7      PCA            8  0.670798
8      PCA            9  0.673509
9      PCA           10  0.669636
10     PCA           11  0.675058
11     PCA           12  0.675833
12     PCA           13  0.668861
13     PCA           14  0.664988
14     PCA           15  0.674671
15     PCA           16  0.668861
16     PCA           17  0.683191
17     PCA           18  0.692486
18     PCA           19  0.691712
19     PCA           20  0.685902
20     PCA           21  0.686290
21     PCA           22  0.693261
22     PCA           23  0.689388
23     PCA           24  0.688613
24     PCA           25  0.694036
25     PCA           26  0.685128
26     PCA           27  0.695198
27     PCA           28  0.681642
28     PCA    

  results = results.append({'Method': 'Chi2', 'n_components': n, 'Accuracy': accuracy_chi2}, ignore_index=True)


In [12]:
# Find the maximum accuracy for each method
max_accuracy_pca = results[results['Method'] == 'PCA']['Accuracy'].max()
max_accuracy_svd = results[results['Method'] == 'SVD']['Accuracy'].max()
max_accuracy_chi2 = results[results['Method'] == 'Chi2']['Accuracy'].max()

# Print the n_components value for maximum accuracy for each method
n_components_max_accuracy_pca = results[(results['Method'] == 'PCA') & (results['Accuracy'] == max_accuracy_pca)]['n_components'].values[0]
n_components_max_accuracy_svd = results[(results['Method'] == 'SVD') & (results['Accuracy'] == max_accuracy_svd)]['n_components'].values[0]
n_components_max_accuracy_chi2 = results[(results['Method'] == 'Chi2') & (results['Accuracy'] == max_accuracy_chi2)]['n_components'].values[0]

print("For PCA, n_components with maximum accuracy:", n_components_max_accuracy_pca)
print("For SVD, n_components with maximum accuracy:", n_components_max_accuracy_svd)
print("For Chi2, n_components with maximum accuracy:", n_components_max_accuracy_chi2)


For PCA, n_components with maximum accuracy: 50
For SVD, n_components with maximum accuracy: 48
For Chi2, n_components with maximum accuracy: 49


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.decomposition import TruncatedSVD, PCA
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Step 1: Read the dataset
df = pd.read_csv('train_final2.csv')

# Step 2: Identify features related to Fine Needle Aspiration (FNA)
# Assuming 'FNA_features' is a list of features related to FNA
# Step 2: Identify features related to Fine Needle Aspiration (FNA)
FNA_features = df.columns[df.columns != 'DiagPeriodL90D'].tolist()
  # Define your FNA features

# Subset the DataFrame with FNA features
df_fna = df[FNA_features + ['DiagPeriodL90D']]  # Include target variable if needed

# Step 3: Optionally balance the dataset using SMOTE
# Split the data into features (X) and target variable (y)
X = df_fna.drop(['DiagPeriodL90D'], axis=1)
y = df_fna['DiagPeriodL90D']

# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

# Step 4: Apply dimensionality reduction techniques
# SVD
svd = TruncatedSVD(n_components=20)  # Adjust the number of components as needed
X_svd = svd.fit_transform(X_resampled)

# PCA
pca = PCA(n_components=20)  # Adjust the number of components as needed
X_pca = pca.fit_transform(X_resampled)

# Chi-square
selector = SelectKBest(chi2, k=20)  # Adjust k as needed
X_chi2 = selector.fit_transform(X_resampled, y_resampled)

# Step 5: Apply RS-SVM
# Define RS-SVM pipeline
rs_svm_pipeline = make_pipeline(StandardScaler(), SVC(kernel='linear'))

# Train RS-SVM on the original dataset
rs_svm_pipeline.fit(X_resampled, y_resampled)

# Get feature importances from RS-SVM
rs_svm_importances = rs_svm_pipeline.named_steps['svc'].coef_

# Optionally, you can use feature importances to select top features.

# Evaluate the accuracy of the RS-SVM model
accuracy_rs_svm = accuracy_score(y_resampled, rs_svm_pipeline.predict(X_resampled))
print("Accuracy of RS-SVM model:", accuracy_rs_svm)


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

# Step 1: Read the dataset
df = pd.read_csv('train_final2.csv')

# Step 2: Identify features related to Fine Needle Aspiration (FNA)
# Assuming 'FNA_features' is a list of features related to FNA
FNA_features = ['patient_id', 'patient_zip3', 'patient_age', 'disabled', 'Ozone', 'PM25', 'N02',
                'affected_site', 'patient_race_filled_encoded', 'breast_cancer_diagnosis_code_encoded', 'merged_bmi']

# Subset the DataFrame with FNA features
df_fna = df[FNA_features + ['DiagPeriodL90D']]  # Include target variable if needed

# Split the data into features (X) and target variable (y)
X = df_fna.drop(['DiagPeriodL90D'], axis=1)
y = df_fna['DiagPeriodL90D']

# Step 3: Optionally balance the dataset using SMOTE
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Apply SMOTE to balance the training dataset
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# Step 4: Train SVM model to get feature importances
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True))
svm_model.fit(X_resampled, y_resampled)

# Get feature importances from SVM model
svm_importances = svm_model.named_steps['svc'].coef_[0]

# Display feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': svm_importances})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances from SVM model:")
print(feature_importance_df)

# Step 5: Optionally, you can use feature importances to select top features.

# Evaluate the accuracy of the SVM model
accuracy_svm = accuracy_score(y_resampled, svm_model.predict(X_resampled))
print("Accuracy of SVM model:", accuracy_svm)


Feature Importances from SVM model:
                                 Feature  Importance
9   breast_cancer_diagnosis_code_encoded    0.858107
2                            patient_age    0.000056
5                                   PM25    0.000008
10                            merged_bmi   -0.000018
0                             patient_id   -0.000019
8            patient_race_filled_encoded   -0.000028
1                           patient_zip3   -0.000034
4                                  Ozone   -0.000051
6                                    N02   -0.000054
3                               disabled   -0.000099
7                          affected_site   -1.611041
Accuracy of SVM model: 0.6406177246624243


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Step 1: Read the dataset
df = pd.read_csv('train_final1.csv')

# Step 2: Identify features related to Fine Needle Aspiration (FNA)
# Assuming 'FNA_features' is a list of features related to FNA
# Step 2: Identify features related to Fine Needle Aspiration (FNA)
FNA_features = df.columns[df.columns != 'DiagPeriodL90D'].tolist()

# Subset the DataFrame with FNA features
df_fna = df[FNA_features + ['DiagPeriodL90D']]  # Include target variable if needed

# Split the data into features (X) and target variable (y)
X = df_fna.drop(['DiagPeriodL90D'], axis=1)
y = df_fna['DiagPeriodL90D']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM model to get feature importances
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True))
svm_model.fit(X_train, y_train)

# Get feature importances from SVM model
svm_importances = svm_model.named_steps['svc'].coef_[0]

# Display feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': abs(svm_importances)})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances from SVM model:")
print(feature_importance_df)

# Select the top features based on importance
top_features = feature_importance_df['Feature'].tolist()[:10]  # Adjust the number of top features as needed
print("Top Features:")
print(top_features)


Feature Importances from SVM model:
                                 Feature  Importance
69                         affected_site    1.627138
79  breast_cancer_diagnosis_code_encoded    0.878223
82                        patient_income    0.570656
23               income_household_median    0.272829
34             income_household_150_over    0.123399
..                                   ...         ...
64                      health_uninsured    0.000003
51                         self_employed    0.000002
72                         race_Hispanic    0.000002
68                                   N02    0.000002
78                             patient_F    0.000000

[83 rows x 2 columns]
Top Features:
['affected_site', 'breast_cancer_diagnosis_code_encoded', 'patient_income', 'income_household_median', 'income_household_150_over', 'income_individual_median', 'income_household_six_figure', 'never_married', 'married', 'income_household_100_to_150']


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# Step 1: Read the dataset
df = pd.read_csv('train_final2.csv')

# Step 2: Identify features related to Fine Needle Aspiration (FNA)
FNA_features = df.columns[df.columns != 'DiagPeriodL90D'].tolist()

# Subset the DataFrame with FNA features
df_fna = df[FNA_features + ['DiagPeriodL90D']]

# Split the data into features (X) and target variable (y)
X = df_fna.drop(['DiagPeriodL90D'], axis=1)
y = df_fna['DiagPeriodL90D']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM model to get feature importances
svm_model = make_pipeline(StandardScaler(), SVC(kernel='linear', probability=True))
svm_model.fit(X_train, y_train)

# Get feature importances from SVM model
svm_importances = svm_model.named_steps['svc'].coef_[0]

# Display feature importances
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': abs(svm_importances)})
feature_importance_df = feature_importance_df.sort_values(by='Importance', ascending=False)
print("Feature Importances from SVM model:")
print(feature_importance_df)

# Select the top features based on importance
top_features = feature_importance_df['Feature'].tolist()
print("Top Features:")
print(top_features)


Feature Importances from SVM model:
                                     Feature    Importance
69                             affected_site  1.627132e+00
75      breast_cancer_diagnosis_code_encoded  8.782296e-01
78                            patient_income  5.705612e-01
23                   income_household_median  2.728316e-01
34                 income_household_150_over  1.181046e-01
..                                       ...           ...
3                                 population  1.546333e-06
72                            payer_MEDICAID  1.425064e-06
2                                patient_age  1.288035e-06
76  metastatic_cancer_diagnosis_code_encoded  2.233854e-07
74                                 patient_F  0.000000e+00

[79 rows x 2 columns]
Top Features:
['affected_site', 'breast_cancer_diagnosis_code_encoded', 'patient_income', 'income_household_median', 'income_household_150_over', 'income_individual_median', 'income_household_six_figure', 'never_married', 'married', 