In [17]:
import pandas as pd

# Define the file path to your .data file
data_file_path = 'breast-cancer-wisconsin.data'

# List of feature names
feature_names = [
   "Sample code number",
   "Clump Thickness",
   "Uniformity of Cell Size",
   "Uniformity of Cell Shape",
   "Marginal Adhesion",
   "Single Epithelial Cell Size",
   "Bare Nuclei",
   "Bland Chromatin",
   "Normal Nucleoli",
   "Mitoses",
   "Class"
]

# Create the DataFrame from the .data file with specified column names
df = pd.read_csv(data_file_path, names=feature_names, header=None, delimiter=',')

df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [18]:
import numpy as np

# Replace ? with NaN
df.replace('?', np.nan, inplace=True)

# Count rows with null values
null_rows_count = df.isnull().any(axis=1).sum()

print("Number of rows with null values (including ? as null):", null_rows_count)

Number of rows with null values (including ? as null): 16


In [19]:
# Remove rows with any null (NaN) values
df = df.dropna(how='any')

df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [20]:
df.duplicated().sum()

8

In [21]:
dataset=df.drop_duplicates()

In [22]:
dataset

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [23]:
# Remove the "Sample code number" column
df = dataset.drop("Sample code number", axis=1)

df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [24]:
# Assuming the first 9 columns are the input features and the last column is the target variable
X = df.iloc[:, :-1]  # Features (first 9 columns)
y = df.iloc[:, -1]   # Target variable (last column)

In [25]:
X

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [26]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.




In [27]:
# check version number
import imblearn
print(imblearn.__version__)

0.11.0


In [28]:
# define undersample strategy
from imblearn.under_sampling import RandomUnderSampler
undersample = RandomUnderSampler(sampling_strategy='majority')

In [29]:
# fit and apply the transform
X_under, y_under = undersample.fit_resample(X, y)

In [30]:
#shape after the balanced
print("shape of the X : ",X_under.shape)
print("shape of the y : ",y_under.shape)

shape of the X :  (472, 9)
shape of the y :  (472,)


In [31]:
y

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: Class, Length: 675, dtype: int64

In [32]:
# Count the occurrences of each unique item in the 'Name' column
value_counts = y.value_counts()

print(value_counts)

2    439
4    236
Name: Class, dtype: int64


In [56]:
import time
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["Model", "Accuracy", "Standard Deviation", "Time (s)"])

# Define the models
models = {
    "K-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(random_state=15, criterion='gini', max_depth=None, min_samples_leaf= 4, min_samples_split=10),
    "Random Forest": RandomForestClassifier(),
    "GradientBoosting": GradientBoostingClassifier(),
    "SVM": SVC(),
}

# Perform 5-fold cross-validation for each model
for model_name, model in models.items():
    #kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Measure execution time
    start_time = time.time()
    
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    accuracy_mean = np.mean(scores)
    accuracy_std = np.std(scores)
    
    # Calculate execution time
    end_time = time.time()
    execution_time = end_time - start_time
    
    # Append results to the DataFrame
    results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)

# Print the results table
print(results_df)


  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)
  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)
  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)
  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)
  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)


                 Model  Accuracy Standard Deviation  Time (s)
0                 K-NN  0.968889           0.021672  0.078544
1          Naive Bayes  0.958519           0.015253    0.0313
2  Logistic Regression  0.965926           0.017901  0.093697
3        Decision Tree  0.940741           0.028497  0.046824
4        Random Forest  0.962963           0.019316  0.988919
5     GradientBoosting  0.957037           0.020096  0.575107
6                  SVM  0.964444           0.022173  0.059675


  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)
  results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)
