In [1]:
import pandas as pd

# Define the file path to your .data file
data_file_path = r'C:\Users\Supun\Desktop\Task 3\breast-cancer-wisconsin.data'

# List of feature names
feature_names = [
   "Sample code number",
   "Clump Thickness",
   "Uniformity of Cell Size",
   "Uniformity of Cell Shape",
   "Marginal Adhesion",
   "Single Epithelial Cell Size",
   "Bare Nuclei",
   "Bland Chromatin",
   "Normal Nucleoli",
   "Mitoses",
   "Class"
]

# Create the DataFrame from the .data file with specified column names
df = pd.read_csv(data_file_path, names=feature_names, header=None, delimiter=',')

df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [2]:
import numpy as np

# Replace ? with NaN
df.replace('?', np.nan, inplace=True)

# Count rows with null values
null_rows_count = df.isnull().any(axis=1).sum()

print("Number of rows with null values (including ? as null):", null_rows_count)

Number of rows with null values (including ? as null): 16


In [3]:
# Remove rows with any null (NaN) values
df = df.dropna(how='any')

df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [4]:
# Remove the "Sample code number" column
df = df.drop("Sample code number", axis=1)

df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [5]:
# Assuming the first 9 columns are the input features and the last column is the target variable
X = df.iloc[:, :-1]  # Features (first 9 columns)
y = df.iloc[:, -1]   # Target variable (last column)

In [6]:
X

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [7]:
y

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: Class, Length: 683, dtype: int64

In [8]:
# Count the occurrences of each unique item in the 'Name' column
value_counts = y.value_counts()

print(value_counts)

2    444
4    239
Name: Class, dtype: int64


In [14]:
import time
from sklearn.model_selection import cross_val_score, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC

# Create a DataFrame to store the results
results_df = pd.DataFrame(columns=["Model", "Accuracy", "Standard Deviation", "Time (s)"])

# Define the models
models = {
    "K-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Logistic Regression": LogisticRegression(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "XGBoost": XGBClassifier(),
    "SVM": SVC(),
}

# Perform 5-fold cross-validation for each model
for model_name, model in models.items():
    #kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    
    # Measure execution time
    start_time = time.time()
    
    scores = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    accuracy_mean = np.mean(scores)
    accuracy_std = np.std(scores)
    
    # Calculate execution time
    end_time = time.time()
    execution_time = end_time - start_time
    
    # Append results to the DataFrame
    results_df = results_df.append({"Model": model_name, "Accuracy": accuracy_mean, "Standard Deviation": accuracy_std, "Time (s)": execution_time}, ignore_index=True)

# Print the results table
print(results_df)


                 Model  Accuracy  Standard Deviation  Time (s)
0                 K-NN  0.969311            0.023241  0.022998
1          Naive Bayes  0.959049            0.017577  0.009999
2  Logistic Regression  0.966370            0.020400  0.023999
3        Decision Tree  0.935616            0.021767  0.010003
4        Random Forest  0.966380            0.024219  0.300245
5              XGBoost       NaN                 NaN  0.004999
6                  SVM  0.963461            0.025667  0.016514


Traceback (most recent call last):
  File "C:\Users\Supun\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Supun\anaconda3\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Supun\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1467, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [2 4]

Traceback (most recent call last):
  File "C:\Users\Supun\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Supun\anaconda3\lib\site-packages\xgboost\core.py", line 729, in inner_f
    return func(**kwargs)
  File "C:\Users\Supun\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1467, in fit
    raise ValueError(
ValueError: Invalid classes inferred from