In [1]:
import pandas as pd

# Location of the raw breast-cancer-wisconsin data file
data_file_path = 'breast-cancer-wisconsin.data'

# Column labels, applied positionally to the columns of the file
feature_names = [
    "Sample code number",
    "Clump Thickness",
    "Uniformity of Cell Size",
    "Uniformity of Cell Shape",
    "Marginal Adhesion",
    "Single Epithelial Cell Size",
    "Bare Nuclei",
    "Bland Chromatin",
    "Normal Nucleoli",
    "Mitoses",
    "Class",
]

# The file is read without a header row, so the labels above are supplied explicitly
df = pd.read_csv(
    data_file_path,
    sep=',',
    header=None,
    names=feature_names,
)

df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [2]:
import numpy as np

# Treat the '?' placeholder as a missing value (NaN)
df.replace(to_replace='?', value=np.nan, inplace=True)

# A row counts as "null" if at least one of its columns is missing
null_rows_count = df.isna().any(axis='columns').sum()

print("Number of rows with null values (including ? as null):", null_rows_count)

Number of rows with null values (including ? as null): 16


In [3]:
# Remove rows with any null (NaN) values
df = df.dropna(how='any')

# A column that contained the '?' placeholder is read by pandas as text
# (object dtype), and replacing '?' with NaN does not change that: the
# remaining values are still digit strings. With the incomplete rows gone,
# convert every column to a numeric dtype so that all features are real
# numbers for the downstream models and for pandas numeric summaries.
# Columns that are already numeric pass through unchanged.
df = df.apply(pd.to_numeric)

df

Unnamed: 0,Sample code number,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,2
1,1002945,5,4,4,5,7,10,3,2,1,2
2,1015425,3,1,1,1,2,2,3,1,1,2
3,1016277,6,8,8,1,3,4,3,7,1,2
4,1017023,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...,...
694,776715,3,1,1,1,3,2,1,1,1,2
695,841769,2,1,1,1,2,1,1,1,1,2
696,888820,5,10,10,3,7,3,8,10,2,4
697,897471,4,8,6,4,3,4,10,6,1,4


In [4]:
# Drop the "Sample code number" column from the table
df = df.drop(columns=["Sample code number"])

df

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,5,1,1,1,2,1,3,1,1,2
1,5,4,4,5,7,10,3,2,1,2
2,3,1,1,1,2,2,3,1,1,2
3,6,8,8,1,3,4,3,7,1,2
4,4,1,1,3,2,1,3,1,1,2
...,...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1,2
695,2,1,1,1,2,1,1,1,1,2
696,5,10,10,3,7,3,8,10,2,4
697,4,8,6,4,3,4,10,6,1,4


In [5]:
# Split by position: every column except the last is an input feature,
# and the last column is the target variable.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

In [6]:
# Display the feature matrix (all columns of df except the last)
X

Unnamed: 0,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses
0,5,1,1,1,2,1,3,1,1
1,5,4,4,5,7,10,3,2,1
2,3,1,1,1,2,2,3,1,1
3,6,8,8,1,3,4,3,7,1
4,4,1,1,3,2,1,3,1,1
...,...,...,...,...,...,...,...,...,...
694,3,1,1,1,3,2,1,1,1
695,2,1,1,1,2,1,1,1,1
696,5,10,10,3,7,3,8,10,2
697,4,8,6,4,3,4,10,6,1


In [7]:
# Display the target Series (the last column of df)
y

0      2
1      2
2      2
3      2
4      2
      ..
694    2
695    2
696    4
697    4
698    4
Name: Class, Length: 683, dtype: int64

In [8]:
# Count the occurrences of each unique value in the target Series y
# (this shows how many samples fall into each class)
value_counts = y.value_counts()

print(value_counts)

2    444
4    239
Name: Class, dtype: int64


In [19]:
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

# Hyperparameter candidates for the k-nearest-neighbours search
param_grid = dict(
    n_neighbors=[3, 5, 7, 9],           # number of neighbours that vote
    weights=['uniform', 'distance'],    # equal votes vs. distance-weighted votes
    p=[1, 2],                           # Minkowski power: 1 = Manhattan, 2 = Euclidean
)

# Base estimator with default settings; the grid search overrides the parameters above
knn = KNeighborsClassifier()

# Exhaustive search over the grid with 5-fold cross-validation, using all CPU cores
grid_search = GridSearchCV(knn, param_grid, cv=5, n_jobs=-1)

# Run the search on the full feature matrix and target
grid_search.fit(X, y)

# Report the winning combination and its mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'n_neighbors': 5, 'p': 2, 'weights': 'uniform'}
Best Score: 0.969310863031344


In [20]:
from sklearn.svm import SVC

# Define the parameter grid to search.
# 'gamma' is the kernel coefficient of the 'rbf' kernel and is ignored by the
# linear kernel. Crossing it with kernel='linear' would refit the same linear
# model once per gamma value, so each kernel gets its own sub-grid.
# Note: if a linear candidate wins, best_params_ will not contain a 'gamma' key.
param_grid = [
    {
        'kernel': ['linear'],
        'C': [0.1, 1, 10],  # Regularization parameter
    },
    {
        'kernel': ['rbf'],
        'C': [0.1, 1, 10],  # Regularization parameter
        'gamma': ['scale', 'auto', 0.1, 1],  # Kernel coefficient for 'rbf' kernel
    },
]

# Create an SVM classifier
svm_classifier = SVC()

# Create a GridSearchCV object with 5-fold cross-validation
grid_search = GridSearchCV(estimator=svm_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the search on the full dataset (there is no separate training split here;
# the cross-validation folds are drawn from X and y)
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}
Best Score: 0.9678402747960497


In [21]:
from sklearn.tree import DecisionTreeClassifier

# Define the parameter grid to search
param_grid = {
    'criterion': ['gini', 'entropy'],  # Splitting criterion
    'max_depth': [None, 10, 20, 30, 40, 50],  # Maximum depth of the tree
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required to be at a leaf node
}

# Create a Decision Tree classifier.
# Tree construction involves randomness (features are permuted at each split),
# so a fixed random_state is needed for the reported best parameters and score
# to be reproducible when the notebook is re-run.
dtree = DecisionTreeClassifier(random_state=42)

# Create a GridSearchCV object with 5-fold cross-validation
grid_search = GridSearchCV(estimator=dtree, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the search on the full dataset
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'criterion': 'gini', 'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}
Best Score: 0.9531880635465866


In [23]:
from sklearn.ensemble import RandomForestClassifier

# Create a Random Forest classifier.
# A random forest is stochastic (bootstrap sampling and random feature
# selection), so a fixed random_state is needed for the reported best
# parameters and score to be reproducible when the notebook is re-run.
rf_classifier = RandomForestClassifier(random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [None, 10, 20, 30],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required to be a leaf node
    'bootstrap': [True, False]  # Whether bootstrap samples are used
}

# Create a GridSearchCV object with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the search on the full dataset
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'bootstrap': True, 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Best Score: 0.9722198368398456


In [27]:
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

# Gaussian Naive Bayes: 5-fold cross-validated accuracy, averaged over the folds
gnb_classifier = GaussianNB()
accuracy_scores_gnb = cross_val_score(gnb_classifier, X, y, scoring='accuracy', cv=5)
mean_accuracy_gnb = accuracy_scores_gnb.mean()

# Multinomial Naive Bayes: same evaluation protocol
mnb_classifier = MultinomialNB()
accuracy_scores_mnb = cross_val_score(mnb_classifier, X, y, scoring='accuracy', cv=5)
mean_accuracy_mnb = accuracy_scores_mnb.mean()

# Report both means, Gaussian first
print("Mean Accuracy (GaussianNB):", mean_accuracy_gnb)

print("Mean Accuracy (MultinomialNB):", mean_accuracy_mnb)

Mean Accuracy (GaussianNB): 0.9590489480463719
Mean Accuracy (MultinomialNB): 0.8873765564620009


In [32]:
from sklearn.ensemble import GradientBoostingClassifier

# Define the parameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],  # Number of trees in the ensemble
    'learning_rate': [0.01, 0.1, 0.2],  # Learning rate
    'max_depth': [3, 4, 5],  # Maximum depth of each tree
    'min_samples_split': [2, 3, 4]  # Minimum number of samples required to split an internal node
}

# Create a Gradient Boosting classifier.
# The underlying trees involve randomness (features are permuted at each
# split), so a fixed random_state is needed for the reported best parameters
# and score to be reproducible when the notebook is re-run.
gb_classifier = GradientBoostingClassifier(random_state=42)

# Create a GridSearchCV object with 5-fold cross-validation
grid_search = GridSearchCV(estimator=gb_classifier, param_grid=param_grid, cv=5, n_jobs=-1)

# Fit the search on the full dataset
grid_search.fit(X, y)

# Print the best hyperparameters and the corresponding mean cross-validated score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Hyperparameters: {'learning_rate': 0.2, 'max_depth': 3, 'min_samples_split': 3, 'n_estimators': 100}
Best Score: 0.9648990983254615
