In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


/kaggle/input/breast-cancer-classification-map-6990-mini-kaggle/sample_submission.csv
/kaggle/input/breast-cancer-classification-map-6990-mini-kaggle/train.csv
/kaggle/input/breast-cancer-classification-map-6990-mini-kaggle/test.csv


In [2]:
# libraries
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the training data from the specified address
train_data = pd.read_csv('/kaggle/input/breast-cancer-classification-map-6990-mini-kaggle/train.csv')



## Instructions

In [3]:
# Remove the original ID column (named '0') and the competition ID column (named 'ID') as they are non-informative features.
# Remove the target variable column (named '1').
# The columns for the features (X) are intentionally unnamed (columns 2 to 31). You are welcome to name them according to your preference.
# To study the names of the features, you can refer to http://archive.ics.uci.edu/dataset/17/breast+cancer+wisconsin+diagnostic.
# We follow the same order for the features as the original dataset in UCI.

X = train_data.drop(['1','0','ID'], axis=1)
y = train_data['1']

# Preview the data
train_data.head()
train_data.shape, X.shape

((455, 33), (455, 30))

In [4]:
train_data.isnull().sum()

1     0
0     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    0
13    0
14    0
15    0
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    0
31    0
ID    0
dtype: int64

In [5]:
# Checking for multicollinearity
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Calculate VIF for each feature
vif = pd.DataFrame()
vif['Feature'] = X.columns
vif['VIF'] = [variance_inflation_factor(X, i) for i in range(X.shape[1])]

print("Variance Inflation Factor (VIF) values:")
print(vif)

Variance Inflation Factor (VIF) values:
   Feature           VIF
0        2  68824.552614
1        3    256.801906
2        4  63524.127628
3        5   1393.680937
4        6    384.436118
5        7    201.126342
6        8    157.908264
7        9    164.534177
8       10    183.229699
9       11    635.043198
10      12    234.785069
11      13     25.155300
12      14    216.368346
13      15     69.454100
14      16     23.642384
15      17     41.915739
16      18     34.250386
17      19     56.664855
18      20     38.833269
19      21     26.393239
20      22   9810.298448
21      23    350.735285
22      24   4931.365802
23      25   1191.284432
24      26    354.073564
25      27    133.052906
26      28     83.781954
27      29    156.527707
28      30    211.704124
29      31    434.191190


In [6]:
# Return Highly correlated features
def correlation(df, threshold):
    correlated_cols = set()
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold:
                colname = corr_matrix.columns[i]
                correlated_cols.add(colname)
    return correlated_cols

correlation(X, 0.9)

{'14', '15', '22', '23', '24', '25', '29', '4', '5', '9'}

In [7]:
# Delete highly correlated features
vif_cols = [
    '14', '15', '22', '23', '24', '25', '28', '29', '4', '5', '8', '9'
]
X_vif = X.drop(vif_cols, axis=1 )
X_vif.shape, X.shape

((455, 18), (455, 30))

In [8]:
# Feature scaling

# Create a standar scaler instance
scale = StandardScaler()

# tranform the data with Standard Scaling
X_scale = scale.fit_transform(X_vif)

# Split the data into training and validation sets
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, test_size=0.3, random_state=45, stratify=y)

In [9]:
X.shape, X_train.shape, X_test.shape

((455, 30), (318, 18), (137, 18))

In [10]:
# Logistic Regression

# Create a Logistic Regression instance
lr = LogisticRegression()

# Train the Logistic Regression model
lr_model = lr.fit(X_train, y_train)

# Make predictions on the test set
lr_pred = lr_model.predict(X_test)

# Calculate accuracy
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression Accuracy: %.3f"% lr_accuracy)

Logistic Regression Accuracy: 0.971


In [11]:
# L1 regularization (LASSO)
from sklearn.linear_model import LogisticRegression

# Create an L1 instance
l1 = LogisticRegression(penalty='l1', C=1.0, solver='liblinear', multi_class='ovr')
# Note that C=1.0 is the default. You can increase
# or decrease it to make the regulariztion effect
# stronger or weaker, respectively.

# Train the L1 model
l1.fit(X_train, y_train)

# Calculate accuracy
print('Training accuracy:', l1.score(X_train, y_train))
print('Test accuracy:', l1.score(X_test, y_test))

Training accuracy: 0.9685534591194969
Test accuracy: 0.9781021897810219


In [12]:
# Perceptron

# Create a Perceptron instance
perceptron = Perceptron()
#perceptron = Perceptron(eta=0.1, random_state=1)

# Train the perceptron model
perceptron.fit(X_train, y_train)

# Make predictions on the test set
perceptron_pred = perceptron.predict(X_test)

# Calculate accuracy
perceptron_accuracy = accuracy_score(y_test, perceptron_pred)
print("Perceptron Accuracy: %.3f"% perceptron_accuracy)



Perceptron Accuracy: 0.964


In [13]:
# SVM - Support Vector Machine

# Create SVM instance
svm = SVC()

# Train the SVM model
svm.fit(X_train, y_train)

# Make predictions on the test set
svm_pred = svm.predict(X_test)

# Calculate accuracy
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy: %.3f"% svm_accuracy)



SVM Accuracy: 0.956


In [14]:
# SVM - Support Vector Machine

# Create SVM instance
svm = SVC(kernel='rbf')

# Train the SVM model
svm.fit(X_train, y_train)

# Make predictions on the test set
svm_pred = svm.predict(X_test)

# Calculate accuracy
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM Accuracy: %.3f"% svm_accuracy)

SVM Accuracy: 0.956


In [15]:
# K-Nearest Neighbors (KNN) Model

# Creat KNN instance
knn = KNeighborsClassifier(n_neighbors=5)  # Using 5 neighbors, you can change this value as needed

# Train the KNN model
knn.fit(X_train, y_train)

# Make predictions on the validation set
knn_pred = knn.predict(X_test)

# Calculate accuracy of the KNN model
knn_accuracy = accuracy_score(y_test, knn_pred)
print("KNN Accuracy: %.3f" % knn_accuracy)

KNN Accuracy: 0.942


In [16]:
# Decision Tree

# Create a Decision Tree instance
decision_tree = DecisionTreeClassifier()

# Train the Decision Tree model
decision_tree.fit(X_train, y_train)

# Make predictions on the test set
tree_pred = decision_tree.predict(X_test)

# Calculate accuracy
tree_accuracy = accuracy_score(y_test, tree_pred)
print("Decision Tree Accuracy: %.3f"% tree_accuracy)


Decision Tree Accuracy: 0.905


In [17]:
# Random Forest

# Create a Random Forest instance
random_forest = RandomForestClassifier()

# Train the Random Forest model
random_forest.fit(X_train, y_train)

# Make predictions on the test set
forest_pred = random_forest.predict(X_test)

# Calculate accuracy
forest_accuracy = accuracy_score(y_test, forest_pred)
print("Random Forest Accuracy: %.3f"% forest_accuracy)


Random Forest Accuracy: 0.956


# Test Submission

In [18]:
# Load Test data
test_data = pd.read_csv('/kaggle/input/breast-cancer-classification-map-6990-mini-kaggle/test.csv')
test_data.head()

Unnamed: 0,0,2,3,4,5,6,7,8,9,10,...,23,24,25,26,27,28,29,30,31,ID
0,87930,12.47,18.6,81.09,481.9,0.09965,0.1058,0.08005,0.03821,0.1925,...,24.64,96.05,677.9,0.1426,0.2378,0.2671,0.1015,0.3014,0.0875,205
1,859575,18.94,21.31,123.6,1130.0,0.09009,0.1029,0.108,0.07951,0.1582,...,26.58,165.9,1866.0,0.1193,0.2336,0.2687,0.1789,0.2551,0.06589,71
2,8670,15.46,19.48,101.7,748.9,0.1092,0.1223,0.1466,0.08087,0.1931,...,26.0,124.9,1156.0,0.1546,0.2394,0.3791,0.1514,0.2837,0.08019,132
3,907915,12.4,17.68,81.47,467.8,0.1054,0.1316,0.07741,0.02799,0.1811,...,22.91,89.61,515.8,0.145,0.2629,0.2403,0.0737,0.2556,0.09359,432
4,921385,11.54,14.44,74.65,402.9,0.09984,0.112,0.06737,0.02594,0.1818,...,19.68,78.78,457.8,0.1345,0.2118,0.1797,0.06918,0.2329,0.08134,541


### Support Vector Machine (SVM) Model

In [19]:
# Data processing for test data
X = test_data.drop(['0', 'ID'], axis=1)  # Ensure test data features match training data

# Delete highly correlated features
vif_cols = [
    '14', '15', '22', '23', '24', '25', '28', '29', '4', '5', '8', '9'
]
X_vif = X.drop(vif_cols, axis=1 )

# Predictions using the L1 model
final_predictions = l1.predict(scale.transform(X_vif))

# Create a DataFrame for submission
output = pd.DataFrame({'ID': test_data['ID'], '1': final_predictions})

# Write the DataFrame to a csv file
output.to_csv('submission.csv', index=False)
