# Importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis, LocalOutlierFactor
from sklearn.decomposition import PCA

import warnings
warnings.filterwarnings('ignore')

sns.set(rc={'figure.figsize': (10, 10)})
plt.figure(figsize=(10, 10));

In [None]:
data = pd.read_csv('../input/breast-cancer-dataset/data.csv')
data.head()

In [None]:
data.shape

In [None]:
data.describe()

In [None]:
data.info()

In [None]:
# Give the label column the generic name "target" used by the rest of the notebook.
data = data.rename(columns={'diagnosis': 'target'})
data.head(3)

# Cleaning

In [None]:
data.isna().sum()

In [None]:
# Remove the 'id' column before analysis.
data = data.drop(columns='id')
data.head(3)

# Visualizing and exploring

In [None]:

# Encode the label as 1 for "M" and 0 for anything else.
# The dtype guard keeps the cell re-runnable: once the column holds integers
# there is no .strip() to call, and the original comprehension would raise.
if not pd.api.types.is_numeric_dtype(data["target"]):
    data["target"] = (data["target"].str.strip() == "M").astype(int)

# Pass the column by keyword: newer seaborn releases treat the first positional
# argument of countplot as `data`, not `x`.
sns.countplot(x=data["target"])

In [None]:
data.target.value_counts()

In [None]:
data.corr()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns of `data`.
f, ax = plt.subplots(figsize=(32, 20))
sns.heatmap(
    data.corr(),
    ax=ax,
    annot=True,
    fmt=".4f",
    linewidths=0.5,
    linecolor="black",
)
plt.title("Correlation Between Features")
plt.show()

In [None]:
# Select the columns whose correlation with the target is strong in either
# direction, then cluster the correlations among those columns.
corr_matrix = data.corr()
threshold = 0.75
# Take the absolute value of the correlation BEFORE comparing with the threshold.
# The original applied np.abs to the boolean result, which silently ignored
# strongly negative correlations.
filtre = np.abs(corr_matrix['target']) > threshold
corr_features = corr_matrix.columns[filtre].tolist()
sns.clustermap(data[corr_features].corr(), annot = True, fmt = ".2f")
plt.title("Correlation Between Features with Corr Threshold 0.75")

In [None]:
# Box plot of every feature, split by target class.
# Reshape to long format first: one row per (target, feature, value).
data_melt = data.melt(id_vars="target", var_name="features", value_name="value")

plt.figure(figsize=(17, 7))
sns.boxplot(data=data_melt, x="features", y="value", hue="target")
plt.xticks(rotation=90)
plt.show()

The box plots do not give a clear indication of the outliers because the features are on very different scales, so we need to normalize the data.

Next, we plot pair plots of the features that are highly correlated with the target.

In [None]:
# Pair plot
sns.pairplot(data[corr_features], diag_kind = "kde", markers = "+", hue = "target")
plt.show()

Let's look at the skewness of the features. The sign of the skewness shows on which side of the distribution the long tail (and therefore the outliers) lies.

In [None]:
# Per-column skewness, replaced by a coarse text label:
# >= 1 -> positive, <= -1 -> negative, anything else -> "Normal Distribution".
skewness = data.skew().to_frame(name='skewness')

labels = []
for value in skewness['skewness']:
    if value >= 1:
        labels.append("Positively skewed")
    elif value <= -1:
        labels.append("Negatively skewed")
    else:
        labels.append("Normal Distribution")

skewness['skewness'] = labels
skewness

# Outlier Analysis

**From the above output we can make out that there are outliers present in the data, so we need to get rid of them before building our models.**

To determine the outliers we use density-based outlier detection with the Local Outlier Factor (LOF).

Here we have chosen 'target' as the dependent variable (y) and the remaining features as the independent variables (x).

In [None]:
# Separate the label (y) from the feature matrix (x) and remember the feature names.
y = data["target"]
x = data.drop(columns="target")
columns = list(x.columns)

In [None]:
# Fit a Local Outlier Factor model (default parameters) on the feature matrix.
clf = LocalOutlierFactor()
y_pred = clf.fit_predict(x) # Returns -1 for anomalies/outliers and +1 for inliers.
# Per-sample score exposed by the fitted model; it is compared against a negative
# threshold in the outlier-plot cell further down.
X_score = clf.negative_outlier_factor_
X_score[:5]

In [None]:
plt.figure()
plt.scatter(x.iloc[:,0], x.iloc[:,1], color = 'k', s = 3, label = 'Data Point') # radius_mean and texture_mean as an example plot
plt.show()

In [None]:
# Collect the LOF scores in a one-column frame so they can be filtered by index.
outlier_score = pd.DataFrame({"score": X_score})

# Rows whose negative LOF score falls below this cut-off are treated as outliers.
threshold = -2
filter_ = outlier_score["score"] < threshold
outlier_index = outlier_score.loc[filter_].index.tolist()

# Circle size per point: scores rescaled to [0, 1], largest for the lowest score.
radius = (X_score.max() - X_score) / (X_score.max() - X_score.min())

# Plot the first two feature columns: flagged outliers, all points, and score circles.
plt.figure(figsize=(16, 9))
plt.scatter(x.iloc[outlier_index, 0], x.iloc[outlier_index, 1],
            color='blue', s=40, label='Outliers')
plt.scatter(x.iloc[:, 0], x.iloc[:, 1],
            color='k', s=3, label='Data Points')
plt.scatter(x.iloc[:, 0], x.iloc[:, 1],
            s=1000 * radius, edgecolors="r", facecolors="none", label="Outlier Scores")
plt.legend()
plt.show()

In [None]:
# Remove the rows flagged as outliers from both the features and the labels.
# NOTE(review): not re-runnable as written; a second run raises KeyError because
# the labels are already gone.
x, y = x.drop(index=outlier_index), y.drop(index=outlier_index)

# Splitting the dataset into Test data and Train data.

In [None]:
# Train-test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.3, random_state = 42)

In [None]:
# Standardize the features: learn the scaling from the training split only,
# then apply the same fitted scaler to both splits.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [None]:
# Rebuild a labelled training frame from the scaled feature array.
# reset_index(drop=True) keeps y_train a 1-D Series aligned with the fresh
# RangeIndex of x_train. The original reset_index().drop("index") turned it into
# a one-column DataFrame, so every later estimator.fit received a column-vector y.
y_train = y_train.reset_index(drop=True)
x_train = pd.DataFrame(x_train, columns=columns)
train_df = pd.concat([x_train, y_train], axis=1)
train_df

In [None]:
train_df.describe()

In [None]:
data_melted = pd.melt(train_df, id_vars = "target", var_name = "feature", value_name = "value")
data_melted.head()

# Model building

In [None]:
# Logistic regression baseline: fit, report the learned parameters, then score
# the test split from the confusion matrix.
modelLogistic = LogisticRegression().fit(x_train, y_train)
for caption, fitted in (("The intercept b0= ", modelLogistic.intercept_),
                        ("The coefficient b1= ", modelLogistic.coef_)):
    print(caption, fitted)

y_pred = modelLogistic.predict(x_test)
ConfusionMatrix = confusion_matrix(y_test, y_pred)
print(ConfusionMatrix)

# Correct predictions sit on the diagonal of the confusion matrix.
TN, TP = ConfusionMatrix[0, 0], ConfusionMatrix[1, 1]
Total = len(y_test)
print("Accuracy from confusion matrix is ", (TN + TP) / Total)

In [None]:
# Baseline k-nearest-neighbours classifier with k = 2.
knn = KNeighborsClassifier(n_neighbors=2).fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Evaluate on the held-out split.
score = knn.score(x_test, y_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)

print("Score: ", score)
print("Confusion matrix: ")
print(cm)
print("Basic KNN accuracy: ", acc)

In [None]:
# Choosing best KNN parameters
def KNN_Best_Params(x_train, x_test, y_train, y_test):
    """Grid-search KNN hyper-parameters and report train/test performance.

    Searches n_neighbors in 1..30 and weights in {"uniform", "distance"} with
    10-fold cross-validation scored by accuracy. It then refits a classifier with
    the best parameters on the full training split and prints accuracies and
    confusion matrices for both splits.

    Parameters
    ----------
    x_train, x_test : array-like
        Feature matrices for the training and test splits.
    y_train, y_test : array-like
        Labels for the training and test splits.

    Returns
    -------
    GridSearchCV
        The fitted grid-search object.
    """
    k_range = list(range(1, 31))
    weight_options = ["uniform", "distance"]
    print()
    param_grid = dict(n_neighbors=k_range, weights=weight_options)

    knn = KNeighborsClassifier()
    grid = GridSearchCV(knn, param_grid, cv=10, scoring="accuracy")
    grid.fit(x_train, y_train)

    print("Best training score {} with parameters: {}".format(grid.best_score_, grid.best_params_))
    print()

    # Refit a fresh classifier with the winning parameters on the whole training split.
    knn = KNeighborsClassifier(**grid.best_params_)
    knn.fit(x_train, y_train)

    y_pred_test = knn.predict(x_test)
    y_pred_train = knn.predict(x_train)

    cm_test = confusion_matrix(y_test, y_pred_test)
    cm_train = confusion_matrix(y_train, y_pred_train)

    acc_test = accuracy_score(y_test, y_pred_test)
    acc_train = accuracy_score(y_train, y_pred_train)
    print("Test score: {}, train score: {}".format(acc_test, acc_train))
    print()
    print("confusion matrix Test")
    print(cm_test)
    print("confusion matrix Train")
    print(cm_train)
    # The original scored the notebook-global `y_pred` (predictions left over from
    # an earlier cell) here; report this function's own test accuracy instead.
    print(acc_test)
    return grid

grid = KNN_Best_Params(x_train, x_test, y_train, y_test)

In [None]:
# Linear-kernel support vector classifier trained on the standardized features.
clf = svm.SVC(kernel='linear').fit(x_train, y_train)

# Predict the held-out split and report the confusion matrix and accuracy.
y_pred = clf.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
acc = accuracy_score(y_test, y_pred)
print(cm)
print(acc)

**The model is good, but we can try to improve it by performing dimensionality reduction.**

Principal Component Analysis (PCA)

In [None]:
# Assume that we have two arrays to find eigen values and eigen vectors
x2 = np.array([2.4, 0.6, 2.1, 2, 3, 2.5, 1.9, 1.1, 1.5, 1.2])
y2 = np.array([2.5, 0.7, 2.9, 2.2, 3, 2.3, 2, 1.1, 1.6, 0.8])
plt.scatter(x2,y2)

In [None]:
# Center the toy data on the origin by subtracting each coordinate's mean.
x_mean, y_mean = np.mean(x2), np.mean(y2)
x2 = x2 - x_mean
y2 = y2 - y_mean
plt.scatter(x2, y2)

In [None]:
# cov matrix
cov = np.cov(x2, y2)
cov

Finding the eigenvalues and eigenvectors of the covariance matrix.

In [None]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import
# cell at the top (np.linalg is also reachable through the existing `np` import).
from numpy import linalg as LA
# w holds the eigenvalues and v the eigenvectors (as columns) of the covariance matrix.
w, v = LA.eig(cov)
w

In [None]:
v

**Here the array `w` holds the eigenvalues, each repeated according to its multiplicity. 
The array `v` holds the normalized (unit “length”) eigenvectors, such that the column `v[:,i]` is the eigenvector corresponding to the eigenvalue `w[i]`.**

In [None]:
# np.linalg.eig does not return the eigenvalues in any guaranteed order, so pick
# the directions by eigenvalue instead of by hard-coded column position.
order = np.argsort(w)[::-1]   # eigenvalue indices, largest first
p1 = v[:, order[0]]           # main component (largest eigenvalue)
p2 = v[:, order[-1]]          # small component (smallest eigenvalue)
p1

In [None]:
# Overlay the two eigenvector directions on the centered toy data.
plt.scatter(x2, y2)
# The main component is drawn with half-length 2, the small component with half-length 1.
for scale, direction in ((2, p1), (1, p2)):
    plt.plot([-scale * direction[0], scale * direction[0]],
             [-scale * direction[1], scale * direction[1]])

In [None]:
# PCA: standardize all (outlier-filtered) features, then project onto 2 components.
# NOTE(review): the scaler and PCA are fitted on the full data before the
# train/test split, and `scaler` overwrites the scaler fitted on x_train earlier.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(x)

pca = PCA(n_components=2).fit(x_scaled)
x_reduced_pca = pca.transform(x_scaled)

# Collect the projection with its labels; y keeps gaps in its index after the
# outlier drop, so reset it to line up with the fresh RangeIndex.
pca_data = pd.DataFrame(x_reduced_pca, columns=["p1", "p2"])
pca_data["target"] = y.reset_index(drop=True)

# Scatter of the two components coloured by class.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=pca_data, x="p1", y="p2", hue="target")
plt.title("PCA: p1 vs p2")
plt.show()

In [None]:
# Train-test split
x_train_pca, x_test_pca, y_train_pca, y_test_pca = train_test_split(x_reduced_pca, y, test_size = 0.3, random_state = 42)
grid_pca = KNN_Best_Params(x_train_pca, x_test_pca, y_train_pca, y_test_pca)

In [None]:
# Visualize the decision regions of the tuned KNN in the 2-D PCA space.
cmap_light = ListedColormap(["orange", "cornflowerblue"])
cmap_bold = ListedColormap(["darkorange", "darkblue"])

h = .05  # step size of the prediction mesh
X = x_reduced_pca
# Pad each axis by one unit. The vertical range must come from the second
# component (column 1); the original reused column 0 for both axes, so the mesh
# did not match the plotted data.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict the class of every mesh point with the grid-searched model.
Z = grid_pca.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap = cmap_light)

# Overlay all projected samples (train and test), coloured by their true class.
plt.scatter(X[:,0], X[:,1], c = y, cmap = cmap_bold, edgecolor = "k", s = 20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')" % (len(np.unique(y)), grid_pca.best_estimator_.n_neighbors, grid_pca.best_estimator_.weights))

In [None]:
# Refit KNN with the best PCA-space parameters and highlight the test points it gets wrong.
knn = KNeighborsClassifier(**grid_pca.best_params_)
knn.fit(x_train_pca, y_train_pca)
y_pred_pca = knn.predict(x_test_pca)
# accuracy_score expects (y_true, y_pred). The original computed this value and
# knn.score(...) but displayed neither, so report it explicitly.
acc_test_pca = accuracy_score(y_test_pca, y_pred_pca)
print("PCA + KNN test accuracy: {}".format(acc_test_pca))

# Use positional arrays throughout: y_test_pca keeps the shuffled original index.
y_true_pca = np.asarray(y_test_pca)
test_data = pd.DataFrame({
    "X_test_pca_p1": x_test_pca[:, 0],
    "X_test_pca_p2": x_test_pca[:, 1],
    "Y_pred_pca": y_pred_pca,
    "Y_test_pca": y_true_pca,
})

plt.figure(figsize = (14, 8))
sns.scatterplot(x = "X_test_pca_p1", y = "X_test_pca_p2", hue="Y_test_pca", data = test_data)

# Row positions where prediction and truth disagree.
diff = np.flatnonzero(y_pred_pca != y_true_pca)
plt.scatter(test_data.iloc[diff, 0], test_data.iloc[diff, 1], label = "Wrong Classified", alpha = 0.2, color = "red",s = 1000)
# Rebuild the legend so the "Wrong Classified" marker is listed; without this call
# the label was never displayed.
plt.legend(title = "Y_test_pca")
plt.show()

# Neighborhood Component Analysis (NCA)

In [None]:
# NCA: learn a supervised 2-D projection of the standardized features.
# NOTE(review): NCA is fitted with the labels of ALL samples before the
# train/test split, so test labels influence the projection.
nca = NeighborhoodComponentsAnalysis(n_components=2, random_state=42).fit(x_scaled, y)
x_reduced_nca = nca.transform(x_scaled)

# Collect the projection with its labels, realigning y to a fresh RangeIndex.
nca_data = pd.DataFrame(x_reduced_nca, columns=["p1", "p2"])
nca_data["target"] = y.reset_index(drop=True)

# Scatter of the two components coloured by class.
plt.figure(figsize=(10, 6))
sns.scatterplot(data=nca_data, x="p1", y="p2", hue="target")
plt.title("NCA: p1 vs p2")
plt.show()

In [None]:
# Train-test split
x_train_nca, x_test_nca, y_train_nca, y_test_nca = train_test_split(x_reduced_nca, y, test_size = 0.3, random_state = 42)
grid_nca = KNN_Best_Params(x_train_nca, x_test_nca, y_train_nca, y_test_nca)

In [None]:
# Visualize the decision regions of the tuned KNN in the 2-D NCA space.
cmap_light = ListedColormap(["orange", "cornflowerblue"])
cmap_bold = ListedColormap(["darkorange", "darkblue"])

h = .2  # step size of the prediction mesh
X = x_reduced_nca
# Pad each axis by one unit. The vertical range must come from the second
# component (column 1); the original reused column 0 for both axes, so the mesh
# did not match the plotted data.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))

# Predict the class of every mesh point with the grid-searched model.
Z = grid_nca.predict(np.c_[xx.ravel(), yy.ravel()])

# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.figure()
plt.pcolormesh(xx, yy, Z, cmap = cmap_light)

# Overlay all projected samples (train and test), coloured by their true class.
plt.scatter(X[:,0], X[:,1], c = y, cmap = cmap_bold, edgecolor = "k", s = 20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("%i-Class classification (k = %i, weights = '%s')" % (len(np.unique(y)), grid_nca.best_estimator_.n_neighbors, grid_nca.best_estimator_.weights))

Let's point out the wrongly classified data points.

In [None]:
# Refit KNN with the best NCA-space parameters and highlight the test points it gets wrong.
knn = KNeighborsClassifier(**grid_nca.best_params_)
knn.fit(x_train_nca, y_train_nca)
y_pred_nca = knn.predict(x_test_nca)
# accuracy_score expects (y_true, y_pred). The original computed this value and
# knn.score(...) but displayed neither, so report it explicitly.
acc_test_nca = accuracy_score(y_test_nca, y_pred_nca)
print("NCA + KNN test accuracy: {}".format(acc_test_nca))

# Use positional arrays throughout: y_test_nca keeps the shuffled original index.
y_true_nca = np.asarray(y_test_nca)
test_data = pd.DataFrame({
    "X_test_nca_p1": x_test_nca[:, 0],
    "X_test_nca_p2": x_test_nca[:, 1],
    "Y_pred_nca": y_pred_nca,
    "Y_test_nca": y_true_nca,
})

plt.figure(figsize = (14, 8))
sns.scatterplot(x = "X_test_nca_p1", y = "X_test_nca_p2", hue="Y_test_nca", data = test_data)

# Row positions where prediction and truth disagree.
diff = np.flatnonzero(y_pred_nca != y_true_nca)
plt.scatter(test_data.iloc[diff, 0], test_data.iloc[diff, 1], label = "Wrong Classified", alpha = 0.2, color = "red",s = 1000)
# Rebuild the legend so the "Wrong Classified" marker is listed; without this call
# the label was never displayed.
plt.legend(title = "Y_test_nca")
plt.show()

In [None]:
# Linear-kernel SVM on the standardized feature matrix.
# NOTE(review): this cell repeats the earlier SVM cell verbatim and still uses
# x_train / x_test, not the PCA or NCA projections; confirm whether it was meant
# to evaluate the reduced features.
clf = svm.SVC(kernel='linear')
clf.fit(x_train, y_train)

# Score the held-out split.
y_pred = clf.predict(x_test)
cm, acc = confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred)
print(cm)
print(acc)