In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

#SKLearn library and its functions
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the dataset from CSV and discard the 'id' column in one chained step.
df = pd.read_csv('breast-cancer.csv').drop(columns=['id'])

# Preview the first rows.
df.head()

In [None]:
# Balance the classes: randomly drop surplus 'B' rows so that the 'M' and 'B'
# diagnosis groups end up with the same number of samples.
filter_B = df[df['diagnosis'] == 'B']
filter_M = df[df['diagnosis'] == 'M']

# Derive the surplus from the data instead of hard-coding the class counts.
# Clamping at 0 makes the cell safe to re-run: once the frame is balanced,
# nothing more is dropped. Only the 'B' group is ever downsampled.
n = max(len(filter_B) - len(filter_M), 0)

# A fixed seed keeps the retained subset identical across kernel restarts.
filter_B = (
    filter_B
    .drop(filter_B.sample(n, random_state=42).index)
    .reset_index(drop=True)
)

# ignore_index avoids duplicate row labels in the combined frame
# (filter_B was re-indexed from 0 while filter_M keeps its original labels).
df = pd.concat([filter_M, filter_B], ignore_index=True)

# Confirm that both classes now have equal counts.
df['diagnosis'].value_counts()

In [4]:
# Separate the prediction target from the predictors.
# The target is taken to be the first column of df.
class_col = df.columns[0]

y = df[class_col]
X = df.drop(class_col, axis=1)




In [5]:
# Hold out 30% of the rows as the test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [6]:
# Standardise the features. The scaler is fitted on the training split only and
# the same fitted transform is then applied to both splits.
scaler = StandardScaler().fit(X_train)

X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

In [None]:
# Fit one logistic regression model per solver so the solvers can be compared.
solver = ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Fitted models, in the same order as `solver`.
models = []

for solver_name in solver:
    # random_state pins the data shuffling used by 'liblinear', 'sag' and 'saga',
    # so repeated runs give the same coefficients.
    # max_iter is raised above the library default to give the iterative
    # solvers room to converge instead of stopping early with a warning.
    log_reg = LogisticRegression(solver=solver_name, max_iter=1000, random_state=42)
    log_reg.fit(X_train, y_train)

    models.append(log_reg)

In [None]:
# Evaluate every fitted model on the test split: print its confusion matrix
# followed by its classification report, labelled with the solver name.
for solver_name, model in zip(solver, models):
    y_pred = model.predict(X_test)

    conf_mat = confusion_matrix(y_test, y_pred)
    print(f'Confusion matrix for the model: {solver_name}')
    print(conf_mat)

    report = classification_report(
        y_test,
        y_pred,
        target_names=['benign', 'malignant'],
    )
    print(f'Classification report for the model: {solver_name}')
    print(report)



In [None]:
# Correlation heatmap over the first ten columns of df.
# Encode only the 'diagnosis' column numerically (M -> 0, B -> 1). An explicit
# map on that one column avoids a frame-wide replace, which would touch every
# column and depends on pandas silently downcasting the result to integers.
df_rep = df.assign(diagnosis=df['diagnosis'].map({'M': 0, 'B': 1}))

corr_matrix = df_rep[df_rep.columns[:10]].corr()

axis_corr = sns.heatmap(
    corr_matrix,
    vmin=-1, vmax=1, center=0,
    # Hues are angles in [0, 359]; 140 is the in-range equivalent of the
    # previous out-of-range value 500 (500 mod 360).
    cmap=sns.diverging_palette(50, 140, n=500),
    square=False,
)
axis_corr.set_title('Correlation matrix')

plt.show()

In [None]:
# Feature importance: tabulate the coefficients of the first fitted model
# (models[0]) against the feature column names.
coef = models[0].coef_[0]

# Sort by signed coefficient, largest first, so the head of the table holds the
# most positive coefficients (not the largest in absolute value).
feat_df = (
    pd.DataFrame({'Feature Label': X.columns, 'Coefficient': coef})
    .sort_values(by='Coefficient', ascending=False)
)

feat_df.head()


In [None]:
# Bar chart of the coefficient table: one bar per feature label.
feat_df.plot.bar(
    x='Feature Label',
    y='Coefficient',
    title='Feature Coefficients',
)
