In [61]:
import pandas as pd
from sklearn.metrics import mean_squared_error, confusion_matrix, accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
import numpy as np
import itertools
import math

#visualizing results
%matplotlib inline
import seaborn as sns
import matplotlib.pyplot as plt

from IPython.core.display import display, Image

In [62]:
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"

names = ['BI-RADS','Age','Shape','Margin','Density','Severity']

mamm_df = pd.read_csv(url, sep = ",", names = names)

mamm_df = mamm_df.replace('?', np.nan)
mamm_df = mamm_df.dropna()
mamm_df = mamm_df.reset_index(drop=True)
mamm_df = mamm_df.apply(pd.to_numeric)

print("max BI-RADS 55 is in row: ", mamm_df['BI-RADS'].idxmax())
display(mamm_df[250:260])
mamm_df = mamm_df.drop(257,axis=0)
mamm_df = mamm_df.reset_index(drop=True)
display(mamm_df[250:260])
print("BI-RADS value counts: \n",mamm_df['BI-RADS'].value_counts())

display(mamm_df.describe())

max BI-RADS 55 is in row:  257


Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
250,5,54,4,5,3,1
251,5,55,4,3,3,1
252,4,64,4,4,3,0
253,5,67,4,5,3,1
254,5,75,4,3,3,1
255,5,87,4,4,3,1
256,4,46,4,4,3,1
257,55,46,4,3,3,1
258,5,61,1,1,3,1
259,4,44,1,4,3,0


Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
250,5,54,4,5,3,1
251,5,55,4,3,3,1
252,4,64,4,4,3,0
253,5,67,4,5,3,1
254,5,75,4,3,3,1
255,5,87,4,4,3,1
256,4,46,4,4,3,1
257,5,61,1,1,3,1
258,4,44,1,4,3,0
259,4,32,1,1,3,0


BI-RADS value counts: 
 4    468
5    316
3     24
6      9
2      7
0      5
Name: BI-RADS, dtype: int64


Unnamed: 0,BI-RADS,Age,Shape,Margin,Density,Severity
count,829.0,829.0,829.0,829.0,829.0,829.0
mean,4.332931,55.793727,2.780458,2.813028,2.915561,0.484922
std,0.68816,14.676698,1.242389,1.568107,0.351136,0.500074
min,0.0,18.0,1.0,1.0,1.0,0.0
25%,4.0,46.0,2.0,1.0,3.0,0.0
50%,4.0,57.0,3.0,3.0,3.0,0.0
75%,5.0,66.0,4.0,4.0,3.0,1.0
max,6.0,96.0,4.0,5.0,4.0,1.0


In [63]:
X = mamm_df.ix[:,1:5].values
y = mamm_df.ix[:,5].values

In [64]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

print("X_train shape: ", X_train.shape)
print("y_train shape: ", y_train.shape)

gnb = GaussianNB()

model = gnb.fit(X, y)

predictions = gnb.predict(X)

print("Predictions: \n", predictions)

X_train shape:  (621, 4)
y_train shape:  (621,)
Predictions: 
 [1 1 0 0 0 0 0 0 0 1 0 0 0 1 1 1 1 1 0 1 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 1 1
 1 0 0 0 0 1 1 1 0 0 0 1 0 1 1 0 1 0 1 1 0 1 1 0 0 1 1 1 0 0 1 1 1 0 0 1 1
 1 1 1 1 1 1 1 1 0 1 1 0 0 1 1 1 0 1 1 0 0 1 1 1 0 0 0 1 0 0 1 1 0 1 1 0 1
 0 1 1 1 1 0 0 1 1 1 1 0 1 0 0 0 1 0 1 0 0 0 1 1 1 1 1 0 1 0 0 0 1 0 1 0 0
 0 0 1 1 0 1 1 0 0 1 1 1 0 1 1 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 1 1 1 0 1
 1 0 0 1 1 1 1 1 1 1 1 1 0 1 0 1 0 0 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 0
 0 0 0 0 0 0 0 1 1 1 1 1 0 1 0 1 1 1 0 1 0 0 0 0 1 1 1 0 1 1 1 1 1 1 1 0 0
 0 0 1 0 1 1 0 0 0 0 1 0 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0 1 0 1 1 1 0 1 1 0 0
 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 0 0 0 1 1 0
 1 1 0 0 1 0 1 1 1 1 1 0 1 1 0 0 1 0 0 0 1 1 1 0 0 0 1 1 1 0 1 0 0 1 1 0 1
 0 1 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 1 1 1 1 0 1 1 0 1 1 0 0 0 0 0 1 1 1 1 0
 0 1 1 1 0 0 1 0 1 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0 1 1 1 1 0 0 0
 1 0 1 1 1 1 1 1 1 0 1 0 0 1 1 0 0 1 

In [65]:
classification_report = classification_report(y_test, gnb.predict(X_test))

print("report :\n", classification_report)

report :
              precision    recall  f1-score   support

          0       0.86      0.69      0.77       114
          1       0.70      0.86      0.77        94

avg / total       0.79      0.77      0.77       208



In [70]:
print(accuracy_score(y, predictions))

0.793727382388
