# Logistic Regression Using Mammography Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LogisticRegression
# train_test_split is provided by sklearn.model_selection; the older
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn import metrics

# Shared objects used by the later cells:
#   regression - the logistic-regression estimator (library defaults)
#   accuracy   - alias for the accuracy scoring function
regression = LogisticRegression()
accuracy = metrics.accuracy_score



In [2]:
# Load the mammographic-masses file from the UCI repository. It is read with
# header=None (no header row is consumed), so the column labels are supplied
# explicitly.
source_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/mammographic-masses/mammographic_masses.data"
column_labels = ["BI-RADS Assessment", "Age", "Shape", "Margin", "Density", "Severity"]
data = pd.read_csv(source_url, header=None, names=column_labels)

# Size of the raw frame as (rows, columns).
print(data.shape)

(961, 6)


In [3]:
# Remove every row that has a missing value.
# "?" entries are first mapped to NaN so that dropna() can see them.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and the
# chained form avoids mutating the frame in place.
data = data.replace("?", np.nan).dropna()

data.head()

Unnamed: 0,BI-RADS Assessment,Age,Shape,Margin,Density,Severity
0,5,67,3,5,3,1
2,5,58,4,5,3,1
3,4,28,1,1,3,0
8,5,57,1,5,3,1
10,5,76,1,4,3,1


In [4]:
# Predictors: the five columns after the first one
# (Age, Shape, Margin, Density, Severity), selected by position.
# .iloc replaces .ix, which was removed in pandas 1.0.
features = data.iloc[:, 1:6]
x_data = features
# Response: the BI-RADS assessment column.
y_data = data["BI-RADS Assessment"]
print(x_data.shape)
print(y_data.shape)

(830, 5)
(830,)


In [5]:
# Hold out 40% of the rows for testing. Fixing random_state makes the shuffle,
# and therefore every number computed from the split, repeatable across runs.
x_train_data, x_test_data, y_train_data, y_test_data = train_test_split(
    x_data, y_data, test_size=0.40, random_state=42
)

In [6]:
# Print the dimensions of each part of the train/test split.
# Labels are plain strings (not one-element lists) so the output reads as
# "label (rows, cols)" without list brackets and quotes.
split_shapes = [
    ("Observations, Features for Training Set:", x_train_data.shape),
    ("Response vector for Training Set:", y_train_data.shape),
    ("Observations, Features for Testing Set:", x_test_data.shape),
    ("Response vector for Testing Set:", y_test_data.shape),
]

for label, shape in split_shapes:
    print(label, shape)

['Observations, Features for Training Set:'] (498, 5)
['Response vector for Training Set:'] (498,)
['Observations, Features for Testing Set:'] (332, 5)
['Response vector for Testing Set:'] (332,)


In [7]:
regression.fit(x_data, y_data)
regression.predict(x_data)
response_prediction = regression.predict(x_data)
# len(response_prediction)

In [8]:
print(accuracy(y_data, response_prediction))

0.802409638554


In [9]:
# Try the fitted model on a single hand-entered observation.
# Values are given in the predictor column order:
# Age, Shape, Margin, Density, Severity.
out_of_sample_data = np.array([72, 4, 3, 3, 0])
# Wrap the row in a DataFrame that reuses the training column names, so the
# estimator receives the same feature names it was fitted with.
measurements = pd.DataFrame(out_of_sample_data.reshape(1, -1), columns=x_data.columns)
predicted_assessment = regression.predict(measurements)
print("BI-RADS Assessment:", predicted_assessment)

BI-RADS Assessment: ['4']


In [10]:
# Show the rows whose index labels lie between 142 and 240 (inclusive).
# Labels of rows dropped for missing values are simply absent from the result.
# .loc replaces .ix, which was removed in pandas 1.0.
data.loc[142:240, :]

Unnamed: 0,BI-RADS Assessment,Age,Shape,Margin,Density,Severity
143,4,47,1,1,2,0
145,5,62,4,5,3,1
146,5,63,4,4,3,1
148,4,71,4,4,3,1
149,4,41,1,1,3,0
150,5,57,4,4,4,1
151,5,71,4,4,4,1
152,4,66,1,1,3,0
153,4,47,2,4,2,0
154,3,34,4,4,3,0
