# Apply logistic regression to categorize whether a county had high mortality rate due to contamination

## 1. Import the necessary packages to read in the data, plot, and create a logistic regression model

In [1]:
import pandas as pd
%matplotlib inline
import numpy as np
from sklearn.linear_model import LogisticRegression

## 2. Read in the hanford.csv file in the `data/` folder 

In [5]:
df = pd.read_csv("hanford.csv")
df.head()

Unnamed: 0,County,Exposure,Mortality
0,Umatilla,2.49,147.1
1,Morrow,2.57,130.1
2,Gilliam,3.41,129.9
3,Sherman,1.25,113.5
4,Wasco,1.62,137.5


<img src="../../images/hanford_variables.png"></img>

## 3. Calculate the basic descriptive statistics on the data

In [6]:
df.describe()

Unnamed: 0,Exposure,Mortality
count,9.0,9.0
mean,4.617778,157.344444
std,3.491192,34.791346
min,1.25,113.5
25%,2.49,130.1
50%,3.41,147.1
75%,6.41,177.9
max,11.64,210.3


In [7]:
df.median()

Exposure       3.41
Mortality    147.10
dtype: float64

In [8]:
rang= df['Mortality'].max() - df['Mortality'].min()
rang

96.800000000000011

In [10]:
iqr_m = df['Mortality'].quantile(q=0.75)- df['Mortality'].quantile(q=0.25)
iqr_m

47.800000000000011

In [11]:
iqr_e = df['Exposure'].quantile(q=0.75)- df['Exposure'].quantile(q=0.25)
iqr_e

3.9199999999999999

In [12]:
UAL_m= (iqr_m*1.5) + df['Mortality'].quantile(q=0.75)
UAL_m

249.60000000000002

In [17]:
UAL_e= (iqr_m*1.5) + df['Exposure'].quantile(q=0.75)
UAL_e

78.110000000000014

In [13]:
LAL_m= df['Mortality'].quantile(q=0.25) - (iqr_e*1.5)  
LAL_m

124.22

In [14]:
LAL_e= df['Exposure'].quantile(q=0.25) - (iqr_e*1.5)  
LAL_e

-3.3899999999999997

In [15]:
len(df[df['Mortality']> UAL_m]) 

0

In [18]:
len(df[df['Exposure']> UAL_e]) 

0

In [19]:
len(df[df['Mortality']< LAL_m]) 

1

In [20]:
len(df[df['Mortality'] > UAL_m])

0

## 4. Find a reasonable threshold to say exposure is high and recode the data 

## 5. Create a logistic regression model

In [41]:
lm = LogisticRegression()

In [49]:
data = np.asarray(df[['Mortality','Exposure']])
x = data[:,1:]
y = data[:,0]

In [50]:
data

array([[ 147.1 ,    2.49],
       [ 130.1 ,    2.57],
       [ 129.9 ,    3.41],
       [ 113.5 ,    1.25],
       [ 137.5 ,    1.62],
       [ 162.3 ,    3.83],
       [ 207.5 ,   11.64],
       [ 177.9 ,    6.41],
       [ 210.3 ,    8.34]])

In [44]:
x


array([[  2.49],
       [  2.57],
       [  3.41],
       [  1.25],
       [  1.62],
       [  3.83],
       [ 11.64],
       [  6.41],
       [  8.34]])

In [45]:
y

array([ 147.1,  130.1,  129.9,  113.5,  137.5,  162.3,  207.5,  177.9,
        210.3])

In [48]:
lm.fit(x,y)

TypeError: fit() missing 1 required positional argument: 'y'

In [35]:
lm.coef_

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [36]:
lm.score(x,y)

NotFittedError: This LogisticRegression instance is not fitted yet

In [37]:
slope = lm.coef_[0]

AttributeError: 'LogisticRegression' object has no attribute 'coef_'

In [38]:
intercept = lm.intercept_

AttributeError: 'LogisticRegression' object has no attribute 'intercept_'

## 6. Predict whether the mortality rate (Cancer per 100,000 man years) will be high at an exposure level of 50

In [40]:
lm.predict(50)

NotFittedError: This LogisticRegression instance is not fitted yet