In [1]:
import pandas as pd

In [2]:
# Load the dataset from 'framingham.csv' (path is relative to the working directory).
data = pd.read_csv('framingham.csv')

In [3]:
# Preview the first 5 rows of the loaded frame to sanity-check columns and values.
data.head()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
0,1,39,4.0,0,0.0,0.0,0,0,0,195.0,106.0,70.0,26.97,80.0,77.0,0
1,0,46,2.0,0,0.0,0.0,0,0,0,250.0,121.0,81.0,28.73,95.0,76.0,0
2,1,48,1.0,1,20.0,0.0,0,0,0,245.0,127.5,80.0,25.34,75.0,70.0,0
3,0,61,3.0,1,30.0,0.0,0,1,0,225.0,150.0,95.0,28.58,65.0,103.0,1
4,0,46,3.0,1,23.0,0.0,0,0,0,285.0,130.0,84.0,23.1,85.0,85.0,0


In [4]:
# Dataset dimensions as (rows, columns), checked before any cleaning is applied.
data.shape

(4240, 16)

In [5]:
# Count the missing values in each column.
data.isnull().sum()

male                 0
age                  0
education          105
currentSmoker        0
cigsPerDay          29
BPMeds              53
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             50
sysBP                0
diaBP                0
BMI                 19
heartRate            1
glucose            388
TenYearCHD           0
dtype: int64

In [6]:

# Keep only complete rows: drop every row that has a missing value in any column.
# NOTE(review): this also discards rows whose only gaps are in columns that are
# not selected as features for `x` (e.g. education, glucose) — confirm intended.
data=data.dropna(how='any')

In [7]:
# Build the feature data set from the chosen predictor columns.
feature_columns = [
    'male',
    'age',
    'currentSmoker',
    'BPMeds',
    'prevalentStroke',
    'prevalentHyp',
    'diabetes',
    'totChol',
    'sysBP',
]
x = data[feature_columns]

In [8]:
# Pull out the age column on its own so it can be inspected separately.
age = data['age']

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for age.
age.describe()

count    3658.000000
mean       49.551941
std         8.562029
min        32.000000
25%        42.000000
50%        49.000000
75%        56.000000
max        70.000000
Name: age, dtype: float64

In [10]:
# Summary statistics for every column of the cleaned frame.
data.describe()

Unnamed: 0,male,age,education,currentSmoker,cigsPerDay,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP,diaBP,BMI,heartRate,glucose,TenYearCHD
count,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0,3658.0
mean,0.443685,49.551941,1.980317,0.489065,9.025424,0.030344,0.005741,0.311646,0.027064,236.847731,132.370558,82.917031,25.782802,75.730727,81.852925,0.152269
std,0.496886,8.562029,1.022656,0.499949,11.92159,0.171557,0.075561,0.463229,0.162292,44.097681,22.086866,11.974258,4.065601,11.981525,23.904164,0.359331
min,0.0,32.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,113.0,83.5,48.0,15.54,44.0,40.0,0.0
25%,0.0,42.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,206.0,117.0,75.0,23.08,68.0,71.0,0.0
50%,0.0,49.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,234.0,128.0,82.0,25.38,75.0,78.0,0.0
75%,1.0,56.0,3.0,1.0,20.0,0.0,0.0,1.0,0.0,263.0,143.875,90.0,28.0375,82.0,87.0,0.0
max,1.0,70.0,4.0,1.0,70.0,1.0,1.0,1.0,1.0,600.0,295.0,142.5,56.8,143.0,394.0,1.0


In [11]:
# Preview the first rows of the feature data set.
x.head()

Unnamed: 0,male,age,currentSmoker,BPMeds,prevalentStroke,prevalentHyp,diabetes,totChol,sysBP
0,1,39,0,0.0,0,0,0,195.0,106.0
1,0,46,0,0.0,0,0,0,250.0,121.0
2,1,48,1,0.0,0,0,0,245.0,127.5
3,0,61,1,0.0,0,1,0,225.0,150.0
4,0,46,1,0.0,0,0,0,285.0,130.0


In [12]:
# Target: the TenYearCHD column (0/1 values in the rows shown by head()).
y=data['TenYearCHD']
y.head()

0    0
1    0
2    0
3    1
4    0
Name: TenYearCHD, dtype: int64

In [13]:
# Split features and target into training and held-out test sets.
# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20; `train_test_split` is provided by `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# No test_size is passed, so scikit-learn's default split proportion applies;
# random_state=0 makes the shuffle (and therefore the split) reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=0)



In [14]:
# Train the model on the training split.
# NOTE(review): the estimator fitted here is LinearRegression (ordinary least
# squares), not logistic regression, even though the variable is named
# `logreg`. TenYearCHD looks like a 0/1 label, so a classifier such as
# LogisticRegression may have been intended — confirm before relying on the
# outputs as probabilities.
from sklearn.linear_model import LinearRegression
logreg = LinearRegression()
logreg.fit(x_train,y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [19]:
# Fitted coefficients of the linear model, in the same order as the columns of x.
logreg.coef_

array([6.13641735e-02, 6.31229184e-03, 4.16980069e-02, 3.63664315e-02,
       1.78961923e-01, 1.64261671e-02, 1.45995170e-01, 1.24723913e-04,
       2.38897799e-03])

In [21]:
# For a LinearRegression, score() returns R^2 (coefficient of determination),
# evaluated here on the held-out test split.
logreg.score(x_test, y_test)

0.11150407248245764

In [23]:
from sklearn.metrics import mean_squared_error

# Model outputs for the held-out test rows.
y_predict = logreg.predict(x_test)

# scikit-learn's signature is mean_squared_error(y_true, y_pred). MSE is
# symmetric, so the value is unchanged, but passing the ground truth first
# matches the documented order and stays correct if this call is later swapped
# for an asymmetric metric (e.g. r2_score).
regression_model_mse = mean_squared_error(y_test, y_predict)

regression_model_mse

0.11980965240290303

In [24]:
import math

# Root-mean-squared error: the square root of the test-set MSE.
math.sqrt(regression_model_mse)

0.3461353093847882

In [26]:
# Predict the model output for a 65-year-old male who smokes, has hypertension,
# diabetes and a history of stroke, is on BP medication, and has a total
# cholesterol of 250 and a systolic BP of 160.
# The model was fitted on a DataFrame, so the single patient is passed as a
# one-row DataFrame with named columns: each value is tied to its feature name
# instead of relying on positional order, and scikit-learn versions that check
# feature names do not warn. Re-indexing by x.columns guarantees the column
# order seen during fitting.
# NOTE: the estimator is a LinearRegression, so the returned value is not
# constrained to the 0-1 range and is only loosely interpretable as a probability.
patient = pd.DataFrame([{
    'male': 1,
    'age': 65,
    'currentSmoker': 1,
    'BPMeds': 1,
    'prevalentStroke': 1,
    'prevalentHyp': 1,
    'diabetes': 1,
    'totChol': 250,
    'sysBP': 160,
}])[list(x.columns)]
logreg.predict(patient)


array([0.73897814])

In [None]:
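# result: predicted value of about 0.739, read here as a ~73.90% ten-year CHD risk.
# This is a raw linear-regression output (not bounded to 0-1), so treat it as a score rather than a true probability.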

In [31]:
# Tabulate the fitted coefficients, labelling each row with its feature name.
coeff_df = pd.DataFrame({'Coefficient': logreg.coef_}, index=x.columns)
coeff_df

Unnamed: 0,Coefficient
male,0.061364
age,0.006312
currentSmoker,0.041698
BPMeds,0.036366
prevalentStroke,0.178962
prevalentHyp,0.016426
diabetes,0.145995
totChol,0.000125
sysBP,0.002389


In [33]:
# Put the held-out labels next to the model outputs for a row-by-row comparison.
y_pred = logreg.predict(x_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# Render a bounded preview rather than dumping the whole test-set frame.
df.head(10)

Unnamed: 0,Actual,Predicted
3079,1,0.131725
3002,0,0.294882
1707,0,0.095001
3855,0,0.117000
2990,1,0.246967
2891,0,0.293805
1078,0,0.173403
396,0,0.093283
665,0,0.276821
285,0,0.043109
