In [63]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [64]:
# Read the dataset CSV (path is relative to the notebook's working directory)
# into the `train` frame that the following cells operate on.
dataset_path = 'cleaned_dataset.csv'
train = pd.read_csv(dataset_path)

In [65]:
# Preview the first five rows of the loaded frame.
train.head(n=5)

Unnamed: 0,Ptid,Persistency_Flag,Gender,Race,Ethnicity,Region,Age_Bucket,Ntm_Speciality,Ntm_Specialist_Flag,Ntm_Speciality_Bucket,...,Risk_Family_History_Of_Osteoporosis,Risk_Low_Calcium_Intake,Risk_Vitamin_D_Insufficiency,Risk_Poor_Health_Frailty,Risk_Excessive_Thinness,Risk_Hysterectomy_Oophorectomy,Risk_Estrogen_Deficiency,Risk_Immobilization,Risk_Recurring_Falls,Count_Of_Risks
0,P1,Persistent,Male,Caucasian,Not Hispanic,West,0,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,0
1,P2,Non-Persistent,Male,Asian,Not Hispanic,West,2,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,0
2,P3,Non-Persistent,Female,Caucasian,Hispanic,Midwest,1,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,1,0,0,0,0,0,0,0,2
3,P4,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,0,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,1
4,P5,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,0,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,1


In [66]:
# Summarise the raw frame: row count, column names, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 66 columns):
 #   Column                                                              Non-Null Count  Dtype 
---  ------                                                              --------------  ----- 
 0   Ptid                                                                3424 non-null   object
 1   Persistency_Flag                                                    3424 non-null   object
 2   Gender                                                              3424 non-null   object
 3   Race                                                                3424 non-null   object
 4   Ethnicity                                                           3424 non-null   object
 5   Region                                                              3424 non-null   object
 6   Age_Bucket                                                          3424 non-null   int64 
 7   Ntm_Speciality          

In [67]:
# List the distinct speciality buckets before assigning integer codes to them.
train['Ntm_Speciality_Bucket'].unique()

array(['OB/GYN/Others/PCP/Unknown', 'Endo/Onc/Uro', 'Rheum'], dtype=object)

In [68]:
# Integer-encode the three speciality buckets listed by unique() above.
#
# Series.map builds the numeric column directly. The previous replace() call
# only ended up as int64 because pandas silently downcast the object-dtype
# result, a behaviour that is deprecated (FutureWarning since pandas 2.2).
# dict.get(value, value) passes any value without a code through untouched,
# exactly as replace() did, so re-running this cell remains a no-op.
speciality_bucket_codes = {'OB/GYN/Others/PCP/Unknown': 0, 'Endo/Onc/Uro': 1, 'Rheum': 2}
train['Ntm_Speciality_Bucket'] = train['Ntm_Speciality_Bucket'].map(
    lambda bucket: speciality_bucket_codes.get(bucket, bucket)
)

In [70]:
# List the distinct T-score change labels before assigning integer codes to them.
train['Change_T_Score'].unique()

array(['No change', 'Unknown', 'Worsened', 'Improved'], dtype=object)

In [71]:
# Integer-encode the four T-score change labels listed by unique() above.
#
# Series.map builds the numeric column directly. The previous replace() call
# only ended up as int64 because pandas silently downcast the object-dtype
# result, a behaviour that is deprecated (FutureWarning since pandas 2.2).
# dict.get(value, value) passes any value without a code through untouched,
# exactly as replace() did, so re-running this cell remains a no-op.
#
# NOTE(review): 'Unknown' is coded 1, between 'No change' (0) and 'Worsened' (2).
# A linear model will read these codes as an ordered scale - confirm that is
# intended, or one-hot encode this column instead.
t_score_change_codes = {'No change': 0, 'Unknown': 1, 'Worsened': 2, 'Improved': 3}
train['Change_T_Score'] = train['Change_T_Score'].map(
    lambda change: t_score_change_codes.get(change, change)
)

In [72]:
# List the distinct regions before assigning integer codes to them.
train['Region'].unique()

array(['West', 'Midwest', 'South', 'Northeast'], dtype=object)

In [73]:
# Integer-encode the four regions listed by unique() above.
#
# Series.map builds the numeric column directly. The previous replace() call
# only ended up as int64 because pandas silently downcast the object-dtype
# result, a behaviour that is deprecated (FutureWarning since pandas 2.2).
# dict.get(value, value) passes any value without a code through untouched,
# exactly as replace() did, so re-running this cell remains a no-op.
#
# NOTE(review): 'Region' is included in the column list passed to train.drop(...)
# in a later cell, so this encoding never reaches the model - confirm whether
# Region was meant to be kept as a feature.
region_codes = {'West': 0, 'Midwest': 1, 'South': 2, 'Northeast': 3}
train['Region'] = train['Region'].map(
    lambda region: region_codes.get(region, region)
)

In [74]:
# List the distinct race labels before assigning integer codes to them.
train['Race'].unique()

array(['Caucasian', 'Asian', 'African American'], dtype=object)

In [75]:
# Integer-encode the three race labels listed by unique() above.
#
# Series.map builds the numeric column directly. The previous replace() call
# only ended up as int64 because pandas silently downcast the object-dtype
# result, a behaviour that is deprecated (FutureWarning since pandas 2.2).
# dict.get(value, value) passes any value without a code through untouched,
# exactly as replace() did, so re-running this cell remains a no-op.
#
# NOTE(review): these codes are arbitrary labels, but a linear model will read
# them as an ordered scale - confirm that is intended, or one-hot encode instead.
race_codes = {'Caucasian': 0, 'Asian': 1, 'African American': 2}
train['Race'] = train['Race'].map(
    lambda race: race_codes.get(race, race)
)

In [76]:
# One-hot encode the five flag-style text columns, one frame per source column.
# drop_first=True omits the indicator for each column's first level, so a
# two-level column produces a single indicator column.
# The order of the names on the left matches the order of the columns on the right.
gender, adherent, persistent, ethnicity, specialist = (
    pd.get_dummies(train[source_column], drop_first=True)
    for source_column in (
        'Gender',
        'Adherent_Flag',
        'Persistency_Flag',
        'Ethnicity',
        'Ntm_Specialist_Flag',
    )
)

In [77]:
# Remove the text columns that were dummy-encoded in the previous cell, together
# with Ptid, Ntm_Speciality and Region.
#
# The result is reassigned rather than mutated with inplace=True, and the
# columns= keyword replaces the positional list + axis=1, so the intent is
# explicit and the statement can be chained.
#
# NOTE(review): 'Region' was integer-encoded in an earlier cell but is discarded
# here - confirm that excluding it from the model is intentional.
columns_to_discard = [
    'Gender',
    'Adherent_Flag',
    'Persistency_Flag',
    'Ethnicity',
    'Region',
    'Ptid',
    'Ntm_Speciality',
    'Ntm_Specialist_Flag',
]
train = train.drop(columns=columns_to_discard)

In [78]:
# Append the five dummy-encoded frames to the remaining columns, side by side.
dummy_frames = [gender, adherent, persistent, ethnicity, specialist]
train = pd.concat([train, *dummy_frames], axis='columns')

In [80]:
# Re-inspect the frame after encoding: column count, non-null counts and dtypes.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3424 entries, 0 to 3423
Data columns (total 63 columns):
 #   Column                                                              Non-Null Count  Dtype
---  ------                                                              --------------  -----
 0   Race                                                                3424 non-null   int64
 1   Age_Bucket                                                          3424 non-null   int64
 2   Ntm_Speciality_Bucket                                               3424 non-null   int64
 3   Gluco_Record_Prior_Ntm                                              3424 non-null   int64
 4   Gluco_Record_During_Rx                                              3424 non-null   int64
 5   Dexa_Freq_During_Rx                                                 3424 non-null   int64
 6   Dexa_During_Rx                                                      3424 non-null   int64
 7   Frag_Frac_Prior_Ntm              

In [40]:
from sklearn.model_selection import train_test_split

In [82]:
# Separate the target from the predictors: 'Persistent' is the label column,
# every other column of `train` is used as a feature.
target_name = 'Persistent'
y = train[target_name]
X = train.drop(columns=[target_name])

In [95]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split repeatable between runs.
# NOTE(review): the split is not stratified on y - confirm that is acceptable
# for the class balance of this dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [104]:
# Print the shape of each split, one per line, in the order
# X_train, y_train, X_test, y_test.
for split in (X_train, y_train, X_test, y_test):
    print(split.shape)

(2396, 62)
(2396,)
(1028, 62)
(1028,)


In [96]:
from sklearn.linear_model import LogisticRegression

In [97]:
# Logistic-regression classifier; the solver is allowed up to 1000 iterations.
logmodel = LogisticRegression(max_iter=1000)

In [98]:
# Fit the classifier on the training split.
logmodel.fit(X=X_train, y=y_train)

LogisticRegression(max_iter=1000)

In [99]:
# Predict class labels for the held-out rows.
predictions = logmodel.predict(X=X_test)

In [100]:
from sklearn.metrics import classification_report

In [101]:
# Per-class precision / recall / F1 on the held-out rows, printed as plain text.
report_text = classification_report(y_true=y_test, y_pred=predictions)
print(report_text)

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       654
           1       0.75      0.69      0.72       374

    accuracy                           0.80      1028
   macro avg       0.79      0.78      0.78      1028
weighted avg       0.80      0.80      0.80      1028



In [102]:
from sklearn.metrics import confusion_matrix

In [103]:
# Confusion matrix for the held-out rows (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=predictions)

array([[568,  86],
       [116, 258]])