In [7]:
# import the relevant libraries
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
#from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

## Preprocessing before modelling

In [2]:
# Read the pre-cleaned dataset (path is relative to the notebook's working
# directory) and preview its first five rows.
csv_path = 'data_cleaned.csv'
data = pd.read_csv(csv_path)
data.head(n=5)

Unnamed: 0,Ptid,Persistency_Flag,Gender,Race,Ethnicity,Region,Age_Bucket,Ntm_Speciality,Ntm_Specialist_Flag,Ntm_Speciality_Bucket,...,Risk_Family_History_Of_Osteoporosis,Risk_Low_Calcium_Intake,Risk_Vitamin_D_Insufficiency,Risk_Poor_Health_Frailty,Risk_Excessive_Thinness,Risk_Hysterectomy_Oophorectomy,Risk_Estrogen_Deficiency,Risk_Immobilization,Risk_Recurring_Falls,Count_Of_Risks
0,P1,Persistent,Male,Caucasian,Not Hispanic,West,0,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,0
1,P2,Non-Persistent,Male,Asian,Not Hispanic,West,2,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,0
2,P3,Non-Persistent,Female,Other/Unknown,Hispanic,Midwest,1,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,1,0,0,0,0,0,0,0,2
3,P4,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,0,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,1
4,P5,Non-Persistent,Female,Caucasian,Not Hispanic,Midwest,0,GENERAL PRACTITIONER,Others,OB/GYN/Others/PCP/Unknown,...,0,0,0,0,0,0,0,0,0,1


In [3]:
# Notes on cleaning
# =====================
# drop the Ptid column (it is a per-patient identifier, not a feature)
# encode the target: Persistent = 1, Non-Persistent = -1
# one-hot encode the remaining object (string) columns

In [4]:
# Look at the first 20 columns only and report how many distinct values each
# one holds, to see which columns are binary and which are multi-valued.
subset = data.iloc[:, :20]

# DataFrame.nunique() yields one distinct-value count per column; print the
# column name and its count on separate lines.
for col, n_unique in subset.nunique().items():
    print(col)
    print(n_unique)

Ptid
3424
Persistency_Flag
2
Gender
2
Race
4
Ethnicity
3
Region
5
Age_Bucket
4
Ntm_Speciality
7
Ntm_Specialist_Flag
2
Ntm_Speciality_Bucket
3
Gluco_Record_Prior_Ntm
2
Gluco_Record_During_Rx
2
Dexa_Freq_During_Rx
58
Dexa_During_Rx
2
Frag_Frac_Prior_Ntm
2
Frag_Frac_During_Rx
2
Risk_Segment_Prior_Ntm
2
Tscore_Bucket_Prior_Ntm
2
Adherent_Flag
2
Idn_Indicator
2


In [5]:
# Turn the cleaned frame into a fully numeric, model-ready frame:
#   1. drop Ptid, the patient identifier;
#   2. encode the target as Persistent -> 1, Non-Persistent -> -1;
#   3. one-hot encode every remaining object (string) column.
#
# Every step returns a new frame instead of mutating in place. The previous
# `data['Persistency_Flag'].replace(..., inplace=True)` was a chained in-place
# call on a column selection: pandas 2.2 warns about it and, with Copy-on-Write
# enabled, it leaves `data` unchanged, so the string labels would be one-hot
# encoded along with the features.
#
# NOTE: this cell rebinds `data`, so it must run exactly once after the load
# cell; a second run fails with a KeyError on the missing Ptid column.
persistency_codes = {'Persistent': 1, 'Non-Persistent': -1}

data = (
    data
    .drop(columns='Ptid')
    .assign(
        Persistency_Flag=lambda frame: frame['Persistency_Flag'].map(persistency_codes)
    )
    # dtype is pinned so the indicator columns are uint8 regardless of the
    # installed pandas version (newer releases default to bool).
    .pipe(pd.get_dummies, dtype='uint8')
)

# Series.map turns any label missing from the mapping into NaN; fail loudly
# here rather than passing a corrupted target to the model.
assert data['Persistency_Flag'].isin([-1, 1]).all(), \
    'Persistency_Flag contains a label other than Persistent / Non-Persistent'

data.head()

Unnamed: 0,Persistency_Flag,Age_Bucket,Gluco_Record_Prior_Ntm,Gluco_Record_During_Rx,Dexa_Freq_During_Rx,Dexa_During_Rx,Frag_Frac_Prior_Ntm,Frag_Frac_During_Rx,Risk_Segment_Prior_Ntm,Tscore_Bucket_Prior_Ntm,...,Ntm_Speciality_OTHER,Ntm_Speciality_RHEUMATOLOGY,Ntm_Speciality_Unknown,Ntm_Specialist_Flag_Others,Ntm_Specialist_Flag_Specialist,Ntm_Speciality_Bucket_Endo/Onc/Uro,Ntm_Speciality_Bucket_OB/GYN/Others/PCP/Unknown,Ntm_Speciality_Bucket_Rheum,Adherent_Flag_Adherent,Adherent_Flag_Non-Adherent
0,1,0,0,0,0,0,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
1,-1,2,0,0,0,0,0,0,1,1,...,0,0,0,1,0,0,1,0,1,0
2,-1,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0
3,-1,0,0,1,0,0,0,0,0,1,...,0,0,0,1,0,0,1,0,1,0
4,-1,0,1,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,1,0


In [6]:
# Show the dtype of every column to check the column types after the target
# encoding and one-hot encoding performed in the previous cell.
data.dtypes

Persistency_Flag                                   int64
Age_Bucket                                         int64
Gluco_Record_Prior_Ntm                             int64
Gluco_Record_During_Rx                             int64
Dexa_Freq_During_Rx                                int64
                                                   ...  
Ntm_Speciality_Bucket_Endo/Onc/Uro                 uint8
Ntm_Speciality_Bucket_OB/GYN/Others/PCP/Unknown    uint8
Ntm_Speciality_Bucket_Rheum                        uint8
Adherent_Flag_Adherent                             uint8
Adherent_Flag_Non-Adherent                         uint8
Length: 84, dtype: object

In [8]:
# Class balance of the encoded target (-1 = Non-Persistent, 1 = Persistent).
data['Persistency_Flag'].value_counts()

-1    2135
 1    1289
Name: Persistency_Flag, dtype: int64

In [19]:
# (rows, columns) of the encoded frame; the column count includes the target.
data.shape

(3424, 84)

## Modelling

In [9]:
# Separate the target column from the predictor columns.
target_col = 'Persistency_Flag'
y = data[target_col]
X = data.drop(columns=target_col)

# Shuffle and hold out a test partition. No test_size is passed, so
# scikit-learn's default split proportion applies; the fixed seed keeps the
# partition repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=1,
)

In [10]:
# Support vector classifier with a linear kernel
# ==============================================
# Settings are collected in one place so they are easy to spot and tweak.
svm_settings = {'kernel': 'linear', 'random_state': 123}

model = SVC(**svm_settings)
# Fit on the training partition; leaving the call as the last expression keeps
# the fitted estimator's repr as the cell output.
model.fit(X_train, y_train)

SVC(kernel='linear', random_state=123)

In [12]:
# Predict a label for every row of the held-out test partition using the
# fitted model.
predictions = model.predict(X_test)

In [18]:
# Compare the predicted labels with the true test labels.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

# Report the score as a percentage with two decimals.
report_line = f'Accuracy upon the test data is {100*accuracy:.2f} %'
print(report_line)

Accuracy upon the test data is 83.53 %
