In [4]:
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import pandas as pd

In [5]:
model_params = {
    'svm': {
        'model': svm.SVC(gamma='auto'),
        'params' : {
            'C': [1,10,20],
            'kernel': ['rbf','linear']
        }  
    },
    'random_forest': {
        'model': RandomForestClassifier(),
        'params' : {
            'n_estimators': [1,5,10]
        }
    },
    'logistic_regression' : {
        'model': LogisticRegression(solver='liblinear',multi_class='auto'),
        'params': {
            'C': [1,5,10]
        }
    },
    'naive_bayes_gaussian': {
        'model': GaussianNB(),
        'params': {}
    },
    'naive_bayes_multinomial': {
        'model': MultinomialNB(),
        'params': {}
    },
    'decision_tree': {
        'model': DecisionTreeClassifier(),
        'params': {
            'criterion': ['gini','entropy'],
            
        }
    }     
}

In [6]:
model_data = pd.read_csv("cleaned_data.csv")
data = model_data.drop(['Target','patientId'], axis =1)
target = model_data['Target']

In [35]:
model_data.head()

Unnamed: 0,patientId,Target,PatientAge,PatientSex_F,PatientSex_M,ViewPosition_AP,ViewPosition_PA
0,0004cfab-14fd-4e49-80ba-63a80b6bddd6,0,51.0,1,0,0,1
1,00313ee0-9eaa-42f4-b0ab-c148ed3241cd,0,48.0,1,0,0,1
2,00322d4d-1c29-4943-afc9-b6754be640eb,0,19.0,0,1,1,0
3,003d8fa0-6bf1-40ed-b54c-ac657f8495c5,0,28.0,0,1,0,1
4,00436515-870c-4b36-a041-de91049b9ab4,1,32.0,1,0,1,0


In [7]:
model_data.iloc[0]

patientId          0004cfab-14fd-4e49-80ba-63a80b6bddd6
Target                                                0
PatientAge                                         51.0
PatientSex_F                                          1
PatientSex_M                                          0
ViewPosition_AP                                       0
ViewPosition_PA                                       1
Name: 0, dtype: object

In [8]:
data
target

0       0
1       0
2       0
3       0
4       1
       ..
4996    0
4997    0
4998    0
4999    0
5000    0
Name: Target, Length: 5001, dtype: int64

In [9]:
from sklearn.model_selection import GridSearchCV
import pandas as pd
scores = []


for model_name, mp in model_params.items():
    clf =  GridSearchCV(mp['model'], mp['params'], cv=5, return_train_score=False)
    clf.fit(data, target)
    scores.append({
        'model': model_name,
        'best_score': clf.best_score_,
        'best_params': clf.best_params_
    })
    
df = pd.DataFrame(scores,columns=['model','best_score','best_params'])
df

Unnamed: 0,model,best_score,best_params
0,svm,0.730257,"{'C': 1, 'kernel': 'linear'}"
1,random_forest,0.70426,{'n_estimators': 10}
2,logistic_regression,0.730257,{'C': 1}
3,naive_bayes_gaussian,0.730257,{}
4,naive_bayes_multinomial,0.730257,{}
5,decision_tree,0.70846,{'criterion': 'entropy'}


In [10]:
svm_model=svm.SVC(gamma='auto',C=1,kernel='linear',probability=True, random_state=0)

In [11]:
svm_model.fit(data, target)

In [12]:
svm_model.score(data,target)

0.730253949210158

In [13]:
from sklearn.model_selection import train_test_split
X_train,X_test,Y_train,Y_test=train_test_split(data,target,test_size=0.3)

In [14]:
svm_model.score(X_test,Y_test)

0.7295136575616256

In [15]:
svm_model.predict(X_test)

array([1, 0, 0, ..., 0, 1, 1], dtype=int64)

In [16]:
test_point=[[90,0,1,1,0],[60,1,0,0,1],[23,1,0,0,1],[12,0,1,1,0]]

In [17]:
test_point2=[[170,1,0,0,1]]

In [18]:
svm_model.predict(test_point)



array([1, 0, 0, 1], dtype=int64)

In [19]:
data

Unnamed: 0,PatientAge,PatientSex_F,PatientSex_M,ViewPosition_AP,ViewPosition_PA
0,51.0,1,0,0,1
1,48.0,1,0,0,1
2,19.0,0,1,1,0
3,28.0,0,1,0,1
4,32.0,1,0,1,0
...,...,...,...,...,...
4996,51.0,0,1,1,0
4997,63.0,1,0,1,0
4998,72.0,1,0,1,0
4999,61.0,1,0,0,1


In [20]:
Y_test

451     1
4805    0
1131    1
577     1
4053    1
       ..
392     0
2896    1
2452    1
4141    1
4000    1
Name: Target, Length: 1501, dtype: int64

In [29]:
import pickle
with open('svm_model_pickle_proba_final5k','wb') as f:
    pickle.dump(svm_model,f)
    

In [30]:
import pickle
with open('svm_model_pickle_proba_final5k','rb') as f:
    saved_model_svm=pickle.load(f)

In [31]:
saved_model_svm.predict(test_point)



array([1, 0, 0, 1], dtype=int64)

In [24]:
test_point3=[[13,1,0,0,1]]

In [25]:
svm_model.predict_proba(test_point3)



array([[0.77112356, 0.22887644]])

In [32]:
saved_model_svm.predict_proba(test_point)



array([[0.30699061, 0.69300939],
       [0.77112356, 0.22887644],
       [0.77112356, 0.22887644],
       [0.30699061, 0.69300939]])

In [33]:
saved_model_svm.predict_proba(test_point2)



array([[0.77112356, 0.22887644]])