In [1]:
import numpy as np
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
patients = pd.read_csv('/Users/yousef/Downloads/RobotDoc-main 3/db/admissions.csv')

In [2]:
symptoms = patients['symptoms']
diagnoses = patients['diagnoses']
X = pd.concat([symptoms, diagnoses] , axis=1)
X = X[X['symptoms'].notna()]
X = X.replace(np.nan, '', regex=True)
X['symptoms'] = X['symptoms'].apply(lambda x: str(x).split(';'))
X['symptoms'] = X['symptoms'].apply(lambda x: ' '.join(x))

In [3]:
z = X['diagnoses']
mlb = MultiLabelBinarizer(sparse_output=True)
diagnoses = pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(z.str.split(';')),
                                          columns=mlb.classes_)

mlb = MultiLabelBinarizer(sparse_output=True, classes = mlb.classes_)
diagnoses = pd.DataFrame.sparse.from_spmatrix(mlb.fit_transform(z.str.split(';')),
                                          columns=mlb.classes_)
joblib.dump(mlb, "/Users/yousef/Downloads/RobotDoc-main/encoders/diagnoses_enc.pkl")
print(diagnoses)





          0030  0031  0038  0041  0051  00581  0059  0071  0074  ...  99932  \
0      0     0     0     0     0     0      0     0     0     0  ...      0   
1      0     0     0     0     0     0      0     0     0     0  ...      0   
2      0     0     0     0     0     0      0     0     0     0  ...      0   
3      0     0     0     0     0     0      0     0     0     0  ...      0   
4      0     0     0     0     0     0      0     0     0     0  ...      0   
...   ..   ...   ...   ...   ...   ...    ...   ...   ...   ...  ...    ...   
26334  0     0     0     0     0     0      0     0     0     0  ...      0   
26335  0     0     0     0     0     0      0     0     0     0  ...      0   
26336  0     0     0     0     0     0      0     0     0     0  ...      0   
26337  0     0     0     0     0     0      0     0     0     0  ...      0   
26338  0     0     0     0     0     0      0     0     0     0  ...      0   

       99933  99939  99941  9995  9998  99982  9998

In [5]:
xtrain, xval, ytrain, yval = train_test_split(X['symptoms'], diagnoses, test_size=0.2, random_state=99)

vectorizer = CountVectorizer()

LogReg_pipeline = Pipeline([
    ('vectorizer', vectorizer),
    ('clf', OneVsRestClassifier(LogisticRegression(solver='sag'), 
                                n_jobs=-1))
                           ])

LogReg_pipeline.fit(xtrain, ytrain)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('clf',
                 OneVsRestClassifier(estimator=LogisticRegression(solver='sag'),
                                     n_jobs=-1))])

In [10]:
joblib.dump(LogReg_pipeline, '/Users/yousef/Downloads/RobotDoc-main/encoders/LogReg_pipeline.joblib') 

['/Users/yousef/Downloads/RobotDoc-main/encoders/LogReg_pipeline.joblib']

In [6]:
y_pred = LogReg_pipeline.predict(xval)

In [7]:
f1_score(yval, y_pred, average="micro")

0.07164218129524619

In [8]:
y_pred_prob = LogReg_pipeline.predict_proba(xval)
t = 0.1
# threshold value
y_pred_new = (y_pred_prob >= t).astype(int)
f1_score(yval, y_pred_new, average="micro")

0.2480447650160106

In [9]:
print("Predicted")
print(mlb.inverse_transform(y_pred_new)[7])
print("-----------------------------------------------")
print("expected")
print(mlb.inverse_transform(yval.to_numpy())[7])

Predicted
('0389', '2449', '25000', '2639', '2753', '2761', '2762', '2767', '2851', '2859', '2875', '2930', '311', '4019', '40390', '40391', '4168', '42731', '42789', '4280', '486', '49390', '5119', '51881', '53081', '53789', '5601', '5715', '5722', '5723', '5845', '5849', '5856', '5859', '5990', '6826', '99591', '99592', '99731')
-----------------------------------------------
expected
('0389', '2753', '2760', '2761', '2768', '2819', '2869', '2875', '2910', '30391', '3051', '570', '5711', '5712', '5722', '57420', '5770', '5990', '99591')
