# VIRTUAL PATIENT - DATA MODELING

**INPUTS**:
* *data/03_virtual_patients_db.xlsx*
* *data/02_112_patients_db.xlsx*
* *data/03_virtual_patients_cie_692,76.xlsx*
* *data/04_casos_específicos.xlsx*

**OUTPUTS**:
* *data/04_virtual_patients_db_lemm.xlsx*
* *data/04_112_patients_db_lemm.xlsx*
* *web/static/models/model1.pkl … model4.pkl*

**NOTES**

### Imports

In [None]:
import pandas as pd
import sklearn as sk
import numpy as np
import nltk
import re 

nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer # instead of CountVectorizer and TfidfTransformer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.model_selection import GridSearchCV

import pickle

## SET SEED TO MAKE EXPERIMENTS CONSISTENT

In [None]:
# Seed NumPy's global random generator so repeated runs draw the same numbers.
np.random.seed(seed=500)

## GET DATA AND PRE-PROCESSING

### IF lemmatized data has been stored

In [None]:
# Load the previously lemmatized datasets; empty cells become '' instead of NaN.
virtual_patients = pd.read_excel("data/04_virtual_patients_db_lemm.xlsx").fillna('')
db_112 = pd.read_excel("data/04_112_patients_db_lemm.xlsx").fillna('')

# Additional spreadsheets, loaded with the same NaN -> '' replacement.
one_cie = pd.read_excel("data/03_virtual_patients_cie_692,76.xlsx").fillna('')
space_cases = pd.read_excel("data/04_casos_específicos.xlsx").fillna('')

#### Create merged database

In [None]:
# Expose the model input text under a common column name, F_TEXT, in both
# datasets (db_112 takes it from T_L_TEXT, virtual_patients from L_TEXT).
db_112["F_TEXT"] = db_112["T_L_TEXT"]
virtual_patients["F_TEXT"] = virtual_patients["L_TEXT"]

# Stack both datasets row-wise. DataFrame.append was removed in pandas 2.0;
# pd.concat is the supported replacement and, like append did, keeps each
# frame's original index labels.
virtual_patients_and_db_112 = pd.concat([virtual_patients, db_112])

### IF lemmatized data has NOT been stored

In [None]:
# Load the raw (not yet lemmatized) datasets; empty cells become '' instead of NaN.
virtual_patients = pd.read_excel("data/03_virtual_patients_db.xlsx").fillna('')
db_112 = pd.read_excel("data/02_112_patients_db.xlsx").fillna('')

### Text to lower case + Tokenization + Remove Stop words

(PREVIOUSLY DONE)

## FUNCTIONS

In [None]:
from functools import lru_cache


@lru_cache(maxsize=65536)
def get_pos_tag(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter
    of the resulting tag is mapped to a WordNet category (adjective, noun,
    verb or adverb). Any other tag falls back to ``wordnet.NOUN``.

    Because each word is tagged in isolation, the result depends only on the
    word itself, so results are memoized and repeated tokens are tagged once.
    """
    tag_initial = nltk.pos_tag([word])[0][1][0].upper()
    wordnet_by_initial = {
        "J": wordnet.ADJ,
        "N": wordnet.NOUN,
        "V": wordnet.VERB,
        "R": wordnet.ADV,
    }
    return wordnet_by_initial.get(tag_initial, wordnet.NOUN)

count = 0


def lemmatize(text):
    """Lemmatize every token of *text* and return the lemmas joined by spaces.

    Each call prints the current value of the module-level ``count`` and then
    increments it, so the printed numbers show how many texts have been
    processed so far.
    """
    global count
    print(count)
    count += 1

    wnl = WordNetLemmatizer()
    lemmas = []
    for token in nltk.word_tokenize(text):
        # The lemma depends on the part of speech, so look it up per token.
        lemmas.append(wnl.lemmatize(token, get_pos_tag(token)))
    return ' '.join(lemmas)

def get_results(y_test, y_pred):
    """Print the confusion matrix, accuracy and classification report.

    The three metrics are printed in that order, each computed from the true
    labels ``y_test`` and the predicted labels ``y_pred``.
    """
    for metric in (confusion_matrix, accuracy_score, classification_report):
        print(metric(y_test, y_pred))

### Lemmatize individually and merge Databases

In [None]:
# Lemmatize every TEXT entry and keep the result in a new L_TEXT column.
virtual_patients['L_TEXT'] = virtual_patients['TEXT'].map(lemmatize)

In [None]:
# Lemmatize every TEXT entry and keep the result in a new L_TEXT column.
db_112['L_TEXT'] = db_112['TEXT'].map(lemmatize)

#### Merge databases

In [None]:
# Stack both datasets row-wise into one frame. The combined frame must be
# assigned: concatenation returns a new DataFrame and leaves its inputs
# untouched (the previous code discarded the result, so the "merged" frame was
# just virtual_patients). pd.concat replaces DataFrame.append, which was
# removed in pandas 2.0.
#
# The train/test split reads an F_TEXT column, so it is added here from the
# freshly lemmatized L_TEXT. assign() works on copies, so the frames that are
# stored to disk afterwards are not modified.
# NOTE(review): the "lemmatized data has been stored" path fills db_112's
# F_TEXT from T_L_TEXT instead — confirm which column is intended here.
virtual_patients_and_db_112 = pd.concat([
    virtual_patients.assign(F_TEXT=virtual_patients["L_TEXT"]),
    db_112.assign(F_TEXT=db_112["L_TEXT"]),
])

## Store data

In [None]:
# Write both lemmatized datasets to disk, all columns included.
# to_excel() returns None, so its result must not be subscripted: the previous
# `...to_excel(...)[[...]]` on db_112 raised a TypeError right after writing the
# file. To store only a subset of columns, select them BEFORE the call, e.g.
# db_112[['CIE', 'AGE', 'SEX', 'RISK', 'L_TEXT']].to_excel(...).
virtual_patients.to_excel("./data/04_virtual_patients_db_lemm.xlsx", index=False)
db_112.to_excel("./data/04_112_patients_db_lemm.xlsx", index=False)

# 
# 
# EXPERIMENT __

### SPLIT DATA INTO TRAIN AND TEST SETS

In [None]:
# Hold out 25% of the merged rows for testing: F_TEXT is the input text and
# RISK the label. The fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    virtual_patients_and_db_112["F_TEXT"],
    virtual_patients_and_db_112["RISK"],
    random_state=76,
    test_size=0.25,
)

## MODEL AND TESTING - MULTINOMIAL (Naive Bayes)

### Create pipeline: TF-IDF Matrix, select n elements and define model

In [None]:
# Three stages: TF-IDF features, chi2 feature selection (k='all' keeps every
# feature) and a multinomial Naive Bayes classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_df=0.7, min_df=5)),
    ('selec', SelectKBest(score_func=chi2, k='all')),
    ('model', MultinomialNB()),
])

### Search for best parameters

In [None]:
# Grid to explore: unigrams vs. unigrams+bigrams for the TF-IDF step, and two
# smoothing values for the Naive Bayes step.
parameters = {
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    'model__alpha': (1e-2, 1e-3),
}
# n_jobs=-1 runs the search on all available processors.
param_pipeline = GridSearchCV(estimator=pipeline, param_grid=parameters, n_jobs=-1)

### Create and Test Model

In [None]:
# Run the grid search on the training split, then evaluate on the held-out split.
model1 = param_pipeline.fit(X_train, y_train)
y_pred = model1.predict(X_test)
get_results(y_test=y_test, y_pred=y_pred)

## MODEL AND TESTING - LINEAR SVC

In [None]:
# Same TF-IDF and chi2 selection stages, with a linear SVM as the classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_df=0.7, min_df=5)),
    ('selec', SelectKBest(score_func=chi2, k='all')),
    ('model', LinearSVC()),
])

# Train on the training split and evaluate on the held-out split.
model2 = pipeline.fit(X_train, y_train)
y_pred = model2.predict(X_test)
get_results(y_test=y_test, y_pred=y_pred)

## MODEL AND TESTING - RANDOM FOREST

In [None]:
# Same TF-IDF and chi2 selection stages, with a 100-tree random forest as the classifier.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_df=0.7, min_df=5)),
    ('selec', SelectKBest(score_func=chi2, k='all')),
    ('model', RandomForestClassifier(n_estimators=100)),
])

# Train on the training split and evaluate on the held-out split.
model3 = pipeline.fit(X_train, y_train)
y_pred = model3.predict(X_test)
get_results(y_test=y_test, y_pred=y_pred)

## MODEL AND TESTING - MULTILAYER PERCEPTRON (MLP)

In [None]:
# Same TF-IDF and chi2 selection stages, with an MLP trained by the 'lbfgs' solver.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(max_df=0.7, min_df=5)),
    ('selec', SelectKBest(score_func=chi2, k='all')),
    ('model', MLPClassifier(solver='lbfgs')),
])

# Train on the training split and evaluate on the held-out split.
model4 = pipeline.fit(X_train, y_train)
y_pred = model4.predict(X_test)
get_results(y_test=y_test, y_pred=y_pred)

## MODEL AND TESTING - ENSEMBLE

In [None]:
from sklearn.ensemble import VotingClassifier

# Combine the four models by majority ("hard") vote on the predicted labels.
ensemble = VotingClassifier(
    voting='hard',
    estimators=[
        ('nb', model1),
        ('ls', model2),
        ('rf', model3),
        ('pe', model4),
    ],
)

# Train the ensemble on the training split and evaluate on the held-out split.
ensemble = ensemble.fit(X_train, y_train)
y_pred = ensemble.predict(X_test)
get_results(y_test=y_test, y_pred=y_pred)

## 
## STORE MODELS

In [None]:
# Serialize each fitted model to its own pickle file. The `with` blocks close
# every file handle (flushing the data to disk) even if pickling fails; the
# previous bare open(...) calls never closed their handles.
with open('web/static/models/model1.pkl', 'wb') as model_file:
    pickle.dump(model1, model_file)
with open('web/static/models/model2.pkl', 'wb') as model_file:
    pickle.dump(model2, model_file)
with open('web/static/models/model3.pkl', 'wb') as model_file:
    pickle.dump(model3, model_file)
with open('web/static/models/model4.pkl', 'wb') as model_file:
    pickle.dump(model4, model_file)

## LOAD MODELS

In [None]:
# Restore the four models from their pickle files. The `with` blocks close
# each file handle once the model has been read; the previous bare open(...)
# calls never closed their handles.
# NOTE: unpickling can execute arbitrary code contained in the file, so only
# load model files that were produced by this notebook or another trusted source.
with open('web/static/models/model1.pkl', 'rb') as model_file:
    model1 = pickle.load(model_file)
with open('web/static/models/model2.pkl', 'rb') as model_file:
    model2 = pickle.load(model_file)
with open('web/static/models/model3.pkl', 'rb') as model_file:
    model3 = pickle.load(model_file)
with open('web/static/models/model4.pkl', 'rb') as model_file:
    model4 = pickle.load(model_file)