# Steps Taken

- manual data cleaning (human observation, removing data not properly seperated)
- load to notebook
- further cleaning (removing unnecessary punctuation)

In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import os
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
import spacy
import pickle
from transformers import GPT2Tokenizer, AutoTokenizer



  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [2]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package punkt to /Users/isabel/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/isabel/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /Users/isabel/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /Users/isabel/nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [3]:
def word_cleaner(dataframe, nlp):
    # clean text to standardize (leave puncuation in as there is meaning between sentences)
    lemmatizer = WordNetLemmatizer()
    for index, row in dataframe.iterrows():
        doc = nlp(row['report'])
        list_of_words = []
        for token in doc:
            if token.pos_ == 'PROPN':
                pass
            else:
                list_of_words.append(token.text)
        r = ' '.join(list_of_words)
        r = re.sub('^a-zA-Z', ' ', r)
        r = r.replace("'s", '')
        r = r.lower()
        r = r.split()
        r = [word for word in r if word not in stopwords.words('english')]
        r = [lemmatizer.lemmatize(word) for word in r]
        r = ' '.join(r)
        r = re.sub(r'[^\w\s/]', '', r)
        r = re.sub(r'\d+', '', r)
        r = r.strip()
        dataframe.loc[index] = {'report': r, 'needs': row['needs']}
    return dataframe

In [4]:
nlp = spacy.load("en_core_web_sm")
list_of_dfs = []
for file in os.listdir('./manually_processed_data_gpt4/'):
    if '.csv' in file:
        list_of_dfs.append(word_cleaner(pd.read_csv(f'./manually_processed_data_gpt4/{file}'), nlp))

dfs = pd.concat(list_of_dfs, ignore_index=True)

In [5]:
X= np.array(list(dfs['report']))
y=np.array(list(dfs['needs']))

In [6]:
def quality_scores(true, prediction):
    # calculate true and false positives and negatives
    true_pos = sum((yt == 'met') and (yp == 'met') for yt, yp in zip(true, prediction))
    true_neg = sum((yt == 'unmet') and (yp == 'unmet') for yt, yp in zip(true, prediction))
    false_pos = sum((yt == 'unmet') and (yp == 'met') for yt, yp in zip(true, prediction))
    false_neg = sum((yt == 'met') and (yp == 'unmet') for yt, yp in zip(true, prediction))

    # precision
    prec = true_pos / (true_pos + false_pos)
    print(f'Precision: {prec}')
    # recall
    recall = true_pos / (true_pos + false_neg)
    print(f'Recall: {recall}')
    # accuracy
    accuracy = (true_pos + true_neg) / (true_pos + true_neg + false_pos + false_neg)
    print(f'Accuracy: {accuracy}')
    # F1 score
    f1 = 2 * (prec * recall) / (prec + recall)
    print(f'F1 Score: {f1}')

    return prec, recall, accuracy, f1

## Test Models

- bag of words representation + logistic regression
- gpt2 tokenization + logistic regression
- bert tokenization + logistic regression

In [7]:
model_a = pickle.load(open('./logisticRegressionModels/bow_logistic_regression.pkl', 'rb'))
model_b = pickle.load(open('./logisticRegressionModels/gpt2_logistic_regression.pkl', 'rb'))
model_c = pickle.load(open('./logisticRegressionModels/bio_clinical_bert_logistic_regression.pkl', 'rb'))
model_d = pickle.load(open('./logisticRegressionModels/tfidf_logistic_regression.pkl', 'rb'))

In [8]:
# bag of words tokenization
cv = CountVectorizer()
X_cv_temp = cv.fit_transform(X)
X_cv_temp = X_cv_temp.toarray()
X_cv = []
for temp in X_cv_temp:
    X_cv.append(temp[:2473])

In [9]:
X_cv[0]

array([0, 0, 0, ..., 0, 0, 0])

In [10]:
def gpt_tokenization(input_data, max_length):
    # gpt-2 tokens
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    list_of_gpt2_tokens = []
    for row in input_data:
        token_ids = tokenizer.encode(row, add_special_tokens=True)
        list_of_gpt2_tokens.append(token_ids)
    padded = [tokens + [0] * (max_length - len(tokens))for tokens in list_of_gpt2_tokens]
    return padded

X_gpt = gpt_tokenization(X, model_b.n_features_in_)

In [11]:
def bio_clinical_bert_tokenization(input_data, max_length):
    # bio clinical bert tokens
    tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
    list_of_bert_tokens = []
    for row in input_data:
        token_ids = tokenizer.encode(row, add_special_tokens=True)
        list_of_bert_tokens.append(token_ids)
    padded = [tokens + [0] * (max_length - len(tokens))for tokens in list_of_bert_tokens]
    return padded

X_bert = bio_clinical_bert_tokenization(X, model_c.n_features_in_)
for temp in X_bert:
    if len(temp) != model_c.n_features_in_:
        X_bert.remove(temp)

In [12]:
# tf-idf tokenization
tfidf = TfidfVectorizer()
X_tfidf_temp = tfidf.fit_transform(X)
X_tfidf_temp = X_tfidf_temp.toarray()
X_tfidf = []
for temp in X_tfidf_temp:
    X_tfidf.append(temp[:2473])

In [13]:
X_tfidf

[array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 0., 0., 0.]),
 array([0., 0., 0., ..., 

In [14]:
predictions_cv = model_a.predict(X_cv)
predictions_cv

array(['met', 'unmet', 'met', ..., 'unmet', 'met', 'met'], dtype='<U5')

In [15]:
predictions_gpt = model_b.predict(X_gpt)
predictions_gpt

array(['met', 'unmet', 'met', ..., 'met', 'unmet', 'met'], dtype='<U5')

In [16]:
predictions_bert = model_c.predict(X_bert)
predictions_bert

array(['met', 'met', 'met', ..., 'met', 'unmet', 'met'], dtype='<U5')

In [17]:
predictions_tfidf = model_d.predict(X_tfidf)
predictions_tfidf # ISABEL PROBLEM

array(['met', 'unmet', 'met', ..., 'met', 'met', 'met'], dtype='<U5')

In [18]:
print("BOW")
gpt_quality = quality_scores(y, predictions_cv)
print("\n\nGPT")
gpt_quality = quality_scores(y, predictions_gpt)
print("\n\nBERT")
bert_quality = quality_scores(y, predictions_bert)
print("\n\nTF-IDF")
tfidf_quality = quality_scores(y, predictions_tfidf)

BOW
Precision: 0.5044656606097937
Recall: 0.5287282117495158
Accuracy: 0.509195586118663
F1 Score: 0.5163120567375886


GPT
Precision: 0.5278573702642471
Recall: 0.5351839896707553
Accuracy: 0.5325443786982249
F1 Score: 0.5314954319602501


BERT
Precision: 0.5066492829204694
Recall: 0.6271788250484184
Accuracy: 0.5126359564939219
F1 Score: 0.5605077167171498


TF-IDF
Precision: 0.501117457164142
Recall: 0.6513879922530665
Accuracy: 0.5059971213817368
F1 Score: 0.5664561403508772
