In [3]:
import pandas as pd
import numpy as np

# Data Loading

In [26]:
dataset = 'title_abstract_keywords'

In [27]:
df = pd.read_pickle("data/train_{}.pkl".format(dataset))

In [28]:
df.head()

Unnamed: 0,label,text
0,1,Comparing measurement properties of EQ-5D-Y-3L...
1,0,Feasibility of the EQ-5D in the elderly popula...
2,1,Comparing the self-reported health-related qua...
3,1,Testing measurement properties of two EQ-5D yo...
4,1,Use of Antimalarial Agents is Associated with ...


In [29]:
df.iloc[0, :]

label                                                    1
text     Comparing measurement properties of EQ-5D-Y-3L...
Name: 0, dtype: object

In [30]:
#subsets should be fixed for all tests
#_val_ids = [2, 7, 24, 32, 36, 47, 49, 59, 61, 71, 72, 86, 90, 95, 96]
#train_dataset = df[~df.index.isin(_val_ids)]
#val_dataset = df[df.index.isin(_val_ids)]

In [31]:
#np.sum(train_dataset["label"]) / len(train_dataset["label"]), np.sum(val_dataset["label"]) / len(val_dataset["label"])

In [32]:
train_dataset = df

In [33]:
df = pd.read_pickle("data/test_{}.pkl".format(dataset))

In [34]:
test_dataset = df

In [35]:
np.sum(test_dataset["label"]) / len(test_dataset["label"])

0.6

# Bag of words as a baseline

In [36]:
import nltk
import re

In [37]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize as wt 

nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
def dataset_prepare(dataset, data=None):
    if data is None:
        data = []
        for i in range(dataset.shape[0]):
            sms = dataset.iloc[i, 1]
            sms = re.sub('[^A-Za-z]', ' ', sms)
            sms = sms.lower()
            tokenized_sms = wt(sms)
            sms_processed = []
            for word in tokenized_sms:
                if word not in set(stopwords.words('english')):
                    sms_processed.append(word)
            sms_text = " ".join(sms_processed)
            data.append(sms_text)
    from sklearn.feature_extraction.text import CountVectorizer
    matrix = CountVectorizer(max_features=1000)
    X = matrix.fit_transform(data).toarray()
    y = dataset.iloc[:, 0]
    return X, y, data

In [39]:
X_train, y_train, d = dataset_prepare(train_dataset)
X_test, y_test, _ = dataset_prepare(test_dataset, d)

In [40]:
X_train

array([[0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 0, 1],
       ...,
       [0, 0, 0, ..., 0, 0, 1],
       [0, 0, 0, ..., 0, 2, 1],
       [0, 0, 2, ..., 0, 1, 1]])

In [41]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((100, 1000), (100,), (100, 1000), (100,))

In [19]:
train_dataset

Unnamed: 0,label,text
0,1,Comparing measurement properties of EQ-5D-Y-3L...
1,0,Feasibility of the EQ-5D in the elderly popula...
2,1,Comparing the self-reported health-related qua...
3,1,Testing measurement properties of two EQ-5D yo...
4,1,Use of Antimalarial Agents is Associated with ...
...,...,...
95,0,[Raloxifene in clinical practice. Results of t...
96,1,Impact of apathy on health-related quality of ...
97,0,Measuring health-related quality of life by ex...
98,0,"Efficacy of labral repair, biceps tenodesis, a..."


In [42]:
#d

In [43]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB()

In [44]:
# predict class
y_pred = classifier.predict(X_test)

# Confusion matrix
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)

accuracy = accuracy_score(y_test, y_pred)

In [45]:
cm

array([[16, 24],
       [24, 36]])

In [46]:
print(cr)

              precision    recall  f1-score   support

           0       0.40      0.40      0.40        40
           1       0.60      0.60      0.60        60

    accuracy                           0.52       100
   macro avg       0.50      0.50      0.50       100
weighted avg       0.52      0.52      0.52       100



In [47]:
accuracy

0.52

In [48]:
from sklearn.metrics import precision_score, f1_score, recall_score
weightedF1 = f1_score(y_test, y_pred, average = 'weighted')
weightedRecall = recall_score(y_test, y_pred, average='weighted')
weightedPrecision = precision_score(y_test, y_pred, average='weighted')

In [49]:
weightedPrecision, weightedRecall, weightedF1

(0.52, 0.52, 0.52)

In [50]:
y_pred = classifier.predict(X_train)
print(classification_report(y_train, y_pred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99        39
           1       1.00      0.98      0.99        61

    accuracy                           0.99       100
   macro avg       0.99      0.99      0.99       100
weighted avg       0.99      0.99      0.99       100

