In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import nltk
import numpy as np
nltk.download('punkt')
%matplotlib inline

data_paths = [f'drive/MyDrive/Data/refsa-sample-data/{x}/use_as_train_{x}.csv' for x in ["ERIS", "Isoflavones", "Bacillus"]]
validation_paths =  [f'drive/MyDrive/Data/refsa-sample-data/{x}/use_as_validation_{x}.csv' for x in ["ERIS", "Isoflavones", "Bacillus"]]

df = pd.read_csv(data_paths[0], index_col=0)
df_test = pd.read_csv(validation_paths[0], index_col=0)

print(df.shape)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
(527, 5)


Unnamed: 0,X...Author,Title,Abstract,Journal,Indicator
377,"O. Wessel, C. M. Olsen, E. Rimstad and M. K. D...",Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,Vet Res,-1
172,"M. C. Hikke, C. Geertsema, V. Wu, S. W. Metz, ...",Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,Biotechnol J,-1
248,"J. Nacher-Mestre, R. Serrano, E. Beltran, J. P...",Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,Chemosphere,1
525,"B. D. Johnson, S. L. Gilbert, B. Khan, D. L. C...","Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,Mar Environ Res,1
614,"L. Zi-qi, M. Qian, S. Feng-qing, W. Ying-wen, ...",Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...",Food & Machinery,-1


In [8]:
#check for null values
df.isnull().sum()

X...Author    0
Title         0
Abstract      0
Journal       1
Indicator     0
dtype: int64

In [9]:
X = df[["Title", "Abstract"]].copy()
y = df["Indicator"]

X_test = df_test[["Title", "Abstract"]]
y_test = df_test["Indicator"]

Cleaning text data
====

In [10]:
for abs in df["Abstract"][:4]:
    print(abs + "\n")

Piscine orthoreovirus (PRV) is a reovirus that has predominantly been detected in Atlantic salmon (Salmo salar L.). PRV is associated with heart and skeletal muscle inflammation (HSMI) in farmed Atlantic salmon, and recently erythrocytes were identified as major target cells. The study of PRV replication and pathogenesis of the infection has been impeded by the inability to propagate PRV in vitro. In this study we developed an ex vivo cultivation system for PRV in Atlantic salmon erythrocytes. PRV was successfully passaged to naive erythrocytes using lysates of blood cells from infected salmon. During cultivation a significant increase in viral load was observed by RT-qPCR and flow cytometry, which coincided with the formation of cytoplasmic inclusions. The inclusions resembled viral factories and contained both PRV protein and dsRNA. In addition, the erythrocytes generated an antiviral immune gene activation after PRV infection, with significant up-regulation of IFN-alpha, RIG-I, Mx a

Things to consider when cleaning the data:
- abreviations
- numeric values and citation numbers
- hyphenated words like *mosquito-borne*

There is probably no need to remove contractions due to scientific nature of the data. 


In [11]:
#tokenize and lowercase
from nltk.tokenize import word_tokenize

def tokenize_and_lower(s):
    return [x.lower() for x in word_tokenize(s)]

X["tokenized_title"] = X["Title"].apply(tokenize_and_lower)
X["tokenized_abstract"] = X["Abstract"].apply(tokenize_and_lower)
X.head()

Unnamed: 0,Title,Abstract,tokenized_title,tokenized_abstract
377,Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,"[piscine, orthoreovirus, (, prv, ), replicates...","[piscine, orthoreovirus, (, prv, ), is, a, reo..."
172,Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,"[alphavirus, capsid, proteins, self-assemble, ...","[the, mosquito-borne, chikungunya, virus, (, c..."
248,Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,"[occurrence, and, potential, transfer, of, myc...","[plant, ingredients, and, processed, animal, p..."
525,"Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,"[cellular, responses, of, eastern, oysters, ,,...","[because, of, the, continued, development, and..."
614,Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...","[research, progress, of, chitosan, and, its, d...","[recently, ,, the, problem, of, heavy, metal, ..."


In [12]:
#remove punctuation and stopwords
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

chars_for_removal = set(list(string.punctuation) + stopwords.words("english"))
print(chars_for_removal)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
{']', '`', 'aren', 'yourselves', 'it', 'about', 'didn', 'll', "that'll", 'how', '(', 'ma', 'by', 'both', 'yours', 'you', ')', 'doing', 'then', 'weren', 'isn', 'have', 'only', 're', 'herself', "didn't", ':', 'some', 'in', '%', 'm', 'whom', 'more', 'if', '^', 'while', 'its', '*', 'your', 'the', "wouldn't", 'hers', 'down', 'or', "shan't", '\\', 'being', 'few', '<', 'ourselves', "she's", 'don', 'haven', 'them', 'there', 'we', 'for', 'over', '#', '@', "you've", 'himself', 'between', 'i', 'such', 'own', 'o', '{', 'had', 'a', 'ain', '|', 'now', 't', 'nor', 'am', "you're", 'this', "hadn't", 'than', 'which', 'shan', 'out', 'so', 'off', "doesn't", 'do', 'each', 'wasn', 'myself', 'her', "it's", 'did', 'above', "don't", 'y', 'hadn', 'mustn', "hasn't", '_', 'couldn', 'below', 'he', "you'd", 'been', 'again', 'their', 'needn', 'she', 'was', 'here', 'other', 'theirs', 'until', 'will', 'on', '

In [13]:
def filter_words(string_list):
    return [word for word in string_list if word not in chars_for_removal]

X["filtered_title"] = X["tokenized_title"].apply(filter_words)
X["filtered_abstract"] = X["tokenized_abstract"].apply(filter_words)
X.head()

Unnamed: 0,Title,Abstract,tokenized_title,tokenized_abstract,filtered_title,filtered_abstract
377,Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,"[piscine, orthoreovirus, (, prv, ), replicates...","[piscine, orthoreovirus, (, prv, ), is, a, reo...","[piscine, orthoreovirus, prv, replicates, atla...","[piscine, orthoreovirus, prv, reovirus, predom..."
172,Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,"[alphavirus, capsid, proteins, self-assemble, ...","[the, mosquito-borne, chikungunya, virus, (, c...","[alphavirus, capsid, proteins, self-assemble, ...","[mosquito-borne, chikungunya, virus, chikv, ca..."
248,Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,"[occurrence, and, potential, transfer, of, myc...","[plant, ingredients, and, processed, animal, p...","[occurrence, potential, transfer, mycotoxins, ...","[plant, ingredients, processed, animal, protei..."
525,"Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,"[cellular, responses, of, eastern, oysters, ,,...","[because, of, the, continued, development, and...","[cellular, responses, eastern, oysters, crasso...","[continued, development, production, variety, ..."
614,Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...","[research, progress, of, chitosan, and, its, d...","[recently, ,, the, problem, of, heavy, metal, ...","[research, progress, chitosan, derivatives, re...","[recently, problem, heavy, metal, pollution, a..."


Im not sure whether stemming the data would be a good choice - will domain-specific words get well lematized?



In [14]:
#adding pos tags for lemmatization with wordnet
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
X["pos_title"] = X["filtered_title"].apply(nltk.tag.pos_tag)
X["pos_abstract"] = X["filtered_abstract"].apply(nltk.tag.pos_tag)

def map_to_wordnet_pos(tuple_list):
    return [(word, get_wnet_tag(tag)) for (word, tag) in tuple_list]
 
def get_wnet_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
X["wntag_title"] = X["pos_title"].apply(map_to_wordnet_pos)
X["wntag_abstract"] = X["pos_abstract"].apply(map_to_wordnet_pos)    

X.head()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Unnamed: 0,Title,Abstract,tokenized_title,tokenized_abstract,filtered_title,filtered_abstract,pos_title,pos_abstract,wntag_title,wntag_abstract
377,Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,"[piscine, orthoreovirus, (, prv, ), replicates...","[piscine, orthoreovirus, (, prv, ), is, a, reo...","[piscine, orthoreovirus, prv, replicates, atla...","[piscine, orthoreovirus, prv, reovirus, predom...","[(piscine, NN), (orthoreovirus, NN), (prv, NN)...","[(piscine, NN), (orthoreovirus, NN), (prv, NN)...","[(piscine, n), (orthoreovirus, n), (prv, n), (...","[(piscine, n), (orthoreovirus, n), (prv, n), (..."
172,Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,"[alphavirus, capsid, proteins, self-assemble, ...","[the, mosquito-borne, chikungunya, virus, (, c...","[alphavirus, capsid, proteins, self-assemble, ...","[mosquito-borne, chikungunya, virus, chikv, ca...","[(alphavirus, NN), (capsid, NN), (proteins, VB...","[(mosquito-borne, JJ), (chikungunya, NN), (vir...","[(alphavirus, n), (capsid, n), (proteins, v), ...","[(mosquito-borne, a), (chikungunya, n), (virus..."
248,Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,"[occurrence, and, potential, transfer, of, myc...","[plant, ingredients, and, processed, animal, p...","[occurrence, potential, transfer, mycotoxins, ...","[plant, ingredients, processed, animal, protei...","[(occurrence, NN), (potential, NN), (transfer,...","[(plant, NN), (ingredients, NNS), (processed, ...","[(occurrence, n), (potential, n), (transfer, n...","[(plant, n), (ingredients, n), (processed, v),..."
525,"Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,"[cellular, responses, of, eastern, oysters, ,,...","[because, of, the, continued, development, and...","[cellular, responses, eastern, oysters, crasso...","[continued, development, production, variety, ...","[(cellular, JJ), (responses, NNS), (eastern, J...","[(continued, JJ), (development, NN), (producti...","[(cellular, a), (responses, n), (eastern, a), ...","[(continued, a), (development, n), (production..."
614,Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...","[research, progress, of, chitosan, and, its, d...","[recently, ,, the, problem, of, heavy, metal, ...","[research, progress, chitosan, derivatives, re...","[recently, problem, heavy, metal, pollution, a...","[(research, NN), (progress, NN), (chitosan, JJ...","[(recently, RB), (problem, NN), (heavy, JJ), (...","[(research, n), (progress, n), (chitosan, a), ...","[(recently, r), (problem, n), (heavy, a), (met..."


In [15]:
#lemmatize with wordnet
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

def lemmatize(tuple_list):
    return [lemmatizer.lemmatize(word, tag) for (word, tag) in tuple_list]

X["lem_title"] = X["wntag_title"].apply(lemmatize)
X["lem_abstract"] = X["wntag_abstract"].apply(lemmatize)

cleaned_text = pd.DataFrame(data=X[["lem_title", "lem_abstract"]], index = X.index)
cleaned_text = cleaned_text.rename(columns={"lem_title":"title", "lem_abstract": "abstract"})

Exploratory Data Analysis
===

Classification
===


In [16]:
#baseline classifier - always return majority class
class BaselineClassifier:
    def __init__(self):
        self.majority_class = np.NaN
        
    def fit(self, x, y):
        self.majority_class = y.value_counts()[:1].index.to_list()[0]
        return
    def predict(self, x):
        return pd.Series(np.ones((x.shape[0],)) * self.majority_class, index = x.index)
    
    def score(self, x, y):
        p = self.predict(x)
        comp = y==p
        return len(comp[comp==True])/len(comp)
        
        
clf = BaselineClassifier()
clf.fit(X,y)
pred = clf.predict(X_test)
score = clf.score(X_test, y_test)
print(f'Baseline classifier\'s accuracy (always returns majority class): {score}')

Baseline classifier's accuracy (always returns majority class): 0.8484848484848485


**Baseline classifier** achieves accuracy that is pretty impresive at a glance - 84,84% of examples are corretly classifies which corresponds with the expectations. 

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

#preprocess train data
vectorizerAb = CountVectorizer()
vectorizerTit = CountVectorizer()

preprocessedAbstract = vectorizerAb.fit_transform(cleaned_text["abstract"].values)
abstracts = pd.DataFrame(data=preprocessedAbstract.toarray(), index=cleaned_text.index)

preprocessedTitle = vectorizerTit.fit_transform(cleaned_text["Title"].values)
titles = pd.DataFrame(data=preprocessedTitle.toarray(), index=cleaned_text.index)

x_train = pd.concat([abstracts, titles], axis=1)
print(x_train.shape, y.shape, titles.shape, abstracts.shape)

AttributeError: ignored

In [None]:
#preprocess test data
abstracts_test = vectorizerAb.transform(X_test["Abstract"].values)
titles_test = vectorizerTit.transform(X_test["Title"].values)

prep_ab = pd.DataFrame(data=abstracts_test.toarray(), index = X_test.index)
prep_tit = pd.DataFrame(data=titles_test.toarray(), index=X_test.index)

x_test = pd.concat([prep_ab, prep_tit],axis=1)
print(x_test.shape, prep_tit.shape, prep_ab.shape)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
scores = {}

clf = LogisticRegressionCV()
clf.fit(x_train, y)
score = clf.score(x_test, y_test)
print(f'Accuracy for Logistic Regression Classifier with CountVectorization - {score}')

The result might seem impressive at a glance, but when we look at the accuracy of baseline classifier we can see that logistic regression classifier did not bring anything new to the table, its only as good as not looking at the data at all and then guessing majority class. 

In [None]:
from sklearn.svm import LinearSVC
clf = LinearSVC()
clf.fit(x_train, y)
score = clf.score(x_test, y_test)
print(f'Accuracy for SupportVectorClassifier with CountVectorization - {score}')

In [None]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

clf = DecisionTreeClassifier()
clf.fit(x_train, y)
score = clf.score(x_test, y_test)
print(f'Accuracy for DecisionTreeClassifier with CountVectorization - {score}')



clf = RandomForestClassifier()
clf.fit(x_train, y)
score = clf.score(x_test, y_test)
print(f'Accuracy for RandomForestClassifier with CountVectorization - {score}')



clf = AdaBoostClassifier()
clf.fit(x_train, y)
score = clf.score(x_test, y_test)
print(f'Accuracy for Tree-based AdaBoost with CountVectorization - {score}')