In [42]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [43]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
import nltk
import numpy as np
nltk.download('punkt')
%matplotlib inline

data_paths = [f'drive/MyDrive/Data/refsa-sample-data/{x}/use_as_train_{x}.csv' for x in ["ERIS", "Isoflavones", "Bacillus"]]
validation_paths =  [f'drive/MyDrive/Data/refsa-sample-data/{x}/use_as_validation_{x}.csv' for x in ["ERIS", "Isoflavones", "Bacillus"]]

df = pd.read_csv(data_paths[0], index_col=0)
df_test = pd.read_csv(validation_paths[0], index_col=0)

print(df.shape)
df.head()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
(527, 5)


Unnamed: 0,X...Author,Title,Abstract,Journal,Indicator
377,"O. Wessel, C. M. Olsen, E. Rimstad and M. K. D...",Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,Vet Res,-1
172,"M. C. Hikke, C. Geertsema, V. Wu, S. W. Metz, ...",Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,Biotechnol J,-1
248,"J. Nacher-Mestre, R. Serrano, E. Beltran, J. P...",Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,Chemosphere,1
525,"B. D. Johnson, S. L. Gilbert, B. Khan, D. L. C...","Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,Mar Environ Res,1
614,"L. Zi-qi, M. Qian, S. Feng-qing, W. Ying-wen, ...",Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...",Food & Machinery,-1


In [44]:
#check for null values
df.isnull().sum()

X...Author    0
Title         0
Abstract      0
Journal       1
Indicator     0
dtype: int64

In [45]:
X = df[["Title", "Abstract"]].copy()
y = df["Indicator"]

X_test = df_test[["Title", "Abstract"]]
y_test = df_test["Indicator"]

Cleaning text data
====

In [46]:
for abs in df["Abstract"][:4]:
    print(abs + "\n")

Piscine orthoreovirus (PRV) is a reovirus that has predominantly been detected in Atlantic salmon (Salmo salar L.). PRV is associated with heart and skeletal muscle inflammation (HSMI) in farmed Atlantic salmon, and recently erythrocytes were identified as major target cells. The study of PRV replication and pathogenesis of the infection has been impeded by the inability to propagate PRV in vitro. In this study we developed an ex vivo cultivation system for PRV in Atlantic salmon erythrocytes. PRV was successfully passaged to naive erythrocytes using lysates of blood cells from infected salmon. During cultivation a significant increase in viral load was observed by RT-qPCR and flow cytometry, which coincided with the formation of cytoplasmic inclusions. The inclusions resembled viral factories and contained both PRV protein and dsRNA. In addition, the erythrocytes generated an antiviral immune gene activation after PRV infection, with significant up-regulation of IFN-alpha, RIG-I, Mx a

Things to consider when cleaning the data:
- abreviations
- numeric values and citation numbers
- hyphenated words like *mosquito-borne*

There is probably no need to remove contractions due to scientific nature of the data. 


In [47]:
#tokenize and lowercase
from nltk.tokenize import word_tokenize

def tokenize_and_lower(s):
    return [x.lower() for x in word_tokenize(s)]

X["tokenized_title"] = X["Title"].apply(tokenize_and_lower)
X["tokenized_abstract"] = X["Abstract"].apply(tokenize_and_lower)

X_test["tokenized_title"] = X_test["Title"].apply(tokenize_and_lower)
X_test["tokenized_abstract"] = X_test["Abstract"].apply(tokenize_and_lower)

X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,Title,Abstract,tokenized_title,tokenized_abstract
377,Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,"[piscine, orthoreovirus, (, prv, ), replicates...","[piscine, orthoreovirus, (, prv, ), is, a, reo..."
172,Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,"[alphavirus, capsid, proteins, self-assemble, ...","[the, mosquito-borne, chikungunya, virus, (, c..."
248,Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,"[occurrence, and, potential, transfer, of, myc...","[plant, ingredients, and, processed, animal, p..."
525,"Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,"[cellular, responses, of, eastern, oysters, ,,...","[because, of, the, continued, development, and..."
614,Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...","[research, progress, of, chitosan, and, its, d...","[recently, ,, the, problem, of, heavy, metal, ..."


In [48]:
#remove punctuation and stopwords
import string
from nltk.corpus import stopwords
nltk.download('stopwords')

chars_for_removal = set(list(string.punctuation) + stopwords.words("english"))
print(chars_for_removal)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
{'until', 'my', "haven't", 'haven', 'you', '$', 'that', 'ours', '=', '!', 'mustn', 'does', 'all', 'not', 'are', '@', ';', 'them', 'had', 'so', "shan't", 'doesn', 'an', 'such', 'why', '(', ':', '[', 'very', 'at', "that'll", "doesn't", 'its', 'some', "don't", "you'd", 'mightn', 'off', "you'll", 'while', '*', "mightn't", 'should', '&', 'during', '`', 'yours', 'same', 'yourself', 'in', 'under', '<', 'do', 'again', 'ma', 'out', 'him', '/', 'or', 'herself', 'but', 'there', 'she', 'once', 'been', '|', 'each', 'isn', ',', '#', ']', "'", 'did', 'don', 'i', "she's", '"', 'won', 'any', 'didn', 'is', '.', 'if', "couldn't", 'for', "wouldn't", 'how', 'which', 'now', 'into', 'too', 'up', 'aren', 'be', 'd', 'being', "mustn't", "should've", "won't", 'their', 'those', 'a', 'your', '}', 'over', 'against', 'own', 'the', "hadn't", 'because', 'where', 'hasn', '%', 'itself', 'no', 'ain', 'th

In [49]:
def filter_words(string_list):
    return [word for word in string_list if word not in chars_for_removal]

X["filtered_title"] = X["tokenized_title"].apply(filter_words)
X["filtered_abstract"] = X["tokenized_abstract"].apply(filter_words)

X_test["filtered_title"] = X_test["tokenized_title"].apply(filter_words)
X_test["filtered_abstract"] = X_test["tokenized_abstract"].apply(filter_words)
X.head()

Unnamed: 0,Title,Abstract,tokenized_title,tokenized_abstract,filtered_title,filtered_abstract
377,Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,"[piscine, orthoreovirus, (, prv, ), replicates...","[piscine, orthoreovirus, (, prv, ), is, a, reo...","[piscine, orthoreovirus, prv, replicates, atla...","[piscine, orthoreovirus, prv, reovirus, predom..."
172,Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,"[alphavirus, capsid, proteins, self-assemble, ...","[the, mosquito-borne, chikungunya, virus, (, c...","[alphavirus, capsid, proteins, self-assemble, ...","[mosquito-borne, chikungunya, virus, chikv, ca..."
248,Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,"[occurrence, and, potential, transfer, of, myc...","[plant, ingredients, and, processed, animal, p...","[occurrence, potential, transfer, mycotoxins, ...","[plant, ingredients, processed, animal, protei..."
525,"Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,"[cellular, responses, of, eastern, oysters, ,,...","[because, of, the, continued, development, and...","[cellular, responses, eastern, oysters, crasso...","[continued, development, production, variety, ..."
614,Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...","[research, progress, of, chitosan, and, its, d...","[recently, ,, the, problem, of, heavy, metal, ...","[research, progress, chitosan, derivatives, re...","[recently, problem, heavy, metal, pollution, a..."


Im not sure whether stemming the data would be a good choice - will domain-specific words get well lematized?



In [50]:
#adding pos tags for lemmatization with wordnet
from nltk.corpus import wordnet
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
X["pos_title"] = X["filtered_title"].apply(nltk.tag.pos_tag)
X["pos_abstract"] = X["filtered_abstract"].apply(nltk.tag.pos_tag)

X_test["pos_title"] = X_test["filtered_title"].apply(nltk.tag.pos_tag)
X_test["pos_abstract"] = X_test["filtered_abstract"].apply(nltk.tag.pos_tag)

def map_to_wordnet_pos(tuple_list):
    return [(word, get_wnet_tag(tag)) for (word, tag) in tuple_list]
 
def get_wnet_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
X_test["wntag_title"] = X_test["pos_title"].apply(map_to_wordnet_pos)
X_test["wntag_abstract"] = X_test["pos_abstract"].apply(map_to_wordnet_pos)  
X["wntag_title"] = X["pos_title"].apply(map_to_wordnet_pos)
X["wntag_abstract"] = X["pos_abstract"].apply(map_to_wordnet_pos)   

X.head()


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Unnamed: 0,Title,Abstract,tokenized_title,tokenized_abstract,filtered_title,filtered_abstract,pos_title,pos_abstract,wntag_title,wntag_abstract
377,Piscine orthoreovirus (PRV) replicates in Atla...,Piscine orthoreovirus (PRV) is a reovirus that...,"[piscine, orthoreovirus, (, prv, ), replicates...","[piscine, orthoreovirus, (, prv, ), is, a, reo...","[piscine, orthoreovirus, prv, replicates, atla...","[piscine, orthoreovirus, prv, reovirus, predom...","[(piscine, NN), (orthoreovirus, NN), (prv, NN)...","[(piscine, NN), (orthoreovirus, NN), (prv, NN)...","[(piscine, n), (orthoreovirus, n), (prv, n), (...","[(piscine, n), (orthoreovirus, n), (prv, n), (..."
172,Alphavirus capsid proteins self-assemble into ...,The mosquito-borne chikungunya virus (CHIKV) c...,"[alphavirus, capsid, proteins, self-assemble, ...","[the, mosquito-borne, chikungunya, virus, (, c...","[alphavirus, capsid, proteins, self-assemble, ...","[mosquito-borne, chikungunya, virus, chikv, ca...","[(alphavirus, NN), (capsid, NN), (proteins, VB...","[(mosquito-borne, JJ), (chikungunya, NN), (vir...","[(alphavirus, n), (capsid, n), (proteins, v), ...","[(mosquito-borne, a), (chikungunya, n), (virus..."
248,Occurrence and potential transfer of mycotoxin...,Plant ingredients and processed animal protein...,"[occurrence, and, potential, transfer, of, myc...","[plant, ingredients, and, processed, animal, p...","[occurrence, potential, transfer, mycotoxins, ...","[plant, ingredients, processed, animal, protei...","[(occurrence, NN), (potential, NN), (transfer,...","[(plant, NN), (ingredients, NNS), (processed, ...","[(occurrence, n), (potential, n), (transfer, n...","[(plant, n), (ingredients, n), (processed, v),..."
525,"Cellular responses of eastern oysters, Crassos...",Because of the continued development and produ...,"[cellular, responses, of, eastern, oysters, ,,...","[because, of, the, continued, development, and...","[cellular, responses, eastern, oysters, crasso...","[continued, development, production, variety, ...","[(cellular, JJ), (responses, NNS), (eastern, J...","[(continued, JJ), (development, NN), (producti...","[(cellular, a), (responses, n), (eastern, a), ...","[(continued, a), (development, n), (production..."
614,Research progress of chitosan and its derivati...,"Recently, the problem of heavy metal pollution...","[research, progress, of, chitosan, and, its, d...","[recently, ,, the, problem, of, heavy, metal, ...","[research, progress, chitosan, derivatives, re...","[recently, problem, heavy, metal, pollution, a...","[(research, NN), (progress, NN), (chitosan, JJ...","[(recently, RB), (problem, NN), (heavy, JJ), (...","[(research, n), (progress, n), (chitosan, a), ...","[(recently, r), (problem, n), (heavy, a), (met..."


In [51]:
#lemmatize with wordnet
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

def lemmatize(tuple_list):
    return [lemmatizer.lemmatize(word, tag) for (word, tag) in tuple_list]

X["lem_title"] = X["wntag_title"].apply(lemmatize)
X["lem_abstract"] = X["wntag_abstract"].apply(lemmatize)

X_test["lem_title"] = X_test["wntag_title"].apply(lemmatize)
X_test["lem_abstract"] = X_test["wntag_abstract"].apply(lemmatize)

cleaned_text = pd.DataFrame(data=X[["lem_title", "lem_abstract"]], index = X.index)
cleaned_text = cleaned_text.rename(columns={"lem_title":"title", "lem_abstract": "abstract"})

cleaned_text_test = pd.DataFrame(data=X_test[["lem_title", "lem_abstract"]], index = X_test.index)
cleaned_text_test = cleaned_text_test.rename(columns={"lem_title":"title", "lem_abstract": "abstract"})

cleaned_text.head()

Unnamed: 0,title,abstract
377,"[piscine, orthoreovirus, prv, replicate, atlan...","[piscine, orthoreovirus, prv, reovirus, predom..."
172,"[alphavirus, capsid, proteins, self-assemble, ...","[mosquito-borne, chikungunya, virus, chikv, ca..."
248,"[occurrence, potential, transfer, mycotoxins, ...","[plant, ingredient, process, animal, protein, ..."
525,"[cellular, response, eastern, oyster, crassost...","[continued, development, production, variety, ..."
614,"[research, progress, chitosan, derivative, rem...","[recently, problem, heavy, metal, pollution, a..."


In [52]:
# connect lists back into strings
def list_to_string (str_list):
  return " ".join(str_list)
cleaned_text["abs_str"] = cleaned_text["abstract"].apply(list_to_string)
cleaned_text["tit_str"] = cleaned_text["title"].apply(list_to_string)

cleaned_text_test["abs_str"] = cleaned_text_test["abstract"].apply(list_to_string)
cleaned_text_test["tit_str"] = cleaned_text_test["title"].apply(list_to_string)

cleaned_text = cleaned_text.drop(labels = ["abstract", "title"], axis=1)
cleaned_text_test = cleaned_text_test.drop(labels = ["abstract", "title"], axis=1)

print(cleaned_text.head())


Unnamed: 0,abs_str,tit_str
377,piscine orthoreovirus prv reovirus predominant...,piscine orthoreovirus prv replicate atlantic s...
172,mosquito-borne chikungunya virus chikv cause a...,alphavirus capsid proteins self-assemble core-...
248,plant ingredient process animal protein pap su...,occurrence potential transfer mycotoxins gilth...
525,continued development production variety nanom...,cellular response eastern oyster crassostrea v...
614,recently problem heavy metal pollution attract...,research progress chitosan derivative removal ...


In [71]:
pd.concat([cleaned_text, y], axis=1).to_csv("drive/MyDrive/Data/refsa-sample-data/cleaned/cleaned_train_ERIS.csv")
pd.concat([cleaned_text_test, y_test], axis=1).to_csv("drive/MyDrive/Data/refsa-sample-data/cleaned/cleaned_test_ERIS.csv")

Exploratory Data Analysis
===

Classification
===


In [53]:
#baseline classifier - always return majority class
class BaselineClassifier:
    def __init__(self):
        self.majority_class = np.NaN
        
    def fit(self, x, y):
        self.majority_class = y.value_counts()[:1].index.to_list()[0]
        return
    def predict(self, x):
        return pd.Series(np.ones((x.shape[0],)) * self.majority_class, index = x.index)
    
    def score(self, x, y):
        p = self.predict(x)
        comp = y==p
        return len(comp[comp==True])/len(comp)
        
        
baseline = BaselineClassifier()
baseline.fit(X,y)
pred = baseline.predict(X_test)
score = baseline.score(X_test, y_test)
print(f'Baseline classifier\'s accuracy (always returns majority class): {score}')

Baseline classifier's accuracy (always returns majority class): 0.8484848484848485


**Baseline classifier** achieves expected accuracy -  84,84% which corresponds to how many of training examples are of majority class. 

In [63]:
from sklearn.feature_extraction.text import CountVectorizer

#preprocess train data
vectorizerAb = CountVectorizer()
vectorizerTit = CountVectorizer()

preprocessedAbstract = vectorizerAb.fit_transform(cleaned_text["abs_str"].values)
abstracts = pd.DataFrame(data=preprocessedAbstract.toarray(), index=cleaned_text.index)

preprocessedTitle = vectorizerTit.fit_transform(cleaned_text["tit_str"].values)
titles = pd.DataFrame(data=preprocessedTitle.toarray(), index=cleaned_text.index)

x_train = pd.concat([abstracts, titles], axis=1)
print(x_train.shape, y.shape, titles.shape, abstracts.shape)

print(X_test.head())
print(x_train.head())

(527, 11763) (527,) (527, 2308) (527, 9455)
                                                 Title  ...                                       lem_abstract
374  First in-depth analysis of the novel Th2-type ...  ...  [il-4, il-13, closely, related, canonical, typ...
304  Dai Nippon Printing Co., Ltd Medi.Ca CC for En...  ...  [ready-made, dry, medium, method, coliform, co...
247  Investigation of pharmaceuticals in processed ...  ...  [on-going, trend, develop, sustainable, salmon...
125  Molecular characterisation and functional anal...  ...  [salmon, louse, lepeophtheirus, salmonis, spp,...
100  The use of food waste-based diets and Napier g...  ...  [present, study, use, commercial, feed, food, ...

[5 rows x 12 columns]
     0     1     2     3     4     5     ...  2302  2303  2304  2305  2306  2307
377     0     0     0     0     0     0  ...     0     0     0     0     0     0
172     0     0     0     0     0     0  ...     0     0     0     0     0     0
248     0     0     0   

In [64]:
#preprocess test data
abstracts_test = vectorizerAb.transform(cleaned_text_test["abs_str"].values)
titles_test = vectorizerTit.transform(cleaned_text_test["tit_str"].values)

prep_ab = pd.DataFrame(data=abstracts_test.toarray(), index = cleaned_text_test.index)
prep_tit = pd.DataFrame(data=titles_test.toarray(), index=cleaned_text_test.index)

x_test = pd.concat([prep_ab, prep_tit],axis=1)
print(x_test.shape, prep_tit.shape, prep_ab.shape)

(132, 11763) (132, 2308) (132, 9455)


In [65]:
from sklearn.linear_model import LogisticRegressionCV
scores = {}

logistic = LogisticRegressionCV()
logistic.fit(x_train, y)
score_logistic = logistic.score(x_test, y_test)
print(f'Accuracy for Logistic Regression Classifier with CountVectorization - {score_logistic}')

Accuracy for Logistic Regression Classifier with CountVectorization - 0.8181818181818182


In [66]:
from sklearn.svm import LinearSVC
linearSVC = LinearSVC()
linearSVC.fit(x_train, y)
score_lSVC = linearSVC.score(x_test, y_test)
print(f'Accuracy for SupportVectorClassifier with CountVectorization - {score_lSVC}')

Accuracy for SupportVectorClassifier with CountVectorization - 0.7954545454545454


In [67]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier()
tree.fit(x_train, y)
score_tree = tree.score(x_test, y_test)
print(f'Accuracy for DecisionTreeClassifier with CountVectorization - {score_tree}')



forest = RandomForestClassifier()
forest.fit(x_train, y)
score_forest = forest.score(x_test, y_test)
print(f'Accuracy for RandomForestClassifier with CountVectorization - {score_forest}')



ada_boost = AdaBoostClassifier()
ada_boost.fit(x_train, y)
score_ada = ada_boost.score(x_test, y_test)
print(f'Accuracy for Tree-based AdaBoost with CountVectorization - {score_ada}')

Accuracy for DecisionTreeClassifier with CountVectorization - 0.75
Accuracy for RandomForestClassifier with CountVectorization - 0.8484848484848485
Accuracy for Tree-based AdaBoost with CountVectorization - 0.8257575757575758


**Try 1**

All of the classifiers performed not better than the baseline classifier (accuracy of 84,85%). This is a disapointing but fully predictable result that is probably caused by the class imbalance.

Error analysis
====
Let's see what are the mistakes made by the new classifiers, based on results of Random Forest classification