## Data Vectorisation and Modelling
##### This code vectorises the text data to numerical format in the features dataset, in preparation for multi-label modelling. 

### 1. Import Data

In [26]:
# Import packages
import pandas as pd
import numpy as np

# Import data
features = pd.read_excel(r"/Users/ishanisahama/Documents/Data Science/github_blog/multi-label classification/output/features_proc.xlsx", index_col=0)
labels = pd.read_excel(r"/Users/ishanisahama/Documents/Data Science/github_blog/multi-label classification/output/labels_proc.xlsx", index_col=0)
features.head()

Unnamed: 0,pmid,keywords
0,8549602,"['cervical', 'cancer', 'hpv', 'expression', 'd..."
1,21736816,"['trimester', 'serum', 'levels', 'women', 'vit..."
2,19060934,"['opioid', 'tyr', 'pro', 'activity', 'atypical..."
3,11426874,"['membrane', 'multilayer', 'encapsulation', 'p..."
4,28323099,"['nanogels', 'obtained', 'cells', 'dox', 'drug..."


### 2. Text Preprocessing

In [27]:
# Import packages
import re

# Preprocess keyword output (removes everything except words and commas)
features["keywords"] = [re.sub(r'[^a-zA-Z, ]', "", str(x)) for x in features["keywords"]]
features.head()


Unnamed: 0,pmid,keywords
0,8549602,"cervical, cancer, hpv, expression, dysplasia, ..."
1,21736816,"trimester, serum, levels, women, vitamin, stat..."
2,19060934,"opioid, tyr, pro, activity, atypical, peptides..."
3,11426874,"membrane, multilayer, encapsulation, pancreati..."
4,28323099,"nanogels, obtained, cells, dox, drug, acryloyl..."


### 3. Prepare input table for Modelling

In [28]:
# Merge dataset
df = pd.merge(features, labels, on="pmid").drop_duplicates()
df.shape


(9996, 6)

In [29]:
df.head()

Unnamed: 0,pmid,keywords,anatomy,organisms,diseases,chemicals_and_drugs
0,8549602,"cervical, cancer, hpv, expression, dysplasia, ...",0,1,1,1
1,21736816,"trimester, serum, levels, women, vitamin, stat...",0,1,1,1
2,19060934,"opioid, tyr, pro, activity, atypical, peptides...",1,1,0,1
3,11426874,"membrane, multilayer, encapsulation, pancreati...",1,1,1,1
4,28323099,"nanogels, obtained, cells, dox, drug, acryloyl...",1,1,0,1


In [30]:
# Transform words to feature vectors 
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(min_df=2, max_df=0.8, ngram_range=(1,1))
X = vectorizer.fit_transform(df["keywords"]).toarray()

from sklearn.feature_extraction.text import TfidfTransformer
tfidfconverter = TfidfTransformer()
X = tfidfconverter.fit_transform(X).toarray()

In [31]:
# Create labels for the modelling input table 
column_names = vectorizer.get_feature_names_out().tolist()
column_names.extend(["target_anatomy", "target_organisms", "target_diseases", "target_chemicals_and_drugs"])

In [32]:
# Grab the target values for the modelling input table
cols = ["anatomy", "organisms", "diseases", "chemicals_and_drugs"]
label = df[cols].values.tolist()

In [33]:
# Create the modelling input table
np_comb = np.column_stack((X, label))
model = pd.DataFrame(np_comb, columns=column_names)
model.head()

Unnamed: 0,aaa,aav,aba,abc,abca,abdomen,abdominal,aberrant,aberrations,abeta,...,zidovudine,zinc,zns,zona,zone,zones,target_anatomy,target_organisms,target_diseases,target_chemicals_and_drugs
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0


### 4. Multi-Label Classification Modelling
In this modelling exercise, MLkNN was used as highlighted in the "modelling_notes_and_caveats.md" file [https://github.com/ishani-ss/Multi-Label-Classification/blob/main/modelling_notes_and_caveats.md] of this repository, due to the affordance of the predict_proba functionality. This functionality provides label predictions through proportions, matching the work problem requirements. 

In [34]:
# Select features (X) and labels (y) datasets
X = model.drop(["target_anatomy", "target_organisms", "target_diseases", "target_chemicals_and_drugs"], axis=1)
y = model[["target_anatomy", "target_organisms", "target_diseases", "target_chemicals_and_drugs"]]

# Convert datasets to sparse format for downstream modelling (keep as "int32" to avoid downstream errors related to incomplete data and sparse matrices)
from scipy.sparse import csr_matrix
X = csr_matrix(X.astype(pd.SparseDtype("int32", 0)).sparse.to_coo())
y = csr_matrix(y.astype(pd.SparseDtype("int32", 0)).sparse.to_coo())

# Train-Test splitting of modelling data (70:30 split)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=123)

In [35]:
# COMPUTATIONALLY EXPENSIVE: Use GridSearchCV to find the optimal set of parameters for MLKNN:
from skmultilearn.adapt import MLkNN
from sklearn.model_selection import GridSearchCV

parameters = {'k': range(3,20), 's': [0.5, 0.75, 1.0]}
score = 'f1_macro'

clf = GridSearchCV(MLkNN(), parameters, scoring=score)
clf.fit(X_train, y_train)

# The "best_score" is the mean cross-validated score of the best_estimator (will need to investigate further)
print (clf.best_params_, clf.best_score_)

# SOURCES:
# http://scikit.ml/api/skmultilearn.adapt.mlknn.html#multilabel-k-nearest-neighbours
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html#sklearn.model_selection.GridSearchCV

{'k': 3, 's': 0.5} 0.6041609751347842


In [36]:
# Build the model
from skmultilearn.adapt import MLkNN

## Initialise model
clf = MLkNN(k=3, s=0.5)

## Train
clf.fit(X_train, y_train)

## Predict
pred = clf.predict(X_test) # Further work will need to be undertaken into "predict_proba" as the number of rows do not match the true labels

## Output metrics
import sklearn.metrics as metrics
metrics.precision_score(y_test, pred, average="weighted")

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0.600589004761517

In [37]:
metrics.recall_score(y_test, pred, average="weighted")

0.8185648452929559

In [38]:
# MODEL SOURCE
'''
@article{zhang2007ml,
  title={ML-KNN: A lazy learning approach to multi-label learning},
  author={Zhang, Min-Ling and Zhou, Zhi-Hua},
  journal={Pattern recognition},
  volume={40},
  number={7},
  pages={2038--2048},
  year={2007},
  publisher={Elsevier}
}
'''

'\n@article{zhang2007ml,\n  title={ML-KNN: A lazy learning approach to multi-label learning},\n  author={Zhang, Min-Ling and Zhou, Zhi-Hua},\n  journal={Pattern recognition},\n  volume={40},\n  number={7},\n  pages={2038--2048},\n  year={2007},\n  publisher={Elsevier}\n}\n'