In [2]:
import os
import pickle
import re

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split


In [3]:
# Load the semicolon-delimited dataset; later cells read its 'Content' and
# 'Category' columns.
# NOTE(review): absolute path to an external volume -- the notebook only runs
# on a machine where this drive is mounted at the same location.
df_path = "/Volumes/TOSHIBA EXT/phd/DSS/ToolsClassifier/Datasets/Tools_Abst_9classes.csv"
df = pd.read_csv(df_path, delimiter=';')


In [6]:
# Text cleaning: 'Content' -> 'Content_Parsed_1' (lowercased) -> 'Content_Parsed_2'
# (links, numbers, special characters, punctuation and short tokens removed).
#
# Every `.str.replace` call states `regex=` explicitly. pandas 2.0 changed the
# default from True to False, which would silently turn the pattern-based steps
# below into literal substring searches that never match.

# Lowercasing the text
df['Content_Parsed_1'] = df['Content'].str.lower()

# removing links
regex_link = r"\bhttp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b"
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.replace(regex_link, "", regex=True)

# removing numbers (stand-alone runs of digits)
regex_nums = r"\b[0-9][0-9]*\b"
df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(regex_nums, "", regex=True)

# removing special characters
# Literal replacement: several of these ('(', '[', '|', '\\', '+', '$') are
# regex metacharacters and must not be interpreted as patterns.
special_character = list("←=()[]/‘’|><\\∼+%$&×–−-·")
for spec_char in special_character:
    df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(spec_char, '', regex=False)


# removing punctuation
# Literal replacement: '?' and '.' are regex metacharacters.
punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(punct_sign, '', regex=False)

# removing strings with length 1-2
regex_short = r"\b\w{0,2}\b"
df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(regex_short, "", regex=True)

# removing strings starting with numbers
# NOTE(review): as written, the pattern only matches a run of digits followed
# by exactly one more word character (e.g. "16s"); a longer token that merely
# starts with a digit (e.g. "16srna") is kept. Confirm this is intended.
regex_short = r"\b[0-9][0-9]*\w\b"
df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(regex_short, "", regex=True)


# Lemmatization: 'Content_Parsed_2' -> 'Content_Parsed_3'
print("------------------------------------------------------------")
# The WordNet corpus must be available locally for the lemmatizer
nltk.download('wordnet')
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()
nrows = len(df)

# Lemmatize every space-separated token as a verb (pos="v") and re-join with
# single spaces. Iterating over the column values (instead of `df.loc[row]`
# with a 0..nrows-1 counter) keeps this correct for any index and avoids
# materialising a full row on every iteration.
lemmatized_text_list = [
    " ".join(
        wordnet_lemmatizer.lemmatize(word, pos="v")
        for word in text.split(" ")
    )
    for text in df['Content_Parsed_2']
]
df['Content_Parsed_3'] = lemmatized_text_list

# removing possessive pronoun terminations
# Literal match, stated explicitly so the result does not depend on the
# pandas-version-specific default of `regex`.
# NOTE(review): the earlier short-token pattern r"\b\w{0,2}\b" already strips a
# lone "s" that follows an apostrophe, so this step likely finds nothing to
# remove on text produced by this pipeline -- confirm the intended order.
df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace("'s", "", regex=False)

# removing english stop words
# Downloading the stop words list
nltk.download('stopwords')
# Loading the stop words in english
stop_words = list(stopwords.words('english'))
# looping through all stop words
for stop_word in stop_words:
    # Whole-word match. `regex=True` is required: since pandas 2.0 the default
    # is a literal search, under which the "\b" anchors would never match and
    # no stop word would be removed. `re.escape` keeps the word itself literal.
    regex_stopword = r"\b" + re.escape(stop_word) + r"\b"
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(regex_stopword, '', regex=True)


------------------------------------------------------------


[nltk_data] Downloading package wordnet to /Users/mac/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mac/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Raw abstract of the row labelled 0, shown for comparison with its parsed versions
df.loc[0, 'Content']


'The alignment of sequencing reads against a protein reference database is a major computational bottleneck in metagenomics and data-intensive evolutionary projects. Although recent tools offer improved performance over the gold standard BLASTX, they exhibit only a modest speedup or low sensitivity. We introduce DIAMOND, an open-source algorithm based on double indexing that is 20,000 times faster than BLASTX on short reads and has a similar degree of sensitivity.'

In [8]:
# Preview the first rows, with the raw text next to each Content_Parsed_* stage
df.head()

Unnamed: 0,File_Name,Content,Category,Content_Parsed_1,Content_Parsed_2,Content_Parsed_3
0,Diamond.txt,The alignment of sequencing reads against a pr...,Alignment,the alignment of sequencing reads against a pr...,the alignment sequencing reads against prote...,alignment sequence read protein reference ...
1,Minimap2.txt,Recent advances in sequencing technologies pro...,Alignment,recent advances in sequencing technologies pro...,recent advances sequencing technologies promi...,recent advance sequence technologies promise ...
2,Bowtie.txt,"Bowtie is an ultrafast, memory-efficient align...",Alignment,"bowtie is an ultrafast, memory-efficient align...",bowtie ultrafast memoryefficient alignment p...,bowtie ultrafast memoryefficient alignment p...
3,HISAT.txt,The human reference genome represents only a s...,Alignment,the human reference genome represents only a s...,the human reference genome represents only sm...,human reference genome represent small numb...
4,STAR.txt,Motivation: Accurate alignment of high-through...,Alignment,motivation: accurate alignment of high-through...,motivation accurate alignment highthroughput ...,motivation accurate alignment highthroughput ...


In [9]:
# Keep the file name, raw text, category and the final parsed text only; the
# final parsing stage is exposed under the shorter name 'Content_Parsed'.
list_columns = ["File_Name", "Content","Category", "Content_Parsed_3"]
df = df[list_columns].rename(columns={'Content_Parsed_3': 'Content_Parsed'})

In [10]:
# Integer label assigned to each tool category
category_codes = {
    'Alignment': 0,
    'Classification': 1,
    'VirusDetection': 2,
    'VirusIdentification': 3,
    'Mapping': 4,
    'Assembly': 5,
    'AbundanceEstimation': 6,
    'Trimming': 7,
    'QualityControl': 8
}

# Category mapping
# `Series.map` is used instead of copying the column and calling
# `DataFrame.replace`: replacing strings with ints in an object column relies
# on silent downcasting, which pandas 2.2 deprecates, so the labels would end
# up with object dtype. `map` yields a numeric column directly.
category_code_values = df['Category'].map(category_codes)

# `map` produces NaN for any category missing from `category_codes`; fail
# loudly here rather than passing unlabeled rows on to the train/test split.
unknown_categories = df.loc[category_code_values.isna(), 'Category'].unique()
if len(unknown_categories) > 0:
    raise ValueError(
        "Categories without an entry in category_codes: {}".format(
            sorted(str(category) for category in unknown_categories)
        )
    )

df['Category_Code'] = category_code_values.astype('int64')

In [11]:
# Hold out 15% of the documents for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['Content_Parsed'],
    df['Category_Code'],
    test_size=0.15,
    random_state=8,
)

In [12]:
# TF-IDF Vectors as features
# Parameter election
ngram_range = (1,2)   # unigrams and bigrams
min_df = 10           # integer: minimum number of documents a term must appear in
max_df = 1.           # float: maximum document proportion (1.0 = no upper cut-off)
max_features = 60     # cap on the vocabulary size

tfidf = TfidfVectorizer(
    encoding='utf-8',
    ngram_range=ngram_range,
    stop_words=None,      # stop words were already stripped during parsing
    lowercase=False,      # text was already lowercased during parsing
    max_df=max_df,
    min_df=min_df,
    max_features=max_features,
    norm='l2',
    sublinear_tf=True,
)

# The vectorizer is fitted on the training texts only; the test texts are
# transformed with that fitted vectorizer.
features_train = tfidf.fit_transform(X_train).toarray()
labels_train = y_train
features_test = tfidf.transform(X_test).toarray()
labels_test = y_test

print(features_train.shape)
print(features_test.shape)

# for the models stratified cross validation
features = tfidf.transform(df['Content_Parsed']).toarray()



(72, 60)
(13, 60)


In [13]:
# Most correlated unigrams and bigrams per category (one-vs-rest chi-squared)

# `get_feature_names()` was removed in scikit-learn 1.2; prefer its
# replacement `get_feature_names_out()` and fall back for older releases.
# The names do not depend on the category, so fetch them once.
if hasattr(tfidf, 'get_feature_names_out'):
    all_feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    all_feature_names = np.asarray(tfidf.get_feature_names())

for category_name, category_id in sorted(category_codes.items()):
    # chi2 returns (statistics, p-values); rank features by the statistic
    # against the binary target "document belongs to this category".
    features_chi2 = chi2(features_train, labels_train == category_id)
    indices = np.argsort(features_chi2[0])
    feature_names = all_feature_names[indices]
    # Ascending order, so the most correlated terms are at the end of each list
    unigrams = [v for v in feature_names if len(v.split(' ')) == 1]
    bigrams = [v for v in feature_names if len(v.split(' ')) == 2]
    print("# '{}' category:".format(category_name))
    print("  . Most correlated unigrams:\n. {}".format('\n. '.join(unigrams[-5:])))
    print("  . Most correlated bigrams:\n. {}".format('\n. '.join(bigrams[-2:])))
    print("")

# 'AbundanceEstimation' category:
  . Most correlated unigrams:
. species
. approach
. metagenomic
. methods
. abundance
  . Most correlated bigrams:
. sequence data

# 'Alignment' category:
  . Most correlated unigrams:
. short
. map
. human
. algorithm
. alignment
  . Most correlated bigrams:
. sequence data

# 'Assembly' category:
  . Most correlated unigrams:
. alignment
. produce
. genome
. assemble
. assembly
  . Most correlated bigrams:
. sequence data

# 'Classification' category:
  . Most correlated unigrams:
. present
. species
. metagenomic
. metagenomics
. classification
  . Most correlated bigrams:
. sequence data

# 'Mapping' category:
  . Most correlated unigrams:
. new
. short
. alignment
. reference
. map
  . Most correlated bigrams:
. sequence data

# 'QualityControl' category:
  . Most correlated unigrams:
. feature
. ngs
. generate
. trim
. quality
  . Most correlated bigrams:
. sequence data

# 'Trimming' category:
  . Most correlated unigrams:
. reference
. genome

In [14]:
# Persist the split data, the feature matrices and the fitted vectorizer.

# Create the output folder if needed so a fresh checkout does not fail with
# FileNotFoundError on the first `open`.
os.makedirs('Pickles', exist_ok=True)

# (destination path, object) pairs, written in this order
pickle_targets = [
    ('Pickles/X_train.pickle', X_train),
    ('Pickles/X_test.pickle', X_test),
    ('Pickles/y_train.pickle', y_train),
    ('Pickles/y_test.pickle', y_test),
    ('Pickles/df.pickle', df),
    ('Pickles/features.pickle', features),
    ('Pickles/features_train.pickle', features_train),
    ('Pickles/labels_train.pickle', labels_train),
    ('Pickles/features_test.pickle', features_test),
    ('Pickles/labels_test.pickle', labels_test),
    # TF-IDF object
    ('Pickles/tfidf.pickle', tfidf),
]

for pickle_path, artifact in pickle_targets:
    with open(pickle_path, 'wb') as output:
        pickle.dump(artifact, output)