In [1]:
import pickle
import re
import nltk
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import chi2

In [7]:
#df_path = "/Users/mac/Desktop/MetagenomicToolsClassifier/Abstracts/Abstracts.csv"
df_path = "/Users/kaoutar/Desktop/MetagenomicToolsClassifier/Abstracts+Methods.csv"

df = pd.read_csv(df_path, sep=';')

In [8]:
df['Content'][1]

'RAPSearch: a fast protein similarity search tool for short reads\nNext Generation Sequencing (NGS) is producing enormous corpuses of short DNA reads, affecting emerging fields like metagenomics. Protein similarity search--a key step to achieve annotation of protein-coding genes in these short reads, and identification of their biological functions--faces daunting challenges because of the very sizes of the short read datasets. We developed a fast protein similarity search tool RAPSearch that utilizes a reduced amino acid alphabet and suffix array to detect seeds of flexible length. For short reads (translated in 6 frames) we tested, RAPSearch achieved ~20-90 times speedup as compared to BLASTX. RAPSearch missed only a small fraction (~1.3-3.2%) of BLASTX similarity hits, but it also discovered additional homologous proteins (~0.3-2.1%) that BLASTX missed. By contrast, BLAT, a tool that is even slightly faster than RAPSearch, had significant loss of sensitivity as compared to RAPSearch

# Preprocessing

In [9]:
# Lowercasing the text
df['Content_Parsed_1'] = df['Content'].str.lower()

# removing links
regex_link = r"\bhttp[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+\b"
df['Content_Parsed_2'] = df['Content_Parsed_1'].str.replace(regex_link, "")

# removing numbers
regex_nums = r"\b[0-9][0-9]*\b"
df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(regex_nums, "")

# removing special characters
special_character = list("←=()[]/‘’|><\\∼+%$&×–−-·")
for spec_char in special_character:
    df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(spec_char, '')


# removing punctuation
punctuation_signs = list("?:!.,;")
for punct_sign in punctuation_signs:
    df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(punct_sign, '') 
    
# removing strings with length 1-2
regex_short = r"\b\w{0,2}\b"
df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(regex_short, "")    

# removing strings starting with numbers
regex_short = r"\b[0-9][0-9]*\w\b"
df['Content_Parsed_2'] = df['Content_Parsed_2'].str.replace(regex_short, "")


# Lemmatization
# Downloading punkt and wordnet from NLTK
# nltk.download('punkt')
print("------------------------------------------------------------")
nltk.download('wordnet')
# Saving the lemmatizer into an object
wordnet_lemmatizer = WordNetLemmatizer()
# Iterating through every word to lemmatize
nrows = len(df)
lemmatized_text_list = []
for row in range(0, nrows):
    
    # Create an empty list containing lemmatized words
    lemmatized_list = []
    
    # Save the text and its words into an object
    text = df.loc[row]['Content_Parsed_2']
    text_words = text.split(" ")

    # Iterate through every word to lemmatize
    for word in text_words:
        lemmatized_list.append(wordnet_lemmatizer.lemmatize(word, pos="v"))
        
    # Join the list
    lemmatized_text = " ".join(lemmatized_list)
    
    # Append to the list containing the texts
    lemmatized_text_list.append(lemmatized_text)
df['Content_Parsed_3'] = lemmatized_text_list

# removing possessive pronoun terminations
df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace("'s", "")

# removing english stop words
# Downloading the stop words list
nltk.download('stopwords')
# Loading the stop words in english
stop_words = list(stopwords.words('english'))
# looping through all stop words
for stop_word in stop_words:
    regex_stopword = r"\b" + stop_word + r"\b"
    df['Content_Parsed_3'] = df['Content_Parsed_3'].str.replace(regex_stopword, '')


------------------------------------------------------------


[nltk_data] Downloading package wordnet to /Users/kaoutar/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/kaoutar/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
list_columns = ["File_Name", "Content","Category", "Content_Parsed_3"]
df = df[list_columns]
df = df.rename(columns={'Content_Parsed_3': 'Content_Parsed'})

In [13]:
category_codes = {
    'Alignment': 0,
    'Classification': 1,
    'VirusDetection': 2,
    'VirusIdentification': 3,
    'Mapping': 4,
    'Assembly': 5,
    'AbundanceEstimation': 6,
    'Trimming': 7,
    'QualityControl': 8,
    'Annotation' : 9,
    'SNPDiscovery' : 10,
    'Visualization' : 11,
    'AssemblyEvaluation' : 12
}

# Category mapping
df['Category_Code'] = df['Category']
df = df.replace({'Category_Code':category_codes})

In [14]:
df['Content_Parsed'][1]

'rapsearch  fast protein similarity search tool  short reads\nnext generation sequence ngs  produce enormous corpuses  short dna read affect emerge field like metagenomics protein similarity searcha key step  achieve annotation  proteincoding genes   short read  identification   biological functionsfaces daunt challenge     size   short read datasets  develop  fast protein similarity search tool rapsearch  utilize  reduce amino acid alphabet  suffix array  detect seed  flexible length  short read translate   frame  test rapsearch achieve ~ time speedup  compare  blastx rapsearch miss   small fraction ~  blastx similarity hit   also discover additional homologous proteins ~  blastx miss  contrast blat  tool   even slightly faster  rapsearch  significant loss  sensitivity  compare  rapsearch  blast  enable faster protein similarity search  application  rapsearch  metageomics  also  demonstrated\nrapsearch adopt  seedextension approach  blast  identify  seed  maximal exact match mems   re

In [16]:
list_columns = ["File_Name", "Content_Parsed", "Category_Code"]
df1 = df[list_columns]

In [17]:
df1.sample(5)

Unnamed: 0,File_Name,Content_Parsed,Category_Code
192,SNPServer,snpserver realtime snp discovery tool\nsnpser...,10
151,GRAMMy,accurate genome relative abundance estimation ...,6
178,FastQ Screen,fastq screen tool multigenome map quality c...,8
67,VirSorter,virsorter mine viral signal microbial genomic...,3
103,ALLPATHS,allpaths novo assembly wholegenome shotgun m...,5


In [19]:
# df
with open('/Users/kaoutar/Desktop/MetagenomicToolsClassifier/FE-dfAM.pickle', 'wb') as output:
    pickle.dump(df, output) 