### NLP Pipeline

Let's start from importing basic libraries:

In [1]:
import os

import numpy as np
import pandas as pd
# import tensorflow as tf

import warnings
warnings.filterwarnings("ignore")

Now let's read our train data and create a DataFrame:

In [2]:
train_df = pd.read_csv('./data/drug_review_train.csv')
train_df.head()

Unnamed: 0.1,Unnamed: 0,patient_id,drugName,condition,review,rating,date,usefulCount,review_length
0,0,89879,Cyclosporine,keratoconjunctivitis sicca,"""i have used restasis for about a year now and...",2.0,"April 20, 2013",69,147
1,1,143975,Etonogestrel,birth control,"""my experience has been somewhat mixed. i have...",7.0,"August 7, 2016",4,136
2,2,106473,Implanon,birth control,"""this is my second implanon would not recommen...",1.0,"May 11, 2016",6,140
3,3,184526,Hydroxyzine,anxiety,"""i recommend taking as prescribed, and the bot...",10.0,"March 19, 2012",124,104
4,4,91587,Dalfampridine,multiple sclerosis,"""i have been on ampyra for 5 days and have bee...",9.0,"August 1, 2010",101,74


Downloading libraries, necessary for preprocessing:

In [3]:
!python.exe -m pip install --upgrade pip
!pip install nltk --upgrade --quiet
# !pip install beautifulsoup4 --upgrade --quiet
# !pip install contractions --upgrade --quiet
# 
# !pip install unidecode --upgrade --quiet
# !pip install textblob --upgrade --quiet
# !pip install pyspellchecker --upgrade --quiet

Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/85/8a/1ddf40be20103bcc605db840e9ade09c8e8c9f920a03e9cfe88eae97a058/pip-25.0-py3-none-any.whl.metadata
  Downloading pip-25.0-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-25.0-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
    --------------------------------------- 0.0/1.8 MB ? eta -:--:--
   - -------------------------------------- 0.1/1.8 MB 991.0 kB/s eta 0:00:02
   -- ------------------------------------- 0.1/1.8 MB 1.1 MB/s eta 0:00:02
   --- ------------------------------------ 0.2/1.8 MB 919.0 kB/s eta 0:00:02
   ---- ----------------------------------- 0.2/1.8 MB 985.7 kB/s eta 0:00:02
   ----- ---------------------------------- 0.3/1.8 MB 983.0 kB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.8 MB 947.5 kB/s eta 0:00:02
   ------ --------------------------------- 0.3/1.8 MB 842.9 kB/s eta 0:00:02
   -----

Let's create a new Dataframe for preprocessed data:

In [4]:
prep_df = pd.DataFrame()

prep_df['patient_id'] = train_df['patient_id']
prep_df['review'] = train_df['review']
prep_df['drugName'] = train_df['drugName'].apply(lambda x: x.lower())

Relabeling rating column:

In [5]:
def relabel_rating(rating):
    if 0 <= rating <= 4:
        return 'Negative'
    elif 5 <= rating <= 7:
        return 'Neutral'
    elif 8 <= rating <= 10:
        return 'Positive'

prep_df['rating_category'] = train_df['rating'].apply(relabel_rating)

In [6]:
prep_df.head()

Unnamed: 0,patient_id,review,drugName,rating_category
0,89879,"""i have used restasis for about a year now and...",cyclosporine,Negative
1,143975,"""my experience has been somewhat mixed. i have...",etonogestrel,Neutral
2,106473,"""this is my second implanon would not recommen...",implanon,Negative
3,184526,"""i recommend taking as prescribed, and the bot...",hydroxyzine,Positive
4,91587,"""i have been on ampyra for 5 days and have bee...",dalfampridine,Positive


Now let's import libraries for text preprocessing:

In [8]:
import nltk
nltk.download('punkt_tab')

from nltk import word_tokenize

train_df['review'] = train_df['review'].apply(word_tokenize)
prep_df.head()

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\khrystyna.i.dolynska\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.


Unnamed: 0,patient_id,review,drugName,rating_category
0,89879,"""i have used restasis for about a year now and...",cyclosporine,Negative
1,143975,"""my experience has been somewhat mixed. i have...",etonogestrel,Neutral
2,106473,"""this is my second implanon would not recommen...",implanon,Negative
3,184526,"""i recommend taking as prescribed, and the bot...",hydroxyzine,Positive
4,91587,"""i have been on ampyra for 5 days and have bee...",dalfampridine,Positive


In [9]:
import re
import string
import contractions

import logging

import nltk
from nltk.corpus import stopwords

from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

from bs4 import BeautifulSoup
from unidecode import unidecode
from textblob import TextBlob

nltk.download("stopwords")
sw_nltk = stopwords.words('english')
nltk.download("wordnet")
nltk.download('averaged_perceptron_tagger_eng')

lemmatizer = WordNetLemmatizer()

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\khrystyna.i.dolynska\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\khrystyna.i.dolynska\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\khrystyna.i.dolynska\AppData\Roaming\nltk_dat
[nltk_data]     a...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger_eng.zip.


In [10]:
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

# Lemmatize with POS tagging for lists
# def lemmatize_with_pos(tokens):
#     pos_tags = pos_tag(tokens)
#     lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
#     return lemmatized_tokens

def lemmatize_with_pos(tokens):
    if isinstance(tokens, str):
        tokens = word_tokenize(tokens)

    if not isinstance(tokens, list):
        raise TypeError("tokens: expected a list of strings, got a string")
    
    pos_tags = pos_tag(tokens)
    lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return lemmatized_tokens

In [11]:
class Pipeline:
    def __init__(self, X):
        self.X = X
        
    def to_lower(self):
        # Let's check if first element is a list
        if isinstance(self.X.iloc[0], list):     
            self.X = self.X.apply(lambda tokens: [token.lower() for token in tokens])
        else:
            self.X = self.X.apply(lambda x: x.lower())
        logger.info("Lowercase done")
        return self
    
    def remove_numbers(self):
        if isinstance(self.X.iloc[0], list):
            self.X = self.X.apply(lambda tokens: [re.sub(r'\d+', '', token) for token in tokens])
        else:
            self.X = self.X.apply(lambda x: re.sub(r'\d+', '', x))
        logger.info("Numbers removal done")
        return self

    def remove_dots(self):
        if isinstance(self.X.iloc[0], list):     
            self.X = self.X.apply(lambda tokens: [re.sub("[.]", "", token) for token in tokens])
        else:
            self.X = self.X.apply(lambda x: re.sub("[.]", "", x))
        logger.info("Dots removal done")
        return self
    
    def remove_punctuation(self):
        # '!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~' 32 punctuations in python string module
        if isinstance(self.X.iloc[0], list):     
            self.X = self.X.apply(lambda tokens: [re.sub('[%s]' % re.escape(string.punctuation), '', token) for token in tokens])
        else:
            self.X = self.X.apply(lambda x: re.sub('[%s]' % re.escape(string.punctuation), '', x))
        logger.info("Punctuation removal done")
        return self
    
    def remove_multi_whitespace(self):
        if isinstance(self.X.iloc[0], list):     
            self.X = self.X.apply(lambda tokens: [re.sub(' +', ' ', token) for token in tokens])
        else:
            self.X = self.X.apply(lambda x: re.sub(' +', ' ', x))
        logger.info("Multi whitespaces removal done")
        return self
    
    def expand_contractions(self):
        if isinstance(self.X.iloc[0], list):
            self.X = self.X.apply(
                lambda tokens: [contractions.fix(str(token)) for token in tokens if isinstance(token, str)]
            )
        else: 
            self.X = self.X.apply(
                lambda x: " ".join([contractions.fix(str(word)) for word in x.split() if isinstance(word, str)])
            )
        logger.info("Contractions expand done")
        return self

    # Is this step usable for current dataset?
    def remove_html_tags(self):
        self.X = self.X.apply(
            lambda x: BeautifulSoup(x, 'html.parser').get_text())
        logger.info("HTML tags removal done")
        return self

    def replace_diacritics(self):
        def process_tokens(tokens):
            try:
                return [unidecode(str(token)) for token in tokens]
            except Exception as e:
                logger.info(f"Error processing tokens: {tokens}. Error: {e}")
                return tokens
    
        if isinstance(self.X.iloc[0], list):
            self.X = self.X.apply(lambda tokens: process_tokens(tokens) if isinstance(tokens, list) else tokens)
        else:
            self.X = self.X.apply(lambda x: unidecode(str(x)) if isinstance(x, str) else str(x))
        
        logger.info("Diacritics replacement done")
        return self
    
    def spellcheck(self):
        self.X = self.X.apply(lambda tokens: [str(TextBlob(token).correct()) for token in tokens] if isinstance(tokens, list) else tokens)
        logger.info("Spellcheck done")
        return self
    
    # Will NOT be used for Transformers
    def remove_stopwords(self):
        # Possible to add custom stopwords
        # new_stopwords = ['drugs']
        # sw_nltk.extend(new_stopwords)
        # Possible to remove already existing stopwords
        # sw_nltk.remove('not')
        self.X = self.X.apply(lambda tokens: [token for token in tokens if token not in sw_nltk] if isinstance(tokens, list) else tokens)
        logger.info("Stopwords removal done")
        return self
    
    # Will NOT be used for Transformers
    def lemmatize(self):
        self.X = self.X.apply(lemmatize_with_pos)
        logger.info("Lemmatization done")
        return self

In [12]:
text_preprocessor = Pipeline(train_df['review'])

In [13]:
prep_df['review'] = text_preprocessor.to_lower().remove_numbers().remove_punctuation().remove_multi_whitespace().X

INFO:__main__:Lowercase done
INFO:__main__:Numbers removal done
INFO:__main__:Punctuation removal done
INFO:__main__:Multi whitespaces removal done


In [14]:
prep_df['review'] = text_preprocessor.replace_diacritics().expand_contractions().spellcheck().X

INFO:__main__:Diacritics replacement done
INFO:__main__:Contractions expand done
INFO:__main__:Spellcheck done


In [15]:
prep_df.head()

Unnamed: 0,patient_id,review,drugName,rating_category
0,89879,"[, i, have, used, stasis, for, about, a, year,...",cyclosporine,Negative
1,143975,"[, my, experience, has, been, somewhat, mixed,...",etonogestrel,Neutral
2,106473,"[, this, is, my, second, implanon, would, not,...",implanon,Negative
3,184526,"[, i, recommend, taking, as, prescribed, , and...",hydroxyzine,Positive
4,91587,"[, i, have, been, on, ampyra, for, , days, and...",dalfampridine,Positive


Now let's preprocess our data and save it to .csv file:

In [16]:
prep_df.to_csv('prep_data/drug_review_train_auto_prep_sc.csv')

In [17]:
prep_df['review'] = text_preprocessor.remove_stopwords().lemmatize().X

INFO:__main__:Stopwords removal done
INFO:__main__:Lemmatization done


In [18]:
prep_df.to_csv('prep_data/drug_review_train_auto_prep_full_sc.csv')

In [19]:
prep_2_df = pd.read_csv('./prep_data/drug_review_train_auto_prep.csv')
# prep_df == prep_2_df

ValueError: Can only compare identically-labeled (both index and columns) DataFrame objects