# Text preprocessing

In [None]:
# Importing necessary packages
import pandas as pd
import numpy as np
import re, string
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer

# Download the NLTK data packages this notebook relies on, in a fixed order.
for _resource in ('punkt', 'averaged_perceptron_tagger', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(_resource)

from imblearn.under_sampling import RandomUnderSampler

import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [None]:
# Reading in the data
data1 = pd.read_csv("medical_tc_train.csv")
data2 = pd.read_csv("medical_tc_test.csv")

# Stack both files into a single frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each half keeps its own row labels, so the
# combined frame carries duplicate labels and any later label-based lookup or
# index alignment becomes ambiguous.
data = pd.concat([data1, data2], ignore_index=True)

In [None]:
# Count the missing values in every column of the combined data
na_per_column = data.isna().sum()
na_per_column

In [None]:
# Number of rows per condition label (the classes are unbalanced)
label_counts = data['condition_label'].value_counts()
label_counts

In [None]:
# Average number of whitespace-separated words per abstract, printed for
# condition labels 1 to 5 in order (one value per line).
def _count_words(abstract):
    """Return how many whitespace-separated tokens ``abstract`` contains."""
    return len(str(abstract).split())


data['word_count'] = data['medical_abstract'].apply(_count_words)
for _label in range(1, 6):
    _in_class = data['condition_label'] == _label
    print(data.loc[_in_class, 'word_count'].mean())

In [None]:
# Define functions for preprocessing
# Patterns used by preprocess(), compiled once instead of on every call.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_WORD_RE = re.compile(r'[^\w\s]')
_DIGIT_RE = re.compile(r'\d')
_WHITESPACE_RE = re.compile(r'\s+')


def preprocess(text):
    """Normalise raw text to lowercase words separated by single spaces.

    Steps, in order: lowercase, replace ASCII punctuation with a space,
    delete any remaining character that is neither a word character nor
    whitespace, replace digits with a space, collapse whitespace runs to a
    single space, and strip the ends.

    Parameters
    ----------
    text : object
        The text to clean. It is coerced with ``str()`` first, so a
        non-string value (for example a NaN cell) is cleaned as its string
        form instead of raising ``AttributeError`` on ``.lower()``.

    Returns
    -------
    str
        The cleaned text, with no leading or trailing whitespace.
    """
    text = str(text).lower()
    text = _PUNCTUATION_RE.sub(' ', text)  # punctuation becomes a word break
    text = _NON_WORD_RE.sub('', text)      # drop remaining special characters
    text = _DIGIT_RE.sub(' ', text)        # numbers become a word break
    text = _WHITESPACE_RE.sub(' ', text)   # normalize spacing
    # Strip last: the substitutions above can leave a space at either end
    # (e.g. a trailing full stop), which an earlier strip would not catch.
    return text.strip()


# Removing stopwords
# English stop words, loaded on first use and then reused by stopword().
_ENGLISH_STOPWORDS = None


def stopword(string):
    """Return ``string`` with English stop words removed.

    The input is split on whitespace; tokens found in NLTK's English
    stop-word list are dropped and the rest are re-joined with single spaces.
    Matching is exact, so the text should already be lowercased.

    The stop-word list is fetched once and kept as a frozenset. Previously
    ``stopwords.words('english')`` was evaluated again for every token and
    searched as a list.
    """
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return ' '.join(
        token for token in string.split() if token not in _ENGLISH_STOPWORDS
    )

# Lemmatization
# Single WordNetLemmatizer instance shared by every lemmatizer() call.
wl = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """Translate a POS tag into the corresponding WordNet part-of-speech constant.

    Only the start of ``tag`` is inspected: ``J`` maps to adjective, ``V`` to
    verb, ``N`` to noun and ``R`` to adverb. Any other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

# Tokenize the sentence, tag each token and lemmatize it
def lemmatizer(string):
    """Return ``string`` with every token replaced by its lemma.

    The text is tokenized with ``word_tokenize`` and tagged with
    ``nltk.pos_tag``. Each token is then lemmatized using the WordNet part of
    speech derived from its tag, and the lemmas are joined with single spaces.
    """
    tokens = word_tokenize(string)
    tagged_tokens = nltk.pos_tag(tokens)  # (token, part-of-speech tag) pairs
    lemmas = []
    for token, pos_tag in tagged_tokens:
        lemmas.append(wl.lemmatize(token, get_wordnet_pos(pos_tag)))
    return " ".join(lemmas)

In [None]:
def finalpreprocess(string):
    """Run the full cleaning pipeline on one document.

    Applies, in order: ``preprocess`` (text normalisation), ``stopword``
    (stop-word removal) and ``lemmatizer`` (lemmatization), and returns the
    resulting string.
    """
    normalised = preprocess(string)
    without_stopwords = stopword(normalised)
    return lemmatizer(without_stopwords)

In [None]:
# Clean every abstract, then keep a whitespace-tokenised copy of the result
data['clean_text'] = data['medical_abstract'].apply(finalpreprocess)
data['word_list'] = data['clean_text'].apply(str.split)

In [None]:
# Perform undersampling
X = data['clean_text']
y = data['condition_label']

# The single text column is passed to the sampler as an (n_samples, 1) array
# and flattened back to one dimension when the result frame is built.
sampler = RandomUnderSampler(random_state=11)
X_resampled, y_resampled = sampler.fit_resample(X.values.reshape(-1, 1), y)

df_resampled = pd.DataFrame({'clean_text': X_resampled.flatten(), 'condition_label': y_resampled})
df_resampled['word_list'] = df_resampled['clean_text'].apply(lambda x: x.split())

# Show the class counts after resampling. This used to be a bare expression
# in the middle of the cell, so its result was computed and then discarded;
# printing makes the check visible.
print(df_resampled['condition_label'].value_counts())

In [None]:
# Remove the rows labelled 5 (the "general" category) for better classification
is_general = df_resampled["condition_label"] == 5
df_resampled = df_resampled[~is_general]

In [None]:
# Save datasets: write the full and the balanced frame to CSV, omitting the index
for _frame, _path in ((data, 'preprocessed.csv'), (df_resampled, 'balanced.csv')):
    _frame.to_csv(_path, index=False)

In [None]:
# Create word clouds for each class
df_resampled = pd.read_csv('balanced.csv')


def _class_abstracts(label):
    """Return the cleaned abstracts of one condition label, without missing values.

    A ``clean_text`` that is empty is written to CSV as an empty field and
    read back as NaN. ``str.join`` raises TypeError on a NaN, so such rows are
    dropped before the texts are joined.
    """
    in_class = df_resampled["condition_label"] == label
    return df_resampled.loc[in_class, "clean_text"].dropna()


cat1 = _class_abstracts(1)
cat2 = _class_abstracts(2)
cat3 = _class_abstracts(3)
cat4 = _class_abstracts(4)
# NOTE: if balanced.csv was saved after the rows with label 5 were filtered
# out, cat5 and text5 are empty. They are kept so existing references to these
# names keep working.
cat5 = _class_abstracts(5)

# One long string per class, used as input for the word clouds.
text1 = ' '.join(cat1)
text2 = ' '.join(cat2)
text3 = ' '.join(cat3)
text4 = ' '.join(cat4)
text5 = ' '.join(cat5)

In [None]:
# Color function for word clouds
def custom_color_func(word, font_size, position, orientation, random_state=None, **kwargs):
    """Return a grey ``rgb(...)`` colour string that darkens with font size.

    A font size of 0 gives ``rgb(200, 200, 200)`` (light grey) and a font size
    of 125 gives ``rgb(0, 0, 0)`` (black), with a linear ramp in between. The
    channel value is clamped to 0..255, so a font size outside that range can
    no longer produce a negative (invalid) colour component.

    Only ``font_size`` is used. The other parameters are accepted so the
    function can be passed as ``color_func`` to ``WordCloud`` and are ignored.
    """
    grey_value = int(200 - (font_size / 125) * 200)
    grey_value = max(0, min(255, grey_value))  # keep the channel in range
    return f"rgb({grey_value}, {grey_value}, {grey_value})"

In [None]:
# Word cloud built from text1 (the joined abstracts of condition label 1)
cloud_settings = {
    'font_path': "/work/cmunrm.ttf",
    'width': 400,
    'height': 400,
    'color_func': custom_color_func,
    'background_color': 'white',
    'max_font_size': 125,
}
wordcloud1 = WordCloud(**cloud_settings).generate(text1)

# Draw the cloud on a 5x5 inch figure with the axes hidden
ax = plt.figure(figsize=(5, 5)).gca()
ax.imshow(wordcloud1, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Word cloud built from text2 (the joined abstracts of condition label 2)
cloud_settings = {
    'font_path': "/work/cmunrm.ttf",
    'width': 400,
    'height': 400,
    'color_func': custom_color_func,
    'background_color': 'white',
    'max_font_size': 125,
}
wordcloud2 = WordCloud(**cloud_settings).generate(text2)

# Draw the cloud on a 5x5 inch figure with the axes hidden
ax = plt.figure(figsize=(5, 5)).gca()
ax.imshow(wordcloud2, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Word cloud built from text3 (the joined abstracts of condition label 3)
cloud_settings = {
    'font_path': "/work/cmunrm.ttf",
    'width': 400,
    'height': 400,
    'color_func': custom_color_func,
    'background_color': 'white',
    'max_font_size': 125,
}
wordcloud3 = WordCloud(**cloud_settings).generate(text3)

# Draw the cloud on a 5x5 inch figure with the axes hidden
ax = plt.figure(figsize=(5, 5)).gca()
ax.imshow(wordcloud3, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Word cloud built from text4 (the joined abstracts of condition label 4)
cloud_settings = {
    'font_path': "/work/cmunrm.ttf",
    'width': 400,
    'height': 400,
    'color_func': custom_color_func,
    'background_color': 'white',
    'max_font_size': 125,
}
wordcloud4 = WordCloud(**cloud_settings).generate(text4)

# Draw the cloud on a 5x5 inch figure with the axes hidden
ax = plt.figure(figsize=(5, 5)).gca()
ax.imshow(wordcloud4, interpolation='bilinear')
ax.axis('off')
plt.show()