# Preprocessing

### Step 1: Strip leading and trailing whitespace from the text columns

In [1]:
import pandas as pd

# Read the raw topic table from the CSV that sits next to this notebook.
# NOTE(review): the displayed frame contains an 'Unnamed: 0' column, which
# looks like a row index stored in the file -- confirm whether it should be
# loaded with index_col=0 instead.
df = pd.read_csv("Ethiopia_39_topics.csv")

# Trim surrounding whitespace from each free-text column.
for text_col in ("Topic", "Description"):
    df[text_col] = df[text_col].str.strip()

### Step 2: Combine Topic and Description

In [2]:
# Join topic and description into one text field, separated by a single space.
# A missing value on either side yields a missing combined value.
df['Combine_text'] = df['Topic'].str.cat(df['Description'], sep=" ")

### Step 3: Make all the text lowercase for uniformity

In [3]:
# Normalise case so the same word is not treated differently by capitalisation.
lowercased = df['Combine_text'].str.lower()
df['Combine_text'] = lowercased

# Preview the first rows to check the combined column.
df.head()

Unnamed: 0,Topic,URL,Description,Combine_text
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...,identify risk factors a woman is at a risk of ...
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...,measure temperature here's how to determine th...
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...,weigh the baby here's how to take the baby's w...
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...,examine the baby the baby's exam is best done ...
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...,count a baby's breaths here's how to count the...


# Lemmatizing text

In [4]:
import nltk
# NLTK resources used by the lemmatization cells below:
#   wordnet, omw-1.4                -> WordNet data for the lemmatizer
#   punkt, punkt_tab                -> sentence/word tokenization (word_tokenize)
#   averaged_perceptron_tagger_eng  -> part-of-speech tagging (pos_tag)
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')  # Legacy tokenizer data, used by older NLTK releases
# Recent NLTK releases (the ones that use the '_eng' tagger below) load the
# tokenizer from 'punkt_tab'; without it word_tokenize raises LookupError on a
# machine that has never downloaded it.
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/abhinavkompella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/abhinavkompella/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/abhinavkompella/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/abhinavkompella/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

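### Step 1: Define the lemmatization helpers

- `get_wordnet_pos(word)`: Determines the word's part of speech (adjective, noun, verb, or adverb) so the lemmatizer can pick the right base form.
- `lemmatize_text(text)`: Applies lemmatization to the text.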

In [5]:
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk import pos_tag

# Single lemmatizer instance shared by every call below
lemmatizer = WordNetLemmatizer()


def _penn_to_wordnet(tag):
    """Map an NLTK POS tag to the matching WordNet POS constant.

    Only the first letter of the tag is inspected (J -> adjective, N -> noun,
    V -> verb, R -> adverb). Anything else, including punctuation tags,
    falls back to noun.
    """
    tag_dict = {"J": wordnet.ADJ, "N": wordnet.NOUN, "V": wordnet.VERB, "R": wordnet.ADV}
    return tag_dict.get(tag[:1].upper(), wordnet.NOUN)


# Function to convert NLTK POS tags to WordNet POS tags
def get_wordnet_pos(word):
    """Return the WordNet POS constant for a single word tagged in isolation.

    Kept for callers that only have one word. `lemmatize_text` tags whole
    token sequences instead, because the tagger uses neighbouring words.
    """
    return _penn_to_wordnet(pos_tag([word])[0][1])


# Function to apply lemmatization
def lemmatize_text(text):
    """Lemmatize every token of `text` and return them joined by single spaces.

    Parameters
    ----------
    text : str
        Raw text. Non-string values (e.g. NaN from a missing description)
        are returned unchanged instead of crashing the tokenizer.

    Returns
    -------
    str
        Space-separated lemmas, or the original value when `text` is not a
        string.
    """
    if not isinstance(text, str):
        return text

    words = word_tokenize(text)
    if not words:
        return ''

    # Tag the full token sequence in one call: the tagger sees each word's
    # context, and it runs once per text rather than once per token.
    tagged_words = pos_tag(words)
    lemmatized_words = [
        lemmatizer.lemmatize(word, _penn_to_wordnet(tag)) for word, tag in tagged_words
    ]
    return ' '.join(lemmatized_words)

### Apply lemmatization to the dataframe

In [6]:
# Lemmatize the lowercased 'Combine_text' column created earlier and store
# the result in a new 'lemmatized_text' column.
df['lemmatized_text'] = df['Combine_text'].apply(lemmatize_text)

# Preview the first rows rather than dumping the whole DataFrame
df.head(10)

Unnamed: 0,Topic,URL,Description,Combine_text,lemmatized_text
0,Identify risk factors,https://rhlaiservice2.blob.core.windows.net/hm...,A woman is at a risk of getting an infection i...,identify risk factors a woman is at a risk of ...,identify risk factor a woman be at a risk of g...
1,Measure temperature,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to determine the baby's temperature...,measure temperature here's how to determine th...,measure temperature here 's how to determine t...
2,Weigh the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to take the baby's weight? An accur...,weigh the baby here's how to take the baby's w...,weigh the baby here 's how to take the baby 's...
3,Examine the baby,https://rhlaiservice2.blob.core.windows.net/hm...,The baby's exam is best done on the first day ...,examine the baby the baby's exam is best done ...,examine the baby the baby 's exam be best do o...
4,Count a baby's breaths,https://rhlaiservice2.blob.core.windows.net/hm...,Here's how to count the baby's breaths. Newbor...,count a baby's breaths here's how to count the...,count a baby 's breath here 's how to count th...
5,Provide eye care,https://rhlaiservice2.blob.core.windows.net/hm...,To apply eye ointment gently open the baby's e...,provide eye care to apply eye ointment gently ...,provide eye care to apply eye ointment gently ...
6,Provide cord care,https://rhlaiservice2.blob.core.windows.net/hm...,The umbilical cord is an entry point for dange...,provide cord care the umbilical cord is an ent...,provide cord care the umbilical cord be an ent...
7,Apply chlorhexidine,https://rhlaiservice2.blob.core.windows.net/hm...,In areas of the world where newborns are at gr...,apply chlorhexidine in areas of the world wher...,apply chlorhexidine in area of the world where...
8,Give vitamin K,https://rhlaiservice2.blob.core.windows.net/hm...,Wait until the baby has had skin to skin conta...,give vitamin k wait until the baby has had ski...,give vitamin k wait until the baby have have s...
9,Classify the baby,https://rhlaiservice2.blob.core.windows.net/hm...,Classify whether he is well or unwell within 9...,classify the baby classify whether he is well ...,classify the baby classify whether he be well ...


In [7]:
# Write the processed frame to disk; the row index is left out of the file.
output_path = 'preprocessed_data.csv'
df.to_csv(output_path, index=False)