In [None]:
pip install pandas nltk

In [5]:
import re
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder

# Fetch the NLTK corpora used by the stop-word and lemmatization steps further down.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)
# Kept as the cell's final expression so its return value is still displayed.
nltk.download('omw-1.4')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\huzai\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\huzai\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\huzai\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [6]:
# Toy corpus: each row pairs a sentence with its class label.
samples = [
    ("The cat is loafing on the mat!", "animal"),
    ("Dogs are sitting on the mat.", "animal"),
    ("The cat and dog are friends.", "relationship"),
]

# Column-oriented view of the samples, in the layout pandas expects.
data = {
    "text": [sentence for sentence, _ in samples],
    "label": [category for _, category in samples],
}

df = pd.DataFrame(data)

**Text Cleaning**

In [8]:
def clean_text(text):
    """Normalise a raw string for downstream tokenisation.

    The input is lower-cased, every character that is neither a lowercase
    ASCII letter nor whitespace is deleted (digits and punctuation vanish
    without leaving a space behind), and leading/trailing whitespace is
    trimmed. Whitespace inside the string is left untouched.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return re.sub(r'[^a-z\s]', '', text.lower()).strip()

# Run the cleaner over each raw sentence and store the result alongside the original.
df["cleaned_text"] = df["text"].map(clean_text)

**Stopword Removal**

In [9]:
# English stop-word list from NLTK, held as a set for fast membership checks.
# NOTE(review): clean_text strips apostrophes before this step, so any
# contraction-style entries in the list could never match — confirm whether
# that matters for real input.
stop_words = set(stopwords.words('english'))


def remove_stopwords(text):
    """Drop every whitespace-separated token of `text` that is in `stop_words`.

    The surviving tokens are re-joined with single spaces, so the original
    spacing between words is not preserved.

    Parameters
    ----------
    text : str
        Text to filter; tokens are compared to the stop-word set as-is.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    kept = filter(lambda token: token not in stop_words, text.split())
    return " ".join(kept)


df["no_stopwords"] = df["cleaned_text"].map(remove_stopwords)

**Lemmatization**

In [10]:
# One shared lemmatizer instance reused for every row.
lemmatizer = WordNetLemmatizer()


def lemmatize_text(text, pos="n"):
    """Lemmatize every whitespace-separated token of `text` with WordNet.

    Parameters
    ----------
    text : str
        Text to lemmatize; it is split on whitespace and each token is
        passed to the shared `lemmatizer`.
    pos : str, default "n"
        Part-of-speech tag forwarded to `WordNetLemmatizer.lemmatize`.
        The default "n" (noun) matches the lemmatizer's own default, so
        existing calls behave exactly as before. With "n", verb forms are
        looked up as nouns and may come back unchanged; pass "v" to
        lemmatize tokens as verbs instead.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    return " ".join(lemmatizer.lemmatize(token, pos=pos) for token in text.split())


# Uses the default noun lemmatization, as before.
df["lemmatized_text"] = df["no_stopwords"].apply(lemmatize_text)

**Label Encoding**

In [11]:
# Fit the encoder on the label column, then use the fitted encoder to map
# each label string to its integer code.
encoder = LabelEncoder().fit(df["label"])
df["encoded_label"] = encoder.transform(df["label"])

**TF-IDF Representation**

In [12]:
# Fit a TF-IDF vectorizer on the lemmatized sentences and transform them in one step.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["lemmatized_text"])

# Expand the matrix into a dense table with one column per learned feature name.
vocabulary = tfidf.get_feature_names_out()
dense_scores = tfidf_matrix.toarray()
tfidf_df = pd.DataFrame(dense_scores, columns=vocabulary)

In [13]:
# Output file -> frame to write; files are written in the order listed, without the index column.
exports = {
    "processed_text_data.csv": df,
    "tfidf_output.csv": tfidf_df,
    "labels.csv": df[["encoded_label"]],
}
for filename, frame in exports.items():
    frame.to_csv(filename, index=False)