In [2]:
# Load the data
import pandas as pd

# Example: load data from a CSV
data = pd.read_csv('text_data.csv')

# read_csv represents empty cells as NaN (a float). Later steps call string
# functions on every entry and would fail on a float, so missing texts are
# replaced with '' and every entry is coerced to str up front.
texts = data['text_column'].fillna('').astype(str)


In [3]:
# Lowercasing
# Convert every text to lower case through the vectorized .str accessor.
texts = texts.str.lower()


In [4]:
# Remove punctuation
import string

# Translation table that deletes every character listed in string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)
texts = texts.str.translate(punctuation_table)


NameError: name 'texts' is not defined

In [None]:
# Tokenization
from nltk.tokenize import word_tokenize

# Replace each document string with its list of word tokens.
texts = texts.map(word_tokenize)


In [None]:
# Stop-word removal

from nltk.corpus import stopwords

# A set gives constant-time membership checks for the filter below.
stop_words = set(stopwords.words('english'))


def drop_stop_words(tokens):
    """Return the tokens that are not in the English stop-word set."""
    return [token for token in tokens if token not in stop_words]


texts = texts.apply(drop_stop_words)


In [None]:
# Remove non-alphabetic tokens
# str.isalpha() is True only for tokens made entirely of letters, so tokens
# containing digits (e.g. "2024", "covid19") are dropped as well.
def keep_alphabetic(tokens):
    """Return only the tokens that consist solely of alphabetic characters."""
    return [token for token in tokens if token.isalpha()]


texts = texts.apply(keep_alphabetic)


In [None]:
# Stemming
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

# Replace each token with its Porter stem, one document at a time.
texts = texts.apply(lambda tokens: list(map(stemmer.stem, tokens)))


In [None]:
# Lemmatization
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

# Map every token of every document through the WordNet lemmatizer.
# NOTE(review): if the stemming cell has already run, the tokens here are
# stems rather than dictionary words -- confirm that applying both is intended.
texts = texts.apply(lambda tokens: list(map(lemmatizer.lemmatize, tokens)))


In [None]:
# Remove rare or frequent words
from collections import Counter

# Corpus-wide occurrence count of every token, accumulated document by document.
word_counts = Counter()
for document_tokens in texts:
    word_counts.update(document_tokens)


def has_moderate_frequency(word):
    """Return True when the word occurs more than 5 and fewer than 1000 times."""
    return 5 < word_counts[word] < 1000


texts = texts.apply(lambda tokens: [word for word in tokens if has_moderate_frequency(word)])



In [None]:
# Spell correction
from spellchecker import SpellChecker

spell = SpellChecker()

# Look up each distinct word once instead of once per occurrence; correction()
# is the expensive call here.
vocabulary = {word for tokens in texts for word in tokens}

# correction() can return None when it has no candidate for a word. A None in
# a token list would make the later ' '.join(...) raise TypeError, so fall back
# to the original word in that case.
corrections = {word: spell.correction(word) or word for word in vocabulary}

texts = texts.apply(lambda x: [corrections[word] for word in x])


In [None]:
# Rejoin tokens
# Collapse each token list back into one space-separated string.
texts = texts.apply(' '.join)


In [None]:
# Save the cleaned data
# Attach the processed texts to the frame as a new 'cleaned_text' column, then
# write the whole frame to CSV; index=False leaves the row index out of the file.
data['cleaned_text'] = texts
data.to_csv('cleaned_text_data.csv', index=False)

In [None]:
# Example: combining the core cleaning steps into a single pipeline function
def clean_text(texts):
    """Lowercase, strip punctuation, tokenize, filter and lemmatize documents.

    Parameters
    ----------
    texts : iterable
        Documents to clean, e.g. a list of str or a pandas Series. Entries
        that are not strings (such as ``None`` or the ``NaN`` floats pandas
        uses for missing values) are treated as empty documents.

    Returns
    -------
    list of str
        One cleaned, space-joined string per input entry, in input order.
    """
    import string
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize
    from nltk.stem import WordNetLemmatizer

    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()
    # Loop-invariant: build the punctuation-deleting table once, not per document.
    punctuation_table = str.maketrans('', '', string.punctuation)

    cleaned_texts = []
    for text in texts:
        # Non-string entries have no .lower(); emit '' so the result stays
        # aligned one-to-one with the input instead of raising AttributeError.
        if not isinstance(text, str):
            cleaned_texts.append('')
            continue
        # Lowercase and remove punctuation
        text = text.lower().translate(punctuation_table)
        # Tokenize
        tokens = word_tokenize(text)
        # Remove stopwords and non-alphabetic words
        tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
        # Lemmatize
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
        # Rejoin tokens
        cleaned_texts.append(' '.join(tokens))
    return cleaned_texts

# Run the pipeline over the raw text column and store the result next to it
source_texts = data['text_column']
data['cleaned_text'] = clean_text(source_texts)
