<a href="https://colab.research.google.com/github/karolinakuligowska/TMSMM_codes/blob/main/TM_Large_files_600k_rows_processing_with_Polars.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#
# Solution with polars
#

In [2]:
!pip install polars spacy nltk



In [3]:
import polars as pl
import spacy
import re
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer


In [4]:
# Fetch the NLTK data packages this notebook relies on:
# the stopword lists, the WordNet corpus, and the English perceptron POS tagger.
for corpus_name in ("stopwords", "wordnet"):
    nltk.download(corpus_name)
nltk.download("averaged_perceptron_tagger_eng")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.


True

In [5]:
# Download the small English spaCy model; it is loaded further down with
# spacy.load("en_core_web_sm").
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [6]:
# File paths -- adjust both values to your own setup before running the cells below.
CSV_FILE_PATH = "my_large_file.csv"                  # input: the large CSV file to read
PROCESSED_FILE_PATH = "my_large_file__processed.csv" # output: where the processed CSV is written (any name you like)

In [7]:
# FIRST OPTIONAL CHECK ON 1k rows
# Load only the first 1000 rows.
# `n_rows` makes the reader stop early instead of parsing the whole large file
# and throwing most of it away; `.head(1000)` then guarantees an exact cap,
# because multi-threaded parsing may hand back slightly more than `n_rows`.
df = pl.read_csv(CSV_FILE_PATH, n_rows=1000).head(1000)

# Save the sample to a new CSV file
df.write_csv("first_1k_rows.csv")

print("First 1000 rows saved successfully.")


First 1000 rows saved successfully.


In [9]:
### Trial run: point the pipeline at the 1k-row sample first.
### Run the subsequent cells with this setting and inspect the result.
### If the result is OK, come back here and change this line:
CSV_FILE_PATH = "first_1k_rows.csv"
# to this line: CSV_FILE_PATH = "my_large_file.csv"
# and then re-run the subsequent cells on the full file.

In [14]:
# --- Text-processing resources ------------------------------------------------
# English stopwords held in a set for fast membership tests
stop_words = {*stopwords.words("english")}

# Porter stemmer and WordNet lemmatizer instances used by the cleaning helpers
stemmer, lemmatizer = PorterStemmer(), WordNetLemmatizer()

# spaCy English model used for tokenization
# (install it first with: `python -m spacy download en_core_web_sm`)
nlp = spacy.load("en_core_web_sm")

# --- Input data ---------------------------------------------------------------
# Read the CSV at CSV_FILE_PATH into a Polars DataFrame
df = pl.read_csv(CSV_FILE_PATH)

In [15]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with ``nltk.pos_tag``; the first letter of
    the resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    _, treebank_tag = nltk.pos_tag([word])[0]
    initial = treebank_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag both resolve to noun
    return wordnet.NOUN

def stem_completion(stemmed_tokens):
    """Lemmatize every stemmed token and return the results as a list.

    Each token is lemmatized with the part of speech that
    ``get_wordnet_pos`` reports for that same token.
    """
    completed = []
    for stem in stemmed_tokens:
        pos = get_wordnet_pos(stem)
        completed.append(lemmatizer.lemmatize(stem, pos))
    return completed

def clean_text(text: str) -> str:
    """Clean one raw text value and return it as a space-joined token string.

    Steps, in order: strip HTML markup, drop every character that is not an
    ASCII letter or whitespace, lowercase, tokenize with spaCy, remove English
    stopwords, Porter-stem, then lemmatize the stems ("stem completion").

    Args:
        text: Raw text to clean; may be None.

    Returns:
        The processed tokens joined by single spaces. Returns the sentinel
        "missing_text" when the input is None or only whitespace. The result
        can be an empty string when no token survives the cleaning steps.
    """
    if text is None or not text.strip():
        return "missing_text"  # Handle empty values safely

    # Remove HTML tags. Text nodes are joined with a space so that words
    # separated only by a tag (e.g. "one<br>two") do not fuse into one token.
    text = BeautifulSoup(text, "html.parser").get_text(separator=" ")

    # Remove special characters, numbers, and punctuation
    text = re.sub(r"[^a-zA-Z\s]", "", text)

    # Convert to lowercase
    text = text.lower()

    # Tokenization using spaCy. Only the token texts are needed, so run just
    # the tokenizer (make_doc) rather than the full pipeline, whose additional
    # annotations are never read here.
    tokens = [token.text for token in nlp.make_doc(text) if token.text.strip()]

    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming (reducing words to their root form)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]

    # Stem completion (lemmatize the stems to get back towards dictionary words)
    completed_tokens = stem_completion(stemmed_tokens)

    # Return cleaned and processed text
    return " ".join(completed_tokens)

# Apply text cleaning using Polars .map_elements()
# - skip_nulls=False: by default map_elements does not call the function for
#   null cells, so clean_text's None -> "missing_text" handling would never run
#   and those rows would stay null in the output.
# - return_dtype=pl.String: clean_text always returns a str; declaring it avoids
#   dtype inference from the first results (and the warning Polars emits for it).
df = df.with_columns(
    pl.col("Sentence")  # "Sentence" is the text column name in CSV_FILE_PATH, you can change it here
    .map_elements(clean_text, return_dtype=pl.String, skip_nulls=False)
    .alias("Processed_Sentence")
)

# Save processed CSV
df.write_csv(PROCESSED_FILE_PATH)

print(f"Processing complete. Saved to {PROCESSED_FILE_PATH}")







Processing complete. Saved to my_large_file__processed.csv
