This notebook is dedicated to creating and testing the different modules (the individual steps) of the data cleaning function. Once all of the modules have been implemented and tested, they will then be integrated into one main cleaning function, which will be copied to the `2_clean_data.py` file.

This script reads a raw CSV file, applies a series of text cleaning steps,
and (optionally) saves the cleaned output to a new CSV.

Included cleaning steps:
✔ Expand contractions (e.g., "I'm" → "I am")
✔ Lowercasing
✔ Normalize whitespace
✔ Normalize repeated punctuation (e.g., "!!!" → "!")
✔ Drop empty or invalid rows


Excluded steps:
✘ Spelling correction – Excluded because:
    - May over-correct expressive/emotional writing (e.g., "soooo happy")
    - Adds significant runtime
    - Low value in this domain


In [1]:
# Import packages
import pandas as pd
import re
import contractions
import argparse

In [8]:
# Load the data:
# NOTE: this is an absolute, machine-specific Windows path, so the cell only
# runs on a machine where this exact file exists.
df = pd.read_csv(r"C:\Users\Inbal\projects\emotion_detection\data\train.csv")

# Print the (rows, columns) shape as a quick sanity check on the load.
print(f"df.shape is {df.shape}") 
# Last expression in the cell, so the notebook displays the first rows.
df.head()

df.shape is (16000, 2)


Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


# All of the modules below need to be implemented before they are combined into the main clean_df function

## 1. fix_informal_contractions(text)
Convert variants like im, i m, Im, Dont, don t → standardized forms (I'm, don't, etc.)

In [None]:
# 1. fix_informal_contractions(text: str) -> str
# Convert variants like im, i m, Im, Dont, don t (which exist in this dataset) → standardized forms (I'm, don't, etc.)

# (compiled pattern, canonical replacement) pairs, applied in this order.
# Compiled once at definition time instead of on every call.
# Case handling is explicit ([xX] on the first letter only) rather than
# re.IGNORECASE, so all-caps tokens such as "IM" are deliberately left alone.
# For the "n't" family both spaced spellings are accepted: the space may sit
# where the apostrophe was ("didn t") or before the "nt" ("did nt").
_INFORMAL_CONTRACTION_RULES = (
    (re.compile(r"\b[iI]\s?m\b"), "I'm"),
    (re.compile(r"\b[dD]on\s?t\b"), "don't"),
    (re.compile(r"\b[dD]id(?:\s?nt|n\s?t)\b"), "didn't"),
    (re.compile(r"\b[iI]\s?ve\b"), "I've"),
    (re.compile(r"\b[wW]as(?:\s?nt|n\s?t)\b"), "wasn't"),
    (re.compile(r"\b[dD]oes(?:\s?nt|n\s?t)\b"), "doesn't"),
    (re.compile(r"\b[sS]hould(?:\s?nt|n\s?t)\b"), "shouldn't"),
    (re.compile(r"\b[cC]an\s?t\b"), "can't"),
    (re.compile(r"\b[wW]on\s?t\b"), "won't"),
    (re.compile(r"\b[cC]ould(?:\s?nt|n\s?t)\b"), "couldn't"),
    (re.compile(r"\b[wW]ould(?:\s?nt|n\s?t)\b"), "wouldn't"),
)


def fix_informal_contractions(text: str) -> str:
    r"""
    Normalize sloppy/missing-apostrophe contractions into their canonical
    apostrophized forms, so that downstream expansion (e.g. with
    contractions.fix) works reliably.

    Specifically handles patterns like:
      - "im", "i m", "Im", "I m"           -> "I'm"
      - "dont", "don t", "Dont", "Don t"   -> "don't"
      - "didnt", "didn t", "did nt", "Didnt", ... -> "didn't"
      - (and similarly for: ive, wasnt, doesnt, shouldnt, cant, wont,
        couldnt, wouldnt)

    Matching details:
      - Word-boundary anchors (\b) prevent partial matches, so words such as
        "important" or "swimming" are left untouched.
      - At most one whitespace character is accepted in place of the missing
        apostrophe (\s?); longer gaps are not treated as a contraction.
      - Only lowercase and first-letter-capitalized variants are matched.
      - The replacement is always the fixed canonical form, so a capitalized
        "Dont" becomes lowercase "don't".
      - Already-apostrophized text ("I'm", "don't") is returned unchanged.

    Parameters
    ----------
    text : str
        The raw text to normalize.

    Returns
    -------
    str
        The text with the informal contractions above rewritten.
    """
    for compiled_pattern, canonical in _INFORMAL_CONTRACTION_RULES:
        text = compiled_pattern.sub(canonical, text)
    return text


In [18]:
# a = "im grabbing a minute to post i feel greedy wrong"
# b = "Im grabbing a minute to post i feel greedy wrong"
# c = "i m grabbing a minute to post i feel greedy wrong"
# d = "I m grabbing a minute to post i feel greedy wrong"
# e = "i\m grabbing a minute to post i feel greedy wrong"
# f = "I\m grabbing a minute to post i feel greedy wrong"

# g = "i?m grabbing a minute to post i feel greedy wrong"
# h = "I?m grabbing a minute to post i feel greedy wrong"

# i = "i   m grabbing a minute to post i feel greedy wrong"
# j = "I   m grabbing a minute to post i feel greedy wrong"


# for z in [a, b, c, d, e, f, g, h, i, j]:
#     print(z)
#     r = fix_informal_contractions(z)
#     print(r)
#     print('\n\n')    

## 2. expand_contractions(text: str) -> str
Use contractions.fix() to turn I'm → I am, don't → do not, etc.


## 3. lowercase_text(text: str) -> str
Normalize all characters to lowercase.

## 4. normalize_whitespace(text: str) -> str
Collapse runs of whitespace (\s+) to a single space and trim leading/trailing spaces.

## 5. normalize_punctuation(text: str) -> str
Replace repeated punctuation sequences (!!!, ..., ???) with a single character (!, ., ?).

## 6. clean_text(text: str) -> str
(Optional helper) Wraps steps 1–5 in one function for single-string cleaning.


## 7. drop_invalid_rows(df: pd.DataFrame, col: str = "text") -> pd.DataFrame
Remove any rows where df[col] is null or empty after cleaning.

## Finally:
clean_df(df: pd.DataFrame, col: str = "text") -> pd.DataFrame
Master pipeline that:

* Ensures col exists and is string

* Applies clean_text (or steps 1–5) to df[col]

* Calls drop_invalid_rows

* Returns the cleaned DataFrame

## Output comparison between raw and clean DF

In [None]:
# def print_examples(df, emotion, n_samples):
#     samples = df[df.emotion == emotion].sample(n_samples, random_state=42)
#     for idx, row in samples.iterrows():
#         print(f"# {row.text}")
#     print("\n")


# def output_comparison(raw_df, clean_df):
#     emotions = sorted(raw_df.emotion.unique())

#     for emotion in emotions:
#         print(f"--- {emotion.upper()} ---")

#         print("Raw Examples:")
#         print_examples(raw_df, emotion, 3)

#         print("Clean Examples:")
#         print_examples(clean_df, emotion, 3)
        
#         print()