<a href="https://colab.research.google.com/github/SahilGhg/Social-Media-Analytics/blob/main/reddit_data_cleaner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wordsegment language-tool-python

Collecting wordsegment
  Downloading wordsegment-1.3.1-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting language-tool-python
  Downloading language_tool_python-2.9.4-py3-none-any.whl.metadata (55 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.5/55.5 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Downloading wordsegment-1.3.1-py2.py3-none-any.whl (4.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.8/4.8 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading language_tool_python-2.9.4-py3-none-any.whl (55 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m55.6/55.6 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: wordsegment, language-tool-python
Successfully installed language-tool-python-2.9.4 wordsegment-1.3.1


In [10]:
import pandas as pd
import re
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import wordsegment
from textblob import TextBlob
import language_tool_python

import nltk
# Download the NLTK resources needed below: tokenizer models, the stopword
# list, and WordNet for lemmatization.
for _nltk_resource in ('punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Load wordsegment's data (only needed by the optional word-splitting step).
wordsegment.load()

# lang_tool = language_tool_python.LanguageTool('en-US') # Uncomment for grammar correction

# Pre-compiled patterns used by clean_social_media_data.
# URLs stop at whitespace, angle brackets and quotes so that a link inside an
# HTML attribute (href="...") does not swallow the rest of the tag and the
# anchor text that follows it.
_URL_RE = re.compile(r'https?://[^\s<>"\']+|\bwww\.[^\s<>"\']+')
_MENTION_HASHTAG_RE = re.compile(r'@[A-Za-z0-9_]+|#[A-Za-z0-9_]+')
_NON_LETTER_RE = re.compile(r'[^a-zA-Z\s]')
_ELONGATED_RE = re.compile(r'([a-z])\1{2,}')
_WHITESPACE_RE = re.compile(r'\s+')
_PLACEHOLDER_TEXTS = ('[deleted]', '[removed]')


def clean_social_media_data(
    df: pd.DataFrame,
    text_col: str = 'text',
    extra_stopwords=('zomato', 'title', 'body'),
) -> pd.DataFrame:
    """
    Clean a DataFrame of social media posts and add a column of cleaned tokens.

    The input frame is never modified. Rows whose text is a '[deleted]' or
    '[removed]' placeholder are dropped, fully duplicated rows are dropped,
    and the remaining text is lowercased, stripped of URLs, HTML, mentions,
    hashtags and non-letter characters, tokenized, filtered for stopwords and
    lemmatized. Missing text values are treated as empty text and produce an
    empty token list instead of raising.

    Args:
        df: Input data; must contain the column named by ``text_col``.
        text_col: Name of the column holding the raw text.
        extra_stopwords: Additional words (or a single word) to remove on top
            of NLTK's English stopword list. Compared in lowercase.

    Returns:
        A new DataFrame with the surviving rows (original index preserved) and
        an added ``cleaned_text_tokens`` column holding a list of tokens per row.

    Raises:
        KeyError: If ``text_col`` is not a column of ``df``.
    """
    print("--- Starting Data Cleaning Process ---")

    if text_col not in df.columns:
        raise KeyError(f"Input DataFrame has no '{text_col}' column.")

    # Step 0: Filter out deleted/removed entries.
    # Compare on a stripped, lowercased view so that stray whitespace or
    # capitalisation around the placeholder does not let the row through.
    placeholder_mask = (
        df[text_col].astype(str).str.strip().str.lower().isin(_PLACEHOLDER_TEXTS)
    )
    clean_df = df.loc[~placeholder_mask]
    rows_removed = len(df) - len(clean_df)
    if rows_removed > 0:
        print(f"Step 0: Removed {rows_removed} deleted/removed entries.")

    # Step 1: Duplicate Removal (rows identical across every column).
    clean_df = clean_df.drop_duplicates()
    print("Step 1: Duplicates removed.")

    # Work on a pure-string Series: missing values become '' so the regex and
    # parsing steps below never receive a non-string.
    raw_text = clean_df[text_col]
    text_column = raw_text.astype(str).where(raw_text.notna(), '')

    # Step 2: Convert to Lowercase
    text_column = text_column.str.lower()
    print("Step 2: Converted text to lowercase.")

    # Step 3: Remove URLs
    text_column = text_column.str.replace(_URL_RE, '', regex=True)
    print("Step 3: URLs removed.")

    # Step 4: Remove HTML tags.
    # A space separator keeps text from adjacent elements apart
    # (e.g. 'good<br>service' -> 'good service', not 'goodservice').
    text_column = text_column.map(
        lambda markup: BeautifulSoup(markup, "html.parser").get_text(separator=' ')
    )
    print("Step 4: HTML tags removed.")

    # Step 5: Basic Cleaning (Remove mentions, hashtags, and special characters).
    # Non-letters are replaced by a space rather than deleted so that words
    # joined by punctuation stay separate ('delivery/service', "don't" ->
    # 'don t', whose parts are then caught by the stopword list).
    text_column = text_column.str.replace(_MENTION_HASHTAG_RE, '', regex=True)
    text_column = text_column.str.replace(_NON_LETTER_RE, ' ', regex=True)
    print("Step 5: Mentions, hashtags, and special chars removed.")

    # Step 6: Shorten elongated words: any letter repeated three or more times
    # is reduced to two (e.g. 'sooo' -> 'soo'); legitimate doubles like 'good'
    # are untouched.
    text_column = text_column.str.replace(_ELONGATED_RE, r'\1\1', regex=True)
    print("Step 6: Elongated words shortened")

    # Step 7: Whitespace Removal
    text_column = text_column.str.replace(_WHITESPACE_RE, ' ', regex=True).str.strip()
    print("Step 7: Extra whitespace removed.")

    # Step 8: Split Attached Words (e.g., 'goodservice' -> 'good service')
    # text_column = text_column.apply(lambda x: ' '.join(wordsegment.segment(x)))
    # print("Step 8: Attached words split.")

    # # Step 9: Spelling Correction
    # text_column = text_column.apply(lambda x: str(TextBlob(x).correct()))
    # print("Step 9: Spelling correction applied.")

    # Step 10: Grammar Correction
    # text_column = text_column.apply(lambda x: lang_tool.correct(x))
    # print("Step 10: Grammar correction applied.")

    # Step 11: Tokenization
    text_column = text_column.map(word_tokenize)
    print("Step 11: Text tokenized.")

    # Step 12: Remove Stopwords
    if isinstance(extra_stopwords, str):
        # A bare string would otherwise be iterated character by character.
        extra_stopwords = [extra_stopwords]
    stop_words = set(stopwords.words('english'))
    stop_words.update(word.lower() for word in extra_stopwords)
    text_column = text_column.map(
        lambda tokens: [word for word in tokens if word not in stop_words]
    )
    print("Step 12: Stopwords removed.")

    # Step 13: Lemmatization
    lemmatizer = WordNetLemmatizer()
    text_column = text_column.map(
        lambda tokens: [lemmatizer.lemmatize(word) for word in tokens]
    )
    print("Step 13: Words lemmatized.")

    # Attach the tokens via assign(), which returns a fresh frame, so neither
    # the caller's DataFrame nor an intermediate slice is written to.
    clean_df = clean_df.assign(cleaned_text_tokens=text_column)

    print("--- Cleaning Process Finished ---")
    return clean_df

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


**1. LOAD**

Load the raw Reddit data collected in Experiment 2 from `reddit_rawdata.csv`.

In [11]:
# Read the raw CSV into a DataFrame and report its shape.
raw_data_path = 'reddit_rawdata.csv'
raw_df = pd.read_csv(raw_data_path)
print(
    "Raw data loaded successfully.",
    f"Shape of raw data: {raw_df.shape}",
    sep="\n",
)

Raw data loaded successfully.
Shape of raw data: (1232, 11)


**2. CLEAN**

In [12]:
# Run the cleaning pipeline on the raw frame. The function works on a copy,
# so `raw_df` keeps its original contents.
cleaned_df = raw_df.pipe(clean_social_media_data)
print("\nData has been cleaned.")

--- Starting Data Cleaning Process ---
Step 0: Removed 44 deleted/removed entries.
Step 1: Duplicates removed.
Step 2: Converted text to lowercase.
Step 3: URLs removed.
Step 4: HTML tags removed.
Step 5: Mentions, hashtags, and special chars removed.
Step 6: Elongated words shortened
Step 7: Extra whitespace removed.
Step 11: Text tokenized.
Step 12: Stopwords removed.
Step 13: Words lemmatized.
--- Cleaning Process Finished ---

Data has been cleaned.


**3. SAVE**

In [13]:
# Write the cleaned frame to CSV without the index column.
# NOTE(review): `cleaned_text_tokens` holds Python lists, which CSV stores as
# their string form — confirm downstream readers parse it back
# (e.g. with ast.literal_eval).
cleaned_data_path = 'reddit_data_cleaned_2.csv'
cleaned_df.to_csv(path_or_buf=cleaned_data_path, index=False)
print(f"Cleaned data saved to '{cleaned_data_path}'")

Cleaned data saved to 'reddit_data_cleaned_2.csv'
