In [1]:
# Load dataset
import pandas as pd

file_path = "IpReviews.csv"  # Updated file name
df = pd.read_csv(file_path)

# Display the 'review' column content without truncation
pd.set_option('display.max_colwidth', None)  # Set to None for unlimited width
print(df["review"])  # Only print the 'review' column

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [3]:
# Replace internet slang/chat words
import re

# Expanded slang dictionary (added common tech/gaming terms)
slang_dict = {
    # Original entries
    "tbh": "to be honest", "omg": "oh my god", "lol": "laugh out loud",
    "idk": "I don't know", "brb": "be right back", "btw": "by the way",
    "imo": "in my opinion", "smh": "shaking my head", "fyi": "for your information",
    "np": "no problem", "ikr": "I know right", "asap": "as soon as possible",
    "bff": "best friend forever", "gg": "good game", "hmu": "hit me up",
    "rofl": "rolling on the floor laughing",
    "afaik": "as far as I know", "rn": "right now", "tbf": "to be fair",
    "wtf": "what the heck", "nvm": "never mind", "gl": "good luck",
    "ty": "thank you", "thx": "thanks", "pls": "please",
    "imo": "in my opinion", "ily": "I love you", "srsly": "seriously",
    "tmi": "too much information", "irl": "in real life",
    "dm": "direct message", "pw": "password", "unbox": "unboxing"
}

def replace_slang(text):
    if not isinstance(text, str):
        return text  # Skip non-string values
    
    escaped_slang_words = [re.escape(word) for word in slang_dict.keys()]
    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'
    
    def replace_match(match):
        slang_word = match.group(0).lower()
        return slang_dict.get(slang_word, slang_word)  # Fallback to original if key missing
    
    return re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)

# Apply to the 'review' column (not 'emojis_removed')
df["slangs_replaced"] = df["review"].apply(replace_slang)

# Display results
pd.set_option('display.max_colwidth', None)
print(df[["review", "slangs_replaced"]].head())  # Compare original vs processed

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [5]:
# Replace Contractions - Enhanced Version
import re

# Expanded contractions dictionary (added tech/e-commerce specific terms)
contractions_dict = {
    # Standard English contractions
    "wasn't": "was not", "isn't": "is not", "aren't": "are not",
    "weren't": "were not", "doesn't": "does not", "don't": "do not",
    "didn't": "did not", "can't": "cannot", "couldn't": "could not",
    "shouldn't": "should not", "wouldn't": "would not", "won't": "will not",
    "haven't": "have not", "hasn't": "has not", "hadn't": "had not",
    "i'm": "i am", "you're": "you are", "he's": "he is", "she's": "she is",
    "it's": "it is", "we're": "we are", "they're": "they are",
    "i've": "i have", "you've": "you have", "we've": "we have", "they've": "they have",
    "i'd": "i would", "you'd": "you would", "he'd": "he would", "she'd": "she would",
    "we'd": "we would", "they'd": "they would", "i'll": "i will", "you'll": "you will",
    "he'll": "he will", "she'll": "she will", "we'll": "we will", "they'll": "they will",
    "let's": "let us", "that's": "that is", "who's": "who is", "what's": "what is",
    "where's": "where is", "when's": "when is", "why's": "why is",
    
    # Added tech/e-commerce specific contractions
    "there's": "there is", "here's": "here is", "how's": "how is",
    "it'll": "it will", "that'll": "that will", "there'll": "there will",
    "this's": "this is", "something's": "something is", "everything's": "everything is",
    "nothing's": "nothing is", "someone's": "someone is", "everyone's": "everyone is",
    "phone's": "phone is", "battery's": "battery is", "device's": "device is",
    "screen's": "screen is", "camera's": "camera is", "order's": "order is",
    
    # Informal spoken contractions
    "gonna": "going to", "wanna": "want to", "gotta": "got to",
    "kinda": "kind of", "sorta": "sort of", "outta": "out of",
    "lemme": "let me", "dunno": "do not know", "cos": "because"
}

def replace_contractions(text):
    if not isinstance(text, str):
        return text  # Skip non-string values
    
    # Build regex pattern
    escaped = [re.escape(c) for c in contractions_dict.keys()]
    pattern = re.compile(r'\b(' + '|'.join(escaped) + r')\b', flags=re.IGNORECASE)
    
    # Replacement function
    def replacer(match):
        matched = match.group(0).lower()
        return contractions_dict.get(matched, matched)  # Fallback to original if key missing
    
    return pattern.sub(replacer, text)

# Apply to the 'slangs_replaced' column
df["contractions_replaced"] = df["slangs_replaced"].apply(replace_contractions)

# Display comparison
pd.set_option('display.max_colwidth', None)
print(df[["slangs_replaced", "contractions_replaced"]].head(3))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [7]:
# Remove Punctuation - Enhanced Version
import string

def remove_punctuation(text):
    if not isinstance(text, str):
        return text  # Skip non-string values
    
    # Custom punctuation rules
    translator = str.maketrans(
        '', '',
        string.punctuation.replace("'", "")  # Keep apostrophes for possessives
    )
    
    # First pass: Remove standard punctuation
    text = text.translate(translator)
    
    # Second pass: Handle special cases
    text = re.sub(r'[“”‘’]', '', text)  # Remove curly quotes
    text = re.sub(r'\s{2,}', ' ', text)  # Fix extra spaces
    return text.strip()

# Apply to the 'contractions_replaced' column
df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)

# Display comparison
pd.set_option('display.max_colwidth', None)
print(df[["contractions_replaced", "punctuations_removed"]].head(3))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [9]:
# Remove Numbers 
import re

def remove_numbers(text):
    if not isinstance(text, str):
        return text  # Skip non-string values
    
    # Pattern to remove standalone numbers but preserve:
    # - Version numbers (e.g., iOS 16)
    # - Percentages (e.g., 86%)
    # - Model numbers (e.g., iPhone 14)
    pattern = r'(?<!\w)\d+(?!\w|%)'
    return re.sub(pattern, '', text)

# Apply to the 'punctuations_removed' column
df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)

# Display comparison
pd.set_option('display.max_colwidth', None)
print(df[["punctuations_removed", "numbers_removed"]].head(3))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [13]:
# Remove numbers
def remove_numbers(text):
 return re.sub(r'\d+', '', text) # Removes all numeric characters
# Apply the function to the column
df["numbers_removed"] = df["punctuations_removed"].apply(remove_numbers)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["numbers_removed"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [14]:
# Correct spelling mistakes
from autocorrect import Speller
# Initialize spell checker
spell = Speller(lang='en')
# Function to correct spelling
def correct_spelling(text):
 return spell(text) # Apply correction
# Apply the function to the column
df["spelling_corrected"] = df["numbers_removed"].apply(correct_spelling)
# Display column content without truncation
pd.set_option('display.max_colwidth', None) # Set to None for unlimited width
print(df["spelling_corrected"])

0                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       

In [17]:
# Remove Stopwords - Enhanced Version
import nltk
from nltk.corpus import stopwords

# Download stopwords (only if not already downloaded)
nltk.download('stopwords', quiet=True)

# Custom stopwords list - preserving important tech-related terms
base_stopwords = set(stopwords.words('english'))

# Remove these words from standard stopwords (important for product reviews)
preserved_words = {
    'no', 'not', 'nor', 'only', 'very', 'too', 'more', 'most', 
    'over', 'under', 'against', 'between', 'above', 'below'
}

# Add tech-specific stopwords to remove
extra_tech_stopwords = {
    'phone', 'item', 'product', 'device', 'review', 'purchase'
}

# Final stopwords set
stop_words = (base_stopwords - preserved_words) | extra_tech_stopwords

def remove_stopwords(text):
    if not isinstance(text, str):
        return text
    
    words = text.split()
    filtered_words = [
        word for word in words 
        if word.lower() not in stop_words and len(word) > 1
    ]
    return " ".join(filtered_words)

# Apply to the 'spelling_corrected' column
df["stopwords_removed"] = df["spelling_corrected"].apply(remove_stopwords)

# Display comparison
pd.set_option('display.max_colwidth', None)
print(df[["spelling_corrected", "stopwords_removed"]].head(5))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [19]:
# Stemming - Enhanced Version
from nltk.stem import PorterStemmer

# Initialize stemmer with tech-term exceptions
stemmer = PorterStemmer()

# Dictionary of tech terms to preserve (avoid over-stemming)
tech_exceptions = {
    'iphone': 'iphone',
    'ios': 'ios',
    'android': 'android',
    'camera': 'camera',
    'battery': 'battery',
    'display': 'display',
    'touchid': 'touchid',
    'faceid': 'faceid'
}

def stem_text(text):
    if not isinstance(text, str):
        return ""
    
    words = text.split()
    stemmed_words = []
    
    for word in words:
        lower_word = word.lower()
        if lower_word in tech_exceptions:
            stemmed_words.append(tech_exceptions[lower_word])  # Preserve exact tech terms
        else:
            stemmed = stemmer.stem(word)
            # Only keep stem if it doesn't create nonsense (min 3 chars)
            if len(stemmed) >= 3 or len(word) < 4:
                stemmed_words.append(stemmed)
            else:
                stemmed_words.append(word)
    
    return " ".join(stemmed_words)

# Apply to the 'stopwords_removed' column
df["stemmed_words"] = df["stopwords_removed"].apply(stem_text)

# Display comparison
pd.set_option('display.max_colwidth', None)
print(df[["stopwords_removed", "stemmed_words"]].head(5))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [21]:
# Lemmatization - Enhanced Version
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

# Download resources (only if not already present)
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)

# Initialize lemmatizer with tech term protection
lemmatizer = WordNetLemmatizer()

# Dictionary of tech terms to preserve exactly
tech_lemmatization_exceptions = {
    'iphone', 'ios', 'ipad', 'macbook', 'airpods', 'applewatch',
    'touchid', 'faceid', 'airdrop', 'appstore', 'icloud', 'siri',
    'retina', 'amoled', '5g', 'lte', 'wifi', 'bluetooth'
}

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'): return wordnet.ADJ
    elif nltk_tag.startswith('V'): return wordnet.VERB
    elif nltk_tag.startswith('N'): return wordnet.NOUN
    elif nltk_tag.startswith('R'): return wordnet.ADV
    else: return wordnet.NOUN

def lemmatize_text(text):
    if not isinstance(text, str):
        return ""
    
    # Tokenize with case preservation
    words = word_tokenize(text)
    pos_tags = pos_tag(words)
    
    lemmatized_words = []
    for word, tag in pos_tags:
        lower_word = word.lower()
        if lower_word in tech_lemmatization_exceptions:
            lemmatized_words.append(word)  # Preserve original casing for tech terms
        else:
            lemma = lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            # Only keep if lemma makes sense
            if len(lemma) >= 3 or len(word) < 4:
                lemmatized_words.append(lemma)
            else:
                lemmatized_words.append(word)
    
    return " ".join(lemmatized_words)

# Apply to 'stopwords_removed' column (not stemmed_words to avoid over-processing)
df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)

# Display comparison
pd.set_option('display.max_colwidth', None)
print(df[["stopwords_removed", "lemmatized"]].head(5))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [23]:
# Save to CSV with ALL columns (original + processed)
df.to_csv("Processed_IpReviews.csv", index=False)

# Verify the saved file contains all columns
print(f"File saved with columns: {df.columns.tolist()}")

File saved with columns: ['profile_name', 'rating', 'rating_date', 'title', 'review', 'slangs_replaced', 'contractions_replaced', 'punctuations_removed', 'numbers_removed', 'spelling_corrected', 'stopwords_removed', 'stemmed_words', 'lemmatized']


In [29]:
# Add emoji removal step (insert this RIGHT AFTER lowercase conversion)
import emoji
import re

def remove_emojis(text):
    if not isinstance(text, str):
        return text
    # Remove all emojis and emoji variants
    return emoji.replace_emoji(text, replace='')

# Apply after lowercase conversion (add this line)
df["emoji_removed"] = df["lowercased"].apply(remove_emojis)

# THEN UPDATE ALL SUBSEQUENT STEPS to use "emoji_removed" instead of "lowercased"
# Example:
df["slangs_replaced"] = df["emoji_removed"].apply(replace_slang)  # Instead of df["lowercased"]

KeyError: 'lowercased'