In [1]:
import pandas as pd
import re

# Step 1: Load the file
file = 'iPhone 14, 15, 16 reviews.csv'
df = pd.read_csv(file)

In [5]:
# Step 2: Convert the text to lower case
df["lowercased"] = df["review"].astype(str).str.lower()

In [9]:
# Step 3: Remove slangs from the text
slang_dict = {
    "tbh": "to be honest",
    "omg": "oh my god",
    "lol": "laugh out loud",
    "idk": "I don't know",
    "brb": "be right back",
    "btw": "by the way",
    "imo": "in my opinion",
    "smh": "shaking my head",
    "fyi": "for your information",
    "np": "no problem",
    "ikr": "I know right",
    "asap": "as soon as possible",
    "bff": "best friend forever",
    "gg": "good game",
    "hmu": "hit me up",
    "rofl": "rolling on the floor laughing",
    "w": "win",
    "im": "i'm"
}

def replace_slang(text):
    escaped_slang_words = []
    for word in slang_dict.keys():
        escaped_word = re.escape(word)
        escaped_slang_words.append(escaped_word)
    slang_pattern = r'\b(' + '|'.join(escaped_slang_words) + r')\b'
    
    def replace_match(match):
        slang_word = match.group(0)
        return slang_dict[slang_word.lower()]
    replaced_text = re.sub(slang_pattern, replace_match, text, flags=re.IGNORECASE)
    return replaced_text

df['slangs_replaced'] = df['lowercased'].apply(replace_slang)

In [11]:
# Step 4: Replace contrations in the text with full clause
contractions_dict = {
    "wasn't": "was not",
    "isn't": "is not",
    "aren't": "are not",
    "weren't": "were not",
    "doesn't": "does not",
    "don't": "do not",
    "didn't": "did not",
    "can't": "cannot",
    "couldn't": "could not",
    "shouldn't": "should not",
    "wouldn't": "would not",
    "won't": "will not",
    "haven't": "have not",
    "hasn't": "has not",
    "hadn't": "had not",
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "it's": "it is",
    "we're": "we are",
    "they're": "they are",
    "i've": "i have",
    "you've": "you have",
    "we've": "we have",
    "they've": "they have",
    "i'd": "i would",
    "you'd": "you would",
    "he'd": "he would",
    "she'd": "she would",
    "we'd": "we would",
    "they'd": "they would",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "we'll": "we will",
    "they'll": "they will",
    "let's": "let us",
    "that's": "that is",
    "who's": "who is",
    "what's": "what is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is"
}

escaped_contractions = []

for contraction in contractions_dict.keys():
    escaped_contraction = re.escape(contraction)
    escaped_contractions.append(escaped_contraction)

joined_contractions = "|".join(escaped_contractions)
contractions_pattern = r'\b(' + joined_contractions + r')\b'
compiled_pattern = re.compile(contractions_pattern, flags=re.IGNORECASE)

def replaced_contractions(text):
    def replace_match(match):
        matched_word = match.group(0)  # Extract matched contraction
        lower_matched_word = matched_word.lower()  # Convert to lowercase
        expanded_form = contractions_dict[lower_matched_word]  # Get full form from dictionary
        return expanded_form  # Return the expanded form

    expanded_text = compiled_pattern.sub(replace_match, text)

    return expanded_text

df['contractions_replaced'] = df['slangs_replaced'].apply(replaced_contractions)

In [13]:
# Step 5: Remove punctuation from text
import string
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

df["punctuations_removed"] = df["contractions_replaced"].apply(remove_punctuation)

# Step 6: Remove numbers form the text
def remove_numbers(text):
    return re.sub(r'\d+', '', text)

df['numbers_removed'] = df['punctuations_removed'].apply(remove_numbers)

In [15]:
# Step 7: Perform autocorrect on the text
from autocorrect import Speller

spell = Speller(lang='en')

def correct_spelling(text):
    return spell(text)

df['spelling_corrected'] = df['numbers_removed'].apply(correct_spelling)


In [16]:
# Step 8: Remove emojis from the text
import emoji
def remove_emojis(text):
    return emoji.replace_emoji(text, replace='')

df['emoji_removed'] = df['spelling_corrected'].apply(remove_emojis)

# Step 9: Remove stopwords from the text
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
    words = text.split()
    filtered_words = []
    for word in words:
        lower_word = word.lower()
        if lower_word not in stop_words:
            filtered_words.append(word)
    return " ".join(filtered_words)

df["stopwords_removed"] = df["emoji_removed"].apply(remove_stopwords)

# Step 10: Stem all the text
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
    if not isinstance(text, str):
        return ""
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    return " ".join(stemmed_words)

df["stemmed_words"] = df["stopwords_removed"].apply(stem_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
# Step 11: Lemmatize the text
import nltk

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('punkt_tab')

from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk import pos_tag

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(nltk_tag):
    if nltk_tag.startswith('J'):
        return wordnet.ADJ
    elif nltk_tag.startswith('V'):
        return wordnet.VERB
    elif nltk_tag.startswith('N'):
        return wordnet.NOUN
    elif nltk_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def lemmatize_text(text):
    if not isinstance(text, str):
        return ""

    words = word_tokenize(text)
    pos_tags = pos_tag(words)

    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    return " ".join(lemmatized_words)

df["lemmatized"] = df["stopwords_removed"].apply(lemmatize_text)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [18]:
# Step 12: Tokenize all the text
from nltk.tokenize import word_tokenize
nltk.download('punkt')

def tokenize_text(text):
    if not isinstance(text, str):
        return []
    return word_tokenize(text)

df["tokenized"] = df["lemmatized"].apply(tokenize_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\thema\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [23]:
df

Unnamed: 0,profile_name,rating,rating_date,title,review,lowercased,slangs_replaced,contractions_replaced,punctuations_removed,numbers_removed,spelling_corrected,emoji_removed,stopwords_removed,stemmed_words,lemmatized,tokenized
0,John Doe,5.0 out of 5 stars,"Reviewed in the United States on February 26, ...","5.0 out of 5 stars\nGreat quality, great price...",Really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option was great because ...,really fast shipping option great broke phone ...,realli fast ship option great broke phone phon...,really fast shipping option great break phone ...,"[really, fast, shipping, option, great, break,..."
1,Pamela Orr,5.0 out of 5 stars,"Reviewed in the United States on April 4, 2025",5.0 out of 5 stars\nPhone works like new.,I can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,i can hardly believe this phone is refurbished...,hardly believe phone refurbished seems brand n...,hardli believ phone refurbish seem brand new b...,hardly believe phone refurbish seem brand new ...,"[hardly, believe, phone, refurbish, seem, bran..."
2,Another Amazon Reviewer,1.0 out of 5 stars,"Reviewed in the United States on September 19,...",1.0 out of 5 stars\n1st purchase was great! Th...,I have had this iPhone for 38 days.It has the ...,i have had this iphone for 38 days.it has the ...,i have had this iphone for 38 days.it has the ...,i have had this iphone for 38 days.it has the ...,i have had this iphone for 38 daysit has the w...,i have had this iphone for daysit has the wor...,i have had this iphone for days has the worst...,i have had this iphone for days has the worst...,iphone days worst battery life iphone ever own...,iphon day worst batteri life iphon ever own ow...,iphone day worst battery life iphone ever own ...,"[iphone, day, worst, battery, life, iphone, ev..."
3,zavi,5.0 out of 5 stars,"Reviewed in the United States on March 23, 2025",5.0 out of 5 stars\n5 STARS!!!!!!!! NO COMPLAI...,I ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,i ordered this phone in excellent condition an...,ordered phone excellent condition excellent li...,order phone excel condit excel like describ be...,order phone excellent condition excellent like...,"[order, phone, excellent, condition, excellent..."
4,Ari,4.0 out of 5 stars,"Reviewed in the United States on February 26, ...",4.0 out of 5 stars\nGreat!,Good so far. Phones battery healt is at 86%. P...,good so far. phones battery healt is at 86%. p...,good so far. phones battery healt is at 86%. p...,good so far. phones battery healt is at 86%. p...,good so far phones battery healt is at 86 phon...,good so far phones battery healt is at phone ...,good so far phones battery health is at phone...,good so far phones battery health is at phone...,good far phones battery health phone condition...,good far phone batteri health phone condit goo...,good far phone battery health phone condition ...,"[good, far, phone, battery, health, phone, con..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1019,Tori Fenton,5.0 out of 5 stars,"Reviewed in the United States on March 5, 2025",5.0 out of 5 stars\nVibration wake-up call,The vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,the vibration feature of my cell phone alarm c...,vibration feature cell phone alarm clock allow...,vibrat featur cell phone alarm clock allow wom...,vibration feature cell phone alarm clock allow...,"[vibration, feature, cell, phone, alarm, clock..."
1020,Malisa Garcia,5.0 out of 5 stars,"Reviewed in the United States on March 3, 2025",5.0 out of 5 stars\nGreat phone.,The incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,the incoming call reminder and call logging fe...,incoming call reminder call logging features c...,incom call remind call log featur cellular pho...,incoming call reminder call logging feature ce...,"[incoming, call, reminder, call, logging, feat..."
1021,Tashee Jordan,5.0 out of 5 stars,"Reviewed in the United States on February 12, ...",5.0 out of 5 stars\nHighly recommend,The new phone has much more storage space. The...,the new phone has much more storage space. the...,the new phone has much more storage space. the...,the new phone has much more storage space. the...,the new phone has much more storage space the ...,the new phone has much more storage space the ...,the new phone has much more storage space the ...,the new phone has much more storage space the ...,new phone much storage space camera takes grea...,new phone much storag space camera take great ...,new phone much storage space camera take great...,"[new, phone, much, storage, space, camera, tak..."
1022,carol gossman,5.0 out of 5 stars,"Reviewed in the United States on January 11, 2025",5.0 out of 5 stars\nhigh signal,"The phone has a strong signal, the system runs...","the phone has a strong signal, the system runs...","the phone has a strong signal, the system runs...","the phone has a strong signal, the system runs...",the phone has a strong signal the system runs ...,the phone has a strong signal the system runs ...,the phone has a strong signal the system runs ...,the phone has a strong signal the system runs ...,phone strong signal system runs smoothly impor...,phone strong signal system run smoothli import...,phone strong signal system run smoothly import...,"[phone, strong, signal, system, run, smoothly,..."


In [25]:
# Finally: Save the data to CSV file 
df.to_csv("cleaned text data for iPhone 14, 15, and 16.csv", index=False)
print("Successfully cleaned the text data and saved into a CSV file!!")

Successfully cleaned the text data and saved into a CSV file!!
