In [1]:
import pandas as pd
import numpy as np
import string
import re
import nltk

# Fetch every NLTK resource the pipeline needs: the stopword list, WordNet data
# (plus the multilingual 'omw-1.4' package), the POS-tagger model and the
# 'punkt' tokenizer model.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from autocorrect import Speller
from contractions import fix

# Load the dataset
df = pd.read_csv("UNITENReview.csv")

# Step 1: Drop rows whose review is missing.
# Calling dropna(inplace=True) on df["Review"] only affects a Series pulled out of the
# frame; the rows stay in df, and the string operations below would then fail on NaN.
# Dropping on the DataFrame itself actually removes those rows.
df = df.dropna(subset=["Review"])
# Guarantee string values so the text-based steps below cannot hit a non-str cell.
df["Review"] = df["Review"].astype(str)

# Step 2: Convert to lowercase
df["Review"] = df["Review"].str.lower()

# Step 3: Expand contractions (e.g., "isn't" → "is not").
# This has to run while the apostrophes are still present; stripping punctuation first
# turns "we'll" into "well" and "i'd" into "id", which can no longer be expanded.
df["Review"] = df["Review"].apply(fix)

# Step 4: Remove punctuation and special characters.
# The translation table is built once instead of once per row.
punctuation_table = str.maketrans("", "", string.punctuation)
df["Review"] = df["Review"].str.translate(punctuation_table)

# Step 5: Remove numbers
df["Review"] = df["Review"].str.replace(r'\d+', '', regex=True)

# Step 6: Remove stopwords
# NOTE(review): the NLTK English list presumably contains negations such as "not" —
# confirm that dropping them is intended for the downstream task.
stop_words = set(stopwords.words("english"))
df["Review"] = df["Review"].apply(
    lambda text: " ".join(word for word in word_tokenize(text) if word not in stop_words)
)

# Step 7: Correct spelling mistakes (word by word)
spell = Speller(lang='en')
df["Review"] = df["Review"].apply(lambda text: " ".join(spell(word) for word in text.split()))

# Step 8: Lemmatization
# Shared lemmatizer instance used by lemmatize_text below.
lemmatizer = WordNetLemmatizer()

# Function to get wordnet POS tag
def get_wordnet_pos(tag):
    """Map a POS tag (as produced by pos_tag) to the matching WordNet POS constant.

    Only the tag's prefix is inspected: 'J' -> adjective, 'V' -> verb,
    'R' -> adverb. Every other tag, including noun tags and the empty string,
    maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Noun tags and anything unrecognised share the noun fallback.
    return wordnet.NOUN

# Lemmatization function
def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The text is tokenized with word_tokenize, tagged with pos_tag, and each
    token is lemmatized using the WordNet POS that get_wordnet_pos derives from
    its tag. The lemmas are joined back together with single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)

# Run every review through the POS-aware lemmatizer.
lemmatized_reviews = df["Review"].map(lemmatize_text)
df["Review"] = lemmatized_reviews

# Write the cleaned frame to a new CSV file, leaving out the index column.
df.to_csv("UNITENReview_Cleaned.csv", index=False)

print("Text pre-processing completed. Cleaned file saved as 'UNITENReview_Cleaned.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


ModuleNotFoundError: No module named 'contractions'

In [2]:
pip install contractions


Collecting contractions
  Downloading contractions-0.1.73-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting textsearch>=0.0.21 (from contractions)
  Downloading textsearch-0.0.24-py2.py3-none-any.whl.metadata (1.2 kB)
Collecting anyascii (from textsearch>=0.0.21->contractions)
  Downloading anyascii-0.3.2-py3-none-any.whl.metadata (1.5 kB)
Collecting pyahocorasick (from textsearch>=0.0.21->contractions)
  Downloading pyahocorasick-2.1.0-cp312-cp312-win_amd64.whl.metadata (13 kB)
Downloading contractions-0.1.73-py2.py3-none-any.whl (8.7 kB)
Downloading textsearch-0.0.24-py2.py3-none-any.whl (7.6 kB)
Downloading anyascii-0.3.2-py3-none-any.whl (289 kB)
Downloading pyahocorasick-2.1.0-cp312-cp312-win_amd64.whl (39 kB)
Installing collected packages: pyahocorasick, anyascii, textsearch, contractions
Successfully installed anyascii-0.3.2 contractions-0.1.73 pyahocorasick-2.1.0 textsearch-0.0.24
Note: you may need to restart the kernel to use updated packages.


In [3]:
!pip install contractions




In [4]:
import pandas as pd
import numpy as np
import string
import re
import nltk

# Fetch every NLTK resource the pipeline needs: the stopword list, WordNet data
# (plus the multilingual 'omw-1.4' package), the POS-tagger model and the
# 'punkt' tokenizer model.
for resource in ('stopwords', 'wordnet', 'omw-1.4', 'averaged_perceptron_tagger', 'punkt'):
    nltk.download(resource)

from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from autocorrect import Speller
from contractions import fix

# Load the dataset
df = pd.read_csv("UNITENReview.csv")

# Step 1: Drop rows whose review is missing.
# Calling dropna(inplace=True) on df["Review"] only affects a Series pulled out of the
# frame; the rows stay in df, and the string operations below would then fail on NaN.
# Dropping on the DataFrame itself actually removes those rows.
df = df.dropna(subset=["Review"])
# Guarantee string values so the text-based steps below cannot hit a non-str cell.
df["Review"] = df["Review"].astype(str)

# Step 2: Convert to lowercase
df["Review"] = df["Review"].str.lower()

# Step 3: Expand contractions (e.g., "isn't" → "is not").
# This has to run while the apostrophes are still present; stripping punctuation first
# turns "we'll" into "well" and "i'd" into "id", which can no longer be expanded.
df["Review"] = df["Review"].apply(fix)

# Step 4: Remove punctuation and special characters.
# The translation table is built once instead of once per row.
punctuation_table = str.maketrans("", "", string.punctuation)
df["Review"] = df["Review"].str.translate(punctuation_table)

# Step 5: Remove numbers
df["Review"] = df["Review"].str.replace(r'\d+', '', regex=True)

# Step 6: Remove stopwords
# NOTE(review): the NLTK English list presumably contains negations such as "not" —
# confirm that dropping them is intended for the downstream task.
stop_words = set(stopwords.words("english"))
df["Review"] = df["Review"].apply(
    lambda text: " ".join(word for word in word_tokenize(text) if word not in stop_words)
)

# Step 7: Correct spelling mistakes (word by word)
spell = Speller(lang='en')
df["Review"] = df["Review"].apply(lambda text: " ".join(spell(word) for word in text.split()))

# Step 8: Lemmatization
# Shared lemmatizer instance used by lemmatize_text below.
lemmatizer = WordNetLemmatizer()

# Function to get wordnet POS tag
def get_wordnet_pos(tag):
    """Map a POS tag (as produced by pos_tag) to the matching WordNet POS constant.

    Only the tag's prefix is inspected: 'J' -> adjective, 'V' -> verb,
    'R' -> adverb. Every other tag, including noun tags and the empty string,
    maps to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if tag.startswith(prefix):
            return wordnet_pos
    # Noun tags and anything unrecognised share the noun fallback.
    return wordnet.NOUN

# Lemmatization function
def lemmatize_text(text):
    """Return `text` with every token replaced by its lemma.

    The text is tokenized with word_tokenize, tagged with pos_tag, and each
    token is lemmatized using the WordNet POS that get_wordnet_pos derives from
    its tag. The lemmas are joined back together with single spaces.
    """
    tagged_tokens = pos_tag(word_tokenize(text))
    lemmas = []
    for token, tag in tagged_tokens:
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(tag)))
    return " ".join(lemmas)

# Run every review through the POS-aware lemmatizer.
lemmatized_reviews = df["Review"].map(lemmatize_text)
df["Review"] = lemmatized_reviews

# Write the cleaned frame to a new CSV file, leaving out the index column.
df.to_csv("UNITENReview_Cleaned.csv", index=False)

print("Text pre-processing completed. Cleaned file saved as 'UNITENReview_Cleaned.csv'.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Username\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Text pre-processing completed. Cleaned file saved as 'UNITENReview_Cleaned.csv'.
