In [1]:
import pandas as pd  # Import the pandas library for data manipulation and analysis, specifically for reading and handling CSV files.
import re  # Import the regular expression (re) library for pattern matching and text manipulation.

# Read the CSV file 'reviews.csv' into a Pandas DataFrame. Make sure the CSV file is in the same directory as the script, or provide the correct path.
df = pd.read_csv('reviews.csv') 

# Define a function to remove special characters from a string (text). This function will be applied to the text data.
def remove_special_characters(text):
    # Use the re.sub() function to replace any character that is not a letter (a-z, A-Z), number (0-9), or whitespace with an empty string.
    # This removes punctuation and special characters from the text.
    clean_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return clean_text  # Return the cleaned text without special characters.

# Apply the remove_special_characters function to the 'review_text' column in the DataFrame.
# This creates a new column 'clean_text' containing the processed reviews with special characters removed.
df['clean_text'] = df['review_text'].apply(remove_special_characters)

# Print the cleaned text column to see the result.
print(df['clean_text']) 

# Command to run the code in the terminal (assuming this script is saved as 'clean_reviews.py'):
# python clean_reviews.py

0     The laptops battery life is outstanding lastin...
1     Terrible customer service My issue wasnt resol...
2     Fast shipping but the packaging was damaged up...
3     I love the sleek design of the phone  its ligh...
4     The software is buggy  and crashes frequently ...
5     The restaurant ambiance was nice  but the food...
6     Absolutely fantastic headphones Great sound qu...
7     The movie was a complete waste of time  plot w...
8     Bought this for my son  he enjoys it a lot ver...
9     The hotel staff was extremely friendly  and ac...
10    Poor quality fabric not worth the price  Would...
11    Had a great time at the amusement park the rid...
12    The app interface is intuitive and easy to nav...
13    The concert was amazing but the seating was to...
14    This book is a masterpiece full of insightful ...
15    Received a defective product  had to return it...
16    The vacuum cleaner is very powerful but quite ...
17    Great experience at the car dealership  th

In [4]:
# Perform standard imports
import spacy

# Load English Language Model
nlp = spacy.load('en_core_web_sm')

# Input string
string = "This is the first sentence. This is the second sentence. This is the third sentence."

# Process the string with spaCy NLP pipeline
doc = nlp(string)

# Loop through the sentences and print them
for sent in doc.sents:
    print(sent)

This is the first sentence.
This is the second sentence.
This is the third sentence.


In [5]:
# Input string
string = "This is the first sentence. This is the second sentence. This is the third sentence."

# Split the string into sentences using the period as a delimiter
sentences = string.split('.')

# Remove any leading/trailing whitespace and filter out empty sentences
sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

# Print each sentence
for sentence in sentences:
    print(sentence)

This is the first sentence
This is the second sentence
This is the third sentence


In [13]:
import nltk
nltk.download('punkt')  # Download the correct 'punkt' resource

[nltk_data] Downloading package punkt to /Users/sangvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [27]:
import nltk
from nltk.tokenize import word_tokenize, regexp_tokenize

# Download the 'punkt' tokenizer model 
nltk.download('punkt')

# Example text
text = "Tokenization involves splitting text into smaller units or tokens, such as words or phrases. Special cases like contractions (e.g., don't) or possessive forms (e.g., John's book) require careful handling."

# Basic tokenization
tokens = word_tokenize(text)
print("Basic Tokenization:")
print(tokens)

# Handling contractions and possessives with regex
# This regex splits on spaces but keeps contractions and possessives together
custom_tokenizer = r"\w+\'\w+|\w+"
custom_tokens = regexp_tokenize(text, custom_tokenizer)

print("\nCustom Tokenization with handling contractions and possessive forms:")
print(custom_tokens)

[nltk_data] Downloading package punkt to /Users/sangvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


LookupError: 
**********************************************************************
  Resource [93mpunkt_tab[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt_tab')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt_tab/english/[0m

  Searched in:
    - '/Users/sangvu/nltk_data'
    - '/Users/sangvu/miniconda3/envs/llm/nltk_data'
    - '/Users/sangvu/miniconda3/envs/llm/share/nltk_data'
    - '/Users/sangvu/miniconda3/envs/llm/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - '/your/custom/nltk_data/path'
    - '/Users/sangvu/miniconda3/envs/llm/lib/nltk_data'
**********************************************************************


In [33]:
import spacy
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load spaCy English model
nlp = spacy.load('en_core_web_sm')

# Example text
text = "An NLP pipeline processes raw text. It organizes the text in a way that computers can understand. This helps with tasks like figuring out emotions in text, translating languages, and answering questions."

# Step 1: Sentence Segmentation (using spaCy)
doc = nlp(text)
sentences = [sent.text for sent in doc.sents]
print("Sentence Segmentation:")
for sentence in sentences:
    print(sentence)

# Step 2: Tokenization (using spaCy)
tokens = [token.text for token in doc]
print("\nTokenization:")
print(tokens)

# Step 3: Text Normalization (Lowercasing)
normalized_tokens = [token.lower() for token in tokens]
print("\nText Normalization (Lowercasing):")
print(normalized_tokens)

# Step 4: Stopword Removal
stop_words = set(stopwords.words('english'))
filtered_tokens = [token for token in normalized_tokens if token not in stop_words]
print("\nStopword Removal:")
print(filtered_tokens)

# Step 5: Text Cleaning (Removing punctuation)
translator = str.maketrans('', '', string.punctuation)
cleaned_tokens = [token.translate(translator) for token in filtered_tokens]
print("\nText Cleaning (Removing punctuation):")
print(cleaned_tokens)

# Step 6: Stemming and Lemmatization
stemmer = PorterStemmer()
stemmed_tokens = [stemmer.stem(token) for token in cleaned_tokens]
print("\nStemming:")
print(stemmed_tokens)

lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in cleaned_tokens]
print("\nLemmatization:")
print(lemmatized_tokens)

[nltk_data] Downloading package punkt to /Users/sangvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sangvu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/sangvu/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Sentence Segmentation:
An NLP pipeline processes raw text.
It organizes the text in a way that computers can understand.
This helps with tasks like figuring out emotions in text, translating languages, and answering questions.

Tokenization:
['An', 'NLP', 'pipeline', 'processes', 'raw', 'text', '.', 'It', 'organizes', 'the', 'text', 'in', 'a', 'way', 'that', 'computers', 'can', 'understand', '.', 'This', 'helps', 'with', 'tasks', 'like', 'figuring', 'out', 'emotions', 'in', 'text', ',', 'translating', 'languages', ',', 'and', 'answering', 'questions', '.']

Text Normalization (Lowercasing):
['an', 'nlp', 'pipeline', 'processes', 'raw', 'text', '.', 'it', 'organizes', 'the', 'text', 'in', 'a', 'way', 'that', 'computers', 'can', 'understand', '.', 'this', 'helps', 'with', 'tasks', 'like', 'figuring', 'out', 'emotions', 'in', 'text', ',', 'translating', 'languages', ',', 'and', 'answering', 'questions', '.']

Stopword Removal:
['nlp', 'pipeline', 'processes', 'raw', 'text', '.', 'organize

In [30]:
import nltk
nltk.download('punkt')  # This should download the correct 'punkt' model

[nltk_data] Downloading package punkt to /Users/sangvu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True