## Practice 27/9/2024: Chapter 2 - Regular Expressions

In [3]:
import spacy
import nltk
from nltk.corpus import stopwords
import string
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

## 1. Basics of Regular Expressions

Regex helps in matching and replacing specific parts of a string based on predefined patterns. <br/>
**Ex**: define a pattern, then use `re.search(pattern, word)` to check whether a word matches it.

In [18]:
import re

# The character class [ch] stands for exactly one character, either "c" or "h",
# and it must be immediately followed by the literal text "at".
pattern = r"[ch]at"

# Candidate words to test against the pattern.
words = ["that", "at", "chat", "cat", "fat", "phat"]

# re.search scans the whole string, so a word is kept as soon as "cat" or "hat"
# occurs anywhere inside it (which is why "that" and "phat" are kept too).
matching_words = list(filter(lambda candidate: re.search(pattern, candidate), words))

print(matching_words)

['that', 'chat', 'cat', 'phat']


## 1.3 Regex in text preprocessing 
<strong>Applications of Regex in Text Preprocessing:</strong> <br>

**1. Tokenization** <br>
The \w+ regex pattern is used to match consecutive word characters (letters, digits, underscores), extracting tokens using ``re.findall()``. <br>
However, contractions such as "we're" are split at the apostrophe (into "we" and "re"), so they should be expanded to their full form (e.g., "we are") before tokenizing.

In [34]:
import re
import pandas as pd
# Load the reviews dataset from a CSV file located relative to the working directory;
# the cells below read its 'review_text' column.
df = pd.read_csv("reviews.csv")

def tokenize_text(text):
    """Split ``text`` into word tokens.

    A token is any maximal run of word characters (letters, digits and
    underscores), i.e. every non-overlapping match of the regex ``\\w+``.
    Anything else (spaces, punctuation, apostrophes) acts as a separator, so
    "laptop's" yields the two tokens "laptop" and "s".

    Args:
        text: The string to tokenize.

    Returns:
        A list of the matched tokens in order of appearance (empty when the
        text contains no word characters).
    """
    word_pattern = re.compile(r'\w+')
    tokens = word_pattern.findall(text)
    return tokens

# Tokenize every review: run 'tokenize_text' over each entry of the 'review_text'
# column and keep the resulting token lists in a new 'tokens' column of the DataFrame.
df['tokens'] = df['review_text'].map(tokenize_text)

# Show the per-review token lists.
print(df['tokens'])


0     [The, laptop, s, battery, life, is, outstandin...
1     [Terrible, customer, service, My, issue, wasn,...
2     [Fast, shipping, but, the, packaging, was, dam...
3     [I, love, the, sleek, design, of, the, phone, ...
4     [The, software, is, buggy, and, crashes, frequ...
5     [The, restaurant, ambiance, was, nice, but, th...
6     [Absolutely, fantastic, headphones, Great, sou...
7     [The, movie, was, a, complete, waste, of, time...
8     [Bought, this, for, my, son, he, enjoys, it, a...
9     [The, hotel, staff, was, extremely, friendly, ...
10    [Poor, quality, fabric, not, worth, the, price...
11    [Had, a, great, time, at, the, amusement, park...
12    [The, app, interface, is, intuitive, and, easy...
13    [The, concert, was, amazing, but, the, seating...
14    [This, book, is, a, masterpiece, full, of, ins...
15    [Received, a, defective, product, had, to, ret...
16    [The, vacuum, cleaner, is, very, powerful, but...
17    [Great, experience, at, the, car, dealersh

**2. Text cleaning and normalization**

In [36]:
def clean_text(text):
    """Normalize ``text`` to lowercase ASCII alphanumeric words separated by single spaces.

    Steps:
      1. Lowercase the text.
      2. Replace every run of characters that are not ASCII letters or digits
         (punctuation, whitespace, symbols, accented letters, ...) with one space.
         Matching whole runs with ``+`` means no second pass is needed to squeeze
         repeated spaces.
      3. Strip the single space that step 2 leaves at the start or end when the
         text begins or ends with a non-alphanumeric character, so the result
         never carries leading or trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string; empty when the text has no ASCII alphanumeric characters.
    """
    lowered = text.lower()
    collapsed = re.sub(r'[^0-9a-zA-Z]+', ' ', lowered)
    return collapsed.strip()

# Clean every review: run 'clean_text' over each entry of the 'review_text' column
# and store the normalized strings in a new 'cleaned_text' column.
df['cleaned_text'] = df['review_text'].map(clean_text)

# Show the cleaned reviews.
print(df['cleaned_text'])


0     the laptop s battery life is outstanding lasti...
1      terrible customer service my issue wasn t res...
2     fast shipping but the packaging was damaged up...
3     i love the sleek design of the phone it s ligh...
4     the software is buggy and crashes frequently v...
5     the restaurant ambiance was nice but the food ...
6     absolutely fantastic headphones great sound qu...
7     the movie was a complete waste of time plot wa...
8     bought this for my son he enjoys it a lot very...
9     the hotel staff was extremely friendly and acc...
10    poor quality fabric not worth the price wouldn...
11    had a great time at the amusement park the rid...
12    the app interface is intuitive and easy to nav...
13    the concert was amazing but the seating was to...
14    this book is a masterpiece full of insightful ...
15    received a defective product had to return it ...
16    the vacuum cleaner is very powerful but quite ...
17    great experience at the car dealership the

**3. Named entity recognition:**

In [37]:
# Entity label -> regex listing the literal phrases that count as that entity.
# Each regex has exactly one capturing group, so a findall returns the matched
# phrase itself. The match is case-sensitive and has no word boundaries, so a
# listed word is also found inside a longer word (e.g. "movie" in "movies").
patterns = {
    'NOUNS' : r'(software|restaurant|movie)',
    'PERSON': r'(Barack Obama|John Doe|Jane Smith)'
}

# Entity label -> one list of matches per review, in DataFrame row order.
named_entities = {}
for entity, pattern in patterns.items():
    # Series.str.findall runs re.findall on every review, so no helper function
    # has to be re-defined on each loop iteration.
    df[entity] = df['review_text'].str.findall(pattern)
    named_entities[entity] = df[entity].tolist()

print(named_entities)

{'NOUNS': [[], [], [], [], ['software'], ['restaurant'], [], ['movie'], [], [], [], [], [], [], [], [], [], [], [], []], 'PERSON': [[], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], [], []]}
