In [1]:
import re

# Define a regex pattern to match words with "c" or "h" followed by "a" and "t"
pattern = r"[ch]at"

# List of words to test
words = ["that", "at", "chat", "cat", "fat", "phat"]

# Find and print words that match the pattern
matching_words = [word for word in words if re.search(pattern, word)]

print("Matching words:", matching_words)

Matching words: ['that', 'chat', 'cat', 'phat']


In [2]:
import re  # Imports the 're' module, which provides support for regular expressions in Python. This will allow pattern matching and text manipulation.
import pandas as pd  # Imports the 'pandas' library, which is essential for data manipulation, particularly with structured data like CSV files.

df = pd.read_csv('reviews.csv')  # Reads a CSV file called 'reviews.csv' into a pandas DataFrame (df). Each row represents a review in the dataset. Ensure the CSV file is in the same directory or provide the full path to it.

def tokenize_text(text):  # Defines a function called 'tokenize_text' that takes a text input (string).
    return re.findall(r'\w+', text)  # Uses 're.findall()' to match all sequences of word characters (letters, digits, and underscores) in the text. '\w+' is the regex pattern used to find these sequences, treating them as tokens.

df['tokens'] = df['review_text'].apply(tokenize_text)  # Applies the 'tokenize_text' function to each row of the 'review_text' column in the DataFrame, creating a new column 'tokens' that contains the list of tokens (words) for each review.

print(df['tokens'])  # Prints the 'tokens' column from the DataFrame, which contains the tokenized version of each review.

0     [The, laptop, s, battery, life, is, outstandin...
1     [Terrible, customer, service, My, issue, wasn,...
2     [Fast, shipping, but, the, packaging, was, dam...
3     [I, love, the, sleek, design, of, the, phone, ...
4     [The, software, is, buggy, and, crashes, frequ...
5     [The, restaurant, ambiance, was, nice, but, th...
6     [Absolutely, fantastic, headphones, Great, sou...
7     [The, movie, was, a, complete, waste, of, time...
8     [Bought, this, for, my, son, he, enjoys, it, a...
9     [The, hotel, staff, was, extremely, friendly, ...
10    [Poor, quality, fabric, not, worth, the, price...
11    [Had, a, great, time, at, the, amusement, park...
12    [The, app, interface, is, intuitive, and, easy...
13    [The, concert, was, amazing, but, the, seating...
14    [This, book, is, a, masterpiece, full, of, ins...
15    [Received, a, defective, product, had, to, ret...
16    [The, vacuum, cleaner, is, very, powerful, but...
17    [Great, experience, at, the, car, dealersh

In [3]:
import re  # Imports the 're' module for working with regular expressions, which are used for pattern matching and text manipulation.
import pandas as pd  # Imports the 'pandas' library, which is used for data manipulation, especially for reading and working with structured data like CSV files.

df = pd.read_csv('reviews.csv')  # Reads a CSV file called 'reviews.csv' into a pandas DataFrame (df). This file is expected to contain a column named 'review_text' with the reviews.

def clean_text(text):  # Defines a function 'clean_text' that takes a string input (text).
    # First, re.sub('[^0-9a-zA-Z]+', ' ', text.lower()) replaces any character that is not alphanumeric (a-z, A-Z, 0-9) with a space and converts the text to lowercase.
    # Second, re.sub(' +', ' ', ...) ensures that multiple spaces are reduced to a single space.
    cleaned_text = re.sub(' +', ' ', re.sub('[^0-9a-zA-Z]+', ' ', text.lower()))
    return cleaned_text  # Returns the cleaned text.

df['cleaned_text'] = df['review_text'].apply(clean_text)  # Applies the 'clean_text' function to each entry in the 'review_text' column and creates a new column called 'cleaned_text' with the cleaned version of the review.

print(df['cleaned_text'])  # Prints the 'cleaned_text' column, which contains the processed, cleaned text from each review.

0     the laptop s battery life is outstanding lasti...
1      terrible customer service my issue wasn t res...
2     fast shipping but the packaging was damaged up...
3     i love the sleek design of the phone it s ligh...
4     the software is buggy and crashes frequently v...
5     the restaurant ambiance was nice but the food ...
6     absolutely fantastic headphones great sound qu...
7     the movie was a complete waste of time plot wa...
8     bought this for my son he enjoys it a lot very...
9     the hotel staff was extremely friendly and acc...
10    poor quality fabric not worth the price wouldn...
11    had a great time at the amusement park the rid...
12    the app interface is intuitive and easy to nav...
13    the concert was amazing but the seating was to...
14    this book is a masterpiece full of insightful ...
15    received a defective product had to return it ...
16    the vacuum cleaner is very powerful but quite ...
17    great experience at the car dealership the

In [15]:
import re  # Imports the 're' module to work with regular expressions, which will be used to search for specific patterns in the text.
import pandas as pd  # Imports the 'pandas' library for reading and manipulating structured data, particularly CSV files.

df = pd.read_csv('reviews.csv')  # Reads the 'reviews.csv' file into a pandas DataFrame called 'df'. This file is expected to have a column 'review_text' containing text to analyze.

# Define a dictionary 'patterns' where keys are entity types ('PERSON' and 'LOCATION') and values are regex patterns to match specific names and locations.
patterns = {
    'ELECTRONIC DEVICE': r'(laptop|battery|phone)', 
    'LOCATION': r'(restaurant|concert)'
}

named_entities = {}  # Initialize an empty dictionary to store the named entities found in the text.

# Iterate through each entity type (PERSON, LOCATION) and its associated regex pattern.
for entity, pattern in patterns.items():
    # Define a function 'find_entities' that uses re.findall() to find all occurrences of the entity based on the regex pattern in the input text.
    def find_entities(text):
        return re.findall(pattern, text)
    
    # Apply the 'find_entities' function to the 'review_text' column in the DataFrame. 
    # This creates a new column (PERSON or LOCATION) in the DataFrame containing lists of matched entities for each review.
    df[entity] = df['review_text'].apply(find_entities)
    
    # Add the found entities to the 'named_entities' dictionary, storing the lists from each column into the dictionary under the corresponding entity type.
    named_entities[entity] = df[entity].tolist()

# Print the 'named_entities' dictionary, which contains the extracted PERSON and LOCATION entities from the reviews.
print(named_entities)

{'ELECTRONIC DEVICE': [['laptop', 'battery'], [], [], ['phone'], [], [], ['phone'], [], [], [], [], [], [], [], [], [], [], [], [], ['battery']], 'LOCATION': [[], [], [], [], [], ['restaurant'], [], [], [], [], [], [], [], ['concert'], [], [], [], [], [], []]}


In [17]:
import re  # Imports the 're' module for working with regular expressions.
import pandas as pd  # Imports the 'pandas' library for reading and manipulating structured data like CSV files.

# Reads the 'reviews.csv' file into a pandas DataFrame called 'df'. This file should contain a column named 'review_text' with textual data.
df = pd.read_csv('reviews.csv')

# Joins all the text from the 'review_text' column into a single string, separating the reviews with spaces.
text = " ".join(df['review_text'])

# Finds all occurrences of the pattern 'f.x', where 'f' is followed by any character (dot '.' matches any single character), and 'x' is the final character.
result_dot = re.findall(r' a.p', text)

# Finds all occurrences of 'fascinating' followed by zero or more characters ('.*' allows for matching any sequence of characters after 'fascinating').
result_star = re.findall(r'fascinating.*', text)

# Finds all occurrences of 'one' followed by one or more characters ('.+' ensures there is at least one character after 'one').
result_plus = re.findall(r'one.+', text)

# Finds all occurrences of the letter 'm' followed by zero or one character ('m.?' allows for 'm' followed by any single character or nothing at all).
result_question = re.findall(r'm.?', text)

# Finds all occurrences of either 'fox' or 'dog' in the text. The pipe '|' acts as an OR operator, matching either 'fox' or 'dog'.
result_pipe = re.findall(r'fox|dog', text)

# Prints the results for each regular expression search with a label to indicate which metacharacter was used.
print("Dot metacharacter: 'a.p'", result_dot) 
print("\nStar metacharacter: 'fascinating.*'", result_star) 
print("\nPlus metacharacter: 'one.+'", result_plus) 
print("\nQuestion mark metacharacter: 'm.?'", result_question)  
print("\nPipe metacharacter: 'fox|dog'", result_pipe)  

Dot metacharacter: 'a.p' [' app']

Star metacharacter: 'fascinating.*' []

Plus metacharacter: 'one.+' ["one :) it's lightweight and powerful! The software is buggy @@ and crashes frequently... very disappointed :( The restaurant ambiance was nice :) but the food was mediocre *at best*. Absolutely fantastic headphones! Great sound quality and comfortable fit :D ^_^. The movie was a complete waste of time ~ plot was predictable and dull :|. Bought this for my son :) he enjoys it a lot, very kid-friendly! The hotel staff was extremely friendly :) and accommodating! #TopNotch Poor quality fabric, not worth the price :( Wouldn't recommend! Had a *great* time at the amusement park, the rides were thrilling :D! The app interface is ^intuitive^ and easy to navigate :). The concert was amazing, but the seating was too cramped :/ ~ could have been better. This book is a *masterpiece*, full of insightful thoughts and stories :D Received a defective product @@, had to return it immediately :( #Fr

In [11]:
import re  # Imports the 're' module for working with regular expressions.
import pandas as pd  # Imports the 'pandas' library for reading and manipulating structured data like CSV files.

# Reads the 'reviews.csv' file into a pandas DataFrame called 'df'. This file should contain a column named 'review_text' with textual data.
df = pd.read_csv('reviews.csv')

# Joins all the text from the 'review_text' column into a single string, separating the reviews with spaces.
text = " ".join(df['review_text'])

# Finds all occurrences of the pattern 'f.x', where 'f' is followed by any character (dot '.' matches any single character), and 'x' is the final character.
result_dot = re.findall(r'f.x', text)

# Finds all occurrences of 'fascinating' followed by zero or more characters ('.*' allows for matching any sequence of characters after 'fascinating').
result_star = re.findall(r'fascinating.*', text)

# Finds all occurrences of 'one' followed by one or more characters ('.+' ensures there is at least one character after 'one').
result_plus = re.findall(r'one.+', text)

# Finds all occurrences of the letter 'm' followed by zero or one character ('m.?' allows for 'm' followed by any single character or nothing at all).
result_question = re.findall(r'm.?', text)

# Finds all occurrences of either 'fox' or 'dog' in the text. The pipe '|' acts as an OR operator, matching either 'fox' or 'dog'.
result_pipe = re.findall(r'fox|dog', text)

# Prints the results for each regular expression search with a label to indicate which metacharacter was used.
print("Dot metacharacter: 'f.x'", result_dot) 
print("\nStar metacharacter: 'fascinating.*'", result_star) 
print("\nPlus metacharacter: 'one.+'", result_plus) 
print("\nQuestion mark metacharacter: 'm.?'", result_question)  
print("\nPipe metacharacter: 'fox|dog'", result_pipe)  

Dot metacharacter: 'f.x' []

Star metacharacter: 'fascinating.*' []

Plus metacharacter: 'one.+' ["one :) it's lightweight and powerful! The software is buggy @@ and crashes frequently... very disappointed :( The restaurant ambiance was nice :) but the food was mediocre *at best*. Absolutely fantastic headphones! Great sound quality and comfortable fit :D ^_^. The movie was a complete waste of time ~ plot was predictable and dull :|. Bought this for my son :) he enjoys it a lot, very kid-friendly! The hotel staff was extremely friendly :) and accommodating! #TopNotch Poor quality fabric, not worth the price :( Wouldn't recommend! Had a *great* time at the amusement park, the rides were thrilling :D! The app interface is ^intuitive^ and easy to navigate :). The concert was amazing, but the seating was too cramped :/ ~ could have been better. This book is a *masterpiece*, full of insightful thoughts and stories :D Received a defective product @@, had to return it immediately :( #Frustrat

In [12]:
import re  # Imports the 're' module for working with regular expressions.
import pandas as pd  # Imports the 'pandas' library for reading and manipulating structured data like CSV files.

# Reads the 'reviews.csv' file into a pandas DataFrame called 'df'. This file should contain a column named 'review_text' with textual data.
df = pd.read_csv('reviews.csv')

# Joins all the text from the 'review_text' column into a single string, separating the reviews with spaces.
text = " ".join(df['review_text'])

# Matches any sequence where 't' is followed by any two characters and then another 't'. This matches patterns like 'test', 'text', etc.
result_dot = re.findall(r't..t', text)

# Matches sequences starting with 't', followed by zero or more lowercase letters (a-z), and ending with 't'. This matches patterns like 'test', 'text', 'tart', etc.
result_star = re.findall(r't[a-z]*t', text)

# Matches sequences that start with 'P', followed by one or more lowercase letters. This matches patterns like 'Python', 'Program', etc.
result_plus = re.findall(r'P[a-z]+', text)

# Matches the literal string 'regexp?' where 'p' is optional (i.e., zero or one occurrence). This would match both 'regex' and 'regexp'.
result_question = re.findall(r'regexp\?', text)

# Matches any word boundary (`\b`) followed by exactly 5 alphabetic characters (both lowercase and uppercase). This matches words like 'hello', 'world', etc.
result_n = re.findall(r'\b[a-zA-Z]{5}\b', text)

# Matches words that are at least 3 characters long, starting and ending with word boundaries (`\b`). This matches words with 3 or more characters.
result_n_min = re.findall(r'\b[a-zA-Z]{3,}\b', text)

# Matches words that are up to 4 characters long, defined by word boundaries (`\b`). This matches words with a maximum of 4 characters.
result_m_max = re.findall(r'\b[a-zA-Z]{,4}\b', text)

# Matches words that start with 's' and are followed by 1 to 5 alphabetic characters, again defined by word boundaries. This matches words like 'save', 'smart', etc.
result_n_m = re.findall(r'\bs[a-zA-Z]{1,5}\b', text)

# Print the results for each regular expression search with an explanation of the quantifier used.
print("Dot quantifier: 't..t'\n", result_dot)  # Prints the words matching the dot quantifier pattern 't..t'
print("\nStar quantifier: 't[a-z]*t'\n", result_star)  # Prints the words matching the star quantifier pattern 't[a-z]*t'
print("\nPlus quantifier: 'P[a-z]+'\n", result_plus)  # Prints the words matching the plus quantifier pattern 'P[a-z]+'
print("\nQuestion mark quantifier: 'regexp?'\n", result_question)  # Prints the words matching the question mark quantifier pattern 'regexp?'
print("\n{n} quantifier: '[a-zA-Z]{5}'\n", result_n)  # Prints the words matching exactly 5 characters using the '{n}' quantifier.
print("\n{n,} quantifier: '[a-zA-Z]{3,}'\n", result_n_min)  # Prints the words with at least 3 characters using the '{n,}' quantifier.
print("\n{,m} quantifier: '[a-zA-Z]{,4}'\n", result_m_max)  # Prints the words with at most 4 characters using the '{,m}' quantifier.
print("\n{n,m} quantifier: 's[a-zA-Z]{1,5}'\n", result_n_m)  # Prints the words starting with 's' and with 1 to 5 characters using the '{n,m}' quantifier.

Dot quantifier: 't..t'
 ['tast', 'th t', 't* t', 'tuit', 'trat', 'th t', 'twat']

Star quantifier: 't[a-z]*t'
 ['tt', 'tst', 'tweight', 'taurant', 'tast', 'tuit', 'tt', 'thought', 'trat', 'twat', 'tt']

Plus quantifier: 'P[a-z]+'
 ['Poor']

Question mark quantifier: 'regexp?'
 []

{n} quantifier: '[a-zA-Z]{5}'
 ['hours', 'issue', 'after', 'calls', 'sleek', 'phone', 'buggy', 'Great', 'sound', 'movie', 'waste', 'hotel', 'staff', 'worth', 'price', 'great', 'rides', 'could', 'quite', 'noisy', 'Great', 'staff', 'happy', 'could']

{n,} quantifier: '[a-zA-Z]{3,}'
 ['The', 'laptop', 'battery', 'life', 'outstanding', 'lasting', 'over', 'hours', 'Terrible', 'customer', 'service', 'issue', 'wasn', 'resolved', 'after', 'multiple', 'calls', 'Fast', 'shipping', 'but', 'the', 'packaging', 'was', 'damaged', 'upon', 'arrival', 'Disappointing', 'love', 'the', 'sleek', 'design', 'the', 'phone', 'lightweight', 'and', 'powerful', 'The', 'software', 'buggy', 'and', 'crashes', 'frequently', 'very', 'disappoi

In [13]:
import re  # Imports the 're' module to work with regular expressions.
import pandas as pd  # Imports the 'pandas' library for reading and manipulating structured data like CSV files.

# Reads the 'reviews.csv' file into a pandas DataFrame called 'df'. This file should contain a column 'review_text' that includes the text data.
df = pd.read_csv('reviews.csv')

# Joins all the text from the 'review_text' column into a single string, separating the reviews with spaces.
text = " ".join(df['review_text'])

# Uses the \D shorthand class to match non-digit characters in the text. The re.findall() function returns a list of all non-digits, and len() counts the total.
non_digits_count = len(re.findall("\D", text))

# Uses the \d shorthand class to match all digits in the text. The re.findall() function returns a list of all digits, and len() counts the total.
digits_count = len(re.findall("\d", text))

# Uses the \W shorthand class to match all non-word characters (anything that is not alphanumeric, such as punctuation). len() counts the total non-word characters.
non_words_count = len(re.findall("\W", text))

# Uses the \w shorthand class to match all word characters (alphanumeric and underscores). len() counts the total word characters.
words_count = len(re.findall("\w", text))

# Uses the \s shorthand class to match all whitespace characters (spaces, tabs, newlines). len() counts the total whitespace characters.
spaces_count = len(re.findall("\s", text))

# Prints the count of non-digit characters (anything that isn't a number).
print("Non-digits count:", non_digits_count)

# Prints the count of digits (numbers) in the text.
print("Digits count:", digits_count)

# Prints the count of non-word characters (e.g., punctuation marks, symbols).
print("Non-words count:", non_words_count)

# Prints the count of word characters (letters, digits, and underscores).
print("Words count:", words_count)

# Prints the count of spaces (whitespace) in the text.
print("Spaces count:", spaces_count)

Non-digits count: 1454
Digits count: 2
Non-words count: 341
Words count: 1115
Spaces count: 240


In [14]:
import re  # Imports the 're' module for working with regular expressions.
import pandas as pd  # Imports the 'pandas' library for reading and manipulating structured data like CSV files.

# Reads the 'reviews.csv' file into a pandas DataFrame called 'df'. The 'review_text' column is expected to contain the text to be analyzed.
df = pd.read_csv('reviews.csv')

# Joins all the text from the 'review_text' column into a single string, separating each review with a newline character (\n).
text = "\n".join(df['review_text'])

# Compiles a regular expression that looks for the phrase "cat named" followed by a word (\w+), capturing the name of the cat.
cat_regex = re.compile(r"cat named (\w+)")

# Searches the text for the first occurrence of "cat named" followed by a word, and stores the match in 'cat_match'.
cat_match = cat_regex.search(text)

# Compiles a regular expression that looks for the phrase "dog named" followed by a word (\w+), capturing the name of the dog.
dog_regex = re.compile(r"dog named (\w+)")

# Searches the text for the first occurrence of "dog named" followed by a word, and stores the match in 'dog_match'.
dog_match = dog_regex.search(text)

# Compiles a regular expression to search for "black" or "brown and white", using a non-capturing group (?:) to group the colors without storing the match.
color_regex = re.compile(r"(?:black|brown and white)")

# Finds all occurrences of "black" or "brown and white" in the text and stores them in 'color_matches' as a list.
color_matches = color_regex.findall(text)

# Prints the result of the capturing group for the cat's name match.
print("Capturing group result 1:", cat_match)

# Extracts and prints the captured group (cat's name) from 'cat_match' using the group() function.
print(f"John's cat's name is {cat_match.group(1)}\n")

# Prints the result of the capturing group for the dog's name match.
print("Capturing group result 2:", dog_match)

# Extracts and prints the captured group (dog's name) from 'dog_match' using the group() function.
print(f"John's dog's name is {dog_match.group(1)}\n")

# Prints the result of the non-capturing group, which contains all matches of the color patterns in the text.
print("Non-capturing group result:", color_matches)

# Joins the color matches into a string and prints them as a list of colors found in the text.
print(f"The colors of John's pets are: {', '.join(color_matches)}")

Capturing group result 1: None


AttributeError: 'NoneType' object has no attribute 'group'