In [26]:
import spacy
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# Download the NLTK data packages this notebook relies on: the 'punkt'
# tokenizer models, the WordNet corpus and the stopword lists.
# nltk.download() is a no-op message when the package is already present
# (see the "already up-to-date" log lines in this cell's output).
# NOTE(review): newer NLTK releases appear to look up 'punkt_tab' instead of
# 'punkt' — confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kyana\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kyana\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kyana\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [27]:
# Shared NLP resources used by the functions defined in later cells:
#   lemmatizer - WordNet lemmatizer applied per token in preprocess_text
#   stop_words - English stopword list, stored as a set for membership tests
#   nlp        - spaCy 'en_core_web_sm' pipeline called by perform_ner
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
nlp = spacy.load('en_core_web_sm')

def preprocess_text(data, column='Review'):
    """Clean every review in ``data[column]`` and return the cleaned strings.

    Each review is lowercased, tokenized with ``word_tokenize``, stripped of
    tokens that are not purely alphanumeric (punctuation and fragments such
    as "'s"), stripped of stopwords found in the module-level ``stop_words``
    set, and lemmatized with the module-level ``lemmatizer``. The surviving
    tokens are joined with single spaces.

    Args:
        data: Mapping or DataFrame indexable by ``column``; the value must be
            an iterable of review strings.
        column: Name of the field holding the raw reviews. Defaults to
            ``'Review'``, the field that used to be hard-coded.

    Returns:
        list[str]: One cleaned string per input review, in input order.
        Missing reviews (``None`` or float NaN) yield ``''`` so the result
        stays aligned row-for-row with the input.
    """
    preprocessed_data = []
    for review in data[column]:
        # Missing entries would crash on .lower(); emit an empty string
        # instead of dropping the row so positions still match the input.
        # ``review != review`` is only true for NaN.
        if review is None or (isinstance(review, float) and review != review):
            preprocessed_data.append('')
            continue

        # Lowercase first: the stopword set is lowercase.
        tokens = word_tokenize(review.lower())
        # One pass: keep alphanumeric, non-stopword tokens and lemmatize them.
        # Filtering happens on the un-lemmatized token, as before.
        tokens = [
            lemmatizer.lemmatize(word)
            for word in tokens
            if word.isalnum() and word not in stop_words
        ]
        preprocessed_data.append(' '.join(tokens))
    return preprocessed_data

# Sample dataset: ten short restaurant reviews stored under the 'Review' key,
# which is the field preprocess_text reads. Several reviews mention named
# entities (brands, places, dates) that the later NER cells try to detect.
data = {
    'Review': [
        'At McDonald\'s the food was ok and the service was bad.',
        'I would not recommend this Japanese restaurant to anyone.',
        'I loved this restaurant when I traveled to Thailand last summer.',
        'The menu of Loving has a wide variety of options.',
        'The staff was friendly and helpful at Google\'s employees restaurant.',
        'The ambiance at Bella Italia is amazing, and the pasta dishes are delicious.',
        'I had a terrible experience at Pizza Hut. The pizza was burnt, and the service was slow.',
        'The sushi at Sushi Express is always fresh and flavorful.',
        'The steakhouse on Main Street has a cozy atmosphere and excellent steaks.',
        'The dessert selection at Sweet Treats is to die for!'
    ]
}

# Clean every review; the result is a list of strings in the same order as
# data['Review'], printed here for inspection.
preprocessed_data = preprocess_text(data)
print(preprocessed_data)


['mcdonald food ok service bad', 'would recommend japanese restaurant anyone', 'loved restaurant traveled thailand last summer', 'menu loving wide variety option', 'staff friendly helpful google employee restaurant', 'ambiance bella italia amazing pasta dish delicious', 'terrible experience pizza hut pizza burnt service slow', 'sushi sushi express always fresh flavorful', 'steakhouse main street cozy atmosphere excellent steak', 'dessert selection sweet treat die']


In [28]:
import pandas as pd

# Pair each original review with its cleaned counterpart, one row per review
cleaned_data = pd.DataFrame(
    dict(
        Review=data['Review'],
        Cleaned_Review=preprocessed_data,
    )
)
print(cleaned_data)

                                              Review  \
0  At McDonald's the food was ok and the service ...   
1  I would not recommend this Japanese restaurant...   
2  I loved this restaurant when I traveled to Tha...   
3  The menu of Loving has a wide variety of options.   
4  The staff was friendly and helpful at Google's...   
5  The ambiance at Bella Italia is amazing, and t...   
6  I had a terrible experience at Pizza Hut. The ...   
7  The sushi at Sushi Express is always fresh and...   
8  The steakhouse on Main Street has a cozy atmos...   
9  The dessert selection at Sweet Treats is to di...   

                                      Cleaned_Review  
0                       mcdonald food ok service bad  
1         would recommend japanese restaurant anyone  
2     loved restaurant traveled thailand last summer  
3                    menu loving wide variety option  
4  staff friendly helpful google employee restaurant  
5  ambiance bella italia amazing pasta dish delic... 

In [29]:
def perform_ner(text):
    """Run the module-level spaCy pipeline ``nlp`` on ``text``.

    Returns:
        list[tuple[str, str]]: One ``(entity text, entity label)`` pair per
        entity in ``doc.ents``, in document order; empty if none were found.
    """
    entities = []
    for entity in nlp(text).ents:
        entities.append((entity.text, entity.label_))
    return entities

# Run NER over the cleaned (lowercased, stopword-free) reviews and print the
# resulting Series: one list of (entity text, label) pairs per review.
# NOTE(review): the printed output tags whole lowercase runs such as
# 'mcdonald food ok service' as a single ORG — presumably a side effect of the
# cleaning having removed casing and function words; compare with NER on the
# raw 'Review' column before relying on these entities.
ner_results = cleaned_data['Cleaned_Review'].apply(perform_ner)
print(ner_results)

0         [(mcdonald food ok service, ORG)]
1                        [(japanese, NORP)]
2    [(thailand, GPE), (last summer, DATE)]
3                                        []
4                           [(google, ORG)]
5            [(ambiance bella italia, ORG)]
6                                        []
7           [(sushi sushi express, PERSON)]
8                                        []
9                                        []
Name: Cleaned_Review, dtype: object


In [30]:
# Download the POS tagger model package before nltk.pos_tag is called below.
# NOTE(review): newer NLTK releases appear to look up
# 'averaged_perceptron_tagger_eng' instead — confirm against the installed
# NLTK version.
nltk.download('averaged_perceptron_tagger')

def perform_pos_tagging(text):
    """Tokenize ``text`` with ``word_tokenize`` and tag it with ``nltk.pos_tag``.

    Returns:
        The value returned by ``nltk.pos_tag`` for the tokens; in this
        notebook's output that is a list of ``(token, tag)`` tuples.
    """
    return nltk.pos_tag(word_tokenize(text))

# Tag every cleaned review and print the resulting Series: one list of
# (token, tag) pairs per review, indexed like cleaned_data.
pos_results = cleaned_data['Cleaned_Review'].apply(perform_pos_tagging)
print(pos_results)

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\kyana\AppData\Roaming\nltk_data...


0    [(mcdonald, NNS), (food, NN), (ok, JJ), (servi...
1    [(would, MD), (recommend, VB), (japanese, JJ),...
2    [(loved, VBN), (restaurant, NN), (traveled, VB...
3    [(menu, NN), (loving, VBG), (wide, JJ), (varie...
4    [(staff, NN), (friendly, RB), (helpful, JJ), (...
5    [(ambiance, NN), (bella, NN), (italia, NN), (a...
6    [(terrible, JJ), (experience, NN), (pizza, NN)...
7    [(sushi, NN), (sushi, NN), (express, NN), (alw...
8    [(steakhouse, NN), (main, JJ), (street, NN), (...
9    [(dessert, JJ), (selection, NN), (sweet, JJ), ...
Name: Cleaned_Review, dtype: object


[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


In [31]:
# Annotate both text columns (raw and cleaned) with NER and POS results,
# adding one new column per (annotator, text column) pair in this order
for target, annotate, source in (
    ('NER_Raw', perform_ner, 'Review'),
    ('NER_Cleaned', perform_ner, 'Cleaned_Review'),
    ('POS_Raw', perform_pos_tagging, 'Review'),
    ('POS_Cleaned', perform_pos_tagging, 'Cleaned_Review'),
):
    cleaned_data[target] = cleaned_data[source].apply(annotate)

# Print the dataset with NER and POS results
print(cleaned_data)


                                              Review  \
0  At McDonald's the food was ok and the service ...   
1  I would not recommend this Japanese restaurant...   
2  I loved this restaurant when I traveled to Tha...   
3  The menu of Loving has a wide variety of options.   
4  The staff was friendly and helpful at Google's...   
5  The ambiance at Bella Italia is amazing, and t...   
6  I had a terrible experience at Pizza Hut. The ...   
7  The sushi at Sushi Express is always fresh and...   
8  The steakhouse on Main Street has a cozy atmos...   
9  The dessert selection at Sweet Treats is to di...   

                                      Cleaned_Review  \
0                       mcdonald food ok service bad   
1         would recommend japanese restaurant anyone   
2     loved restaurant traveled thailand last summer   
3                    menu loving wide variety option   
4  staff friendly helpful google employee restaurant   
5  ambiance bella italia amazing pasta dish del