**Task 1:** Perform sentiment analysis on Yelp reviews using the VADER sentiment analyzer. 
The goal is to analyze Yelp reviews for sentiment and store the results.


In [None]:
#Import necessary libraries
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Read the Yelp reviews CSV into a DataFrame
reviews_path = "/Users/kaykaydaou/Desktop/MMA/MMA WINTER 25/W1 TEXT ANALYTICS/Labs/lab 3/yelp_reviews.csv"
reviews_df = pd.read_csv(reviews_path)

# Create the VADER analyzer that get_vader_sentiment uses to score each review
sia = SentimentIntensityAnalyzer()

#Function to calculate VADER sentiment for each review
def get_vader_sentiment(review):
    """
    Score a single review with the module-level VADER analyzer `sia`.

    Returns a 4-tuple of scores in the order
    (positive, negative, neutral, compound).
    """
    scores = sia.polarity_scores(review)
    # Pull the four scores out in a fixed order so callers can rely on position
    return tuple(scores[key] for key in ('pos', 'neg', 'neu', 'compound'))

# Apply the VADER sentiment analysis to the 'Review' column.
# apply(...) with a pd.Series result returns a DataFrame with one column per
# score. Assigning it to a list of column names copies the columns over
# positionally. (Tuple-unpacking that DataFrame would iterate over its column
# labels 0..3 and store those constants instead of the scores.)
reviews_df[['pos', 'neg', 'neu', 'compound']] = reviews_df['Review'].apply(
    lambda x: pd.Series(get_vader_sentiment(x))
)

# Save the updated DataFrame with the VADER sentiment scores to a CSV file
# Ensure the file is named 'yelp_reviews_vader.csv'
reviews_df.to_csv("yelp_reviews_vader.csv", index=False)


**Task 2:** Preprocess text and perform sentiment analysis using custom rules and WordNet similarity. The goal is to preprocess the text, extract meaningful phrases, and determine the sentiment of each review using custom rules and word similarity.


In [4]:
#Import necessary libraries
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn

# Shared helpers for preprocess_text: the English stop-word set used for
# filtering and the WordNet lemmatizer used to normalize tokens
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Turn raw text into a list of cleaned, lemmatized tokens.

    The text is tokenized. Tokens that are not purely alphanumeric, or whose
    lowercase form is a stop word, are dropped. Each remaining token is
    lowercased and lemmatized.
    """
    cleaned = []
    for token in word_tokenize(text):
        lowered = token.lower()
        # Skip punctuation-like tokens and stop words
        if not token.isalnum() or lowered in stop_words:
            continue
        cleaned.append(lemmatizer.lemmatize(lowered))
    return cleaned

def extract_phrases(tokens):
    """
    Extracts two-word phrases from the tokens based on POS-tag patterns.

    The tokens are POS-tagged and every adjacent pair (word1, word2) is
    checked against five rules. Rules 2-4 also look at the tag of the word
    after the pair.

    Parameters:
        tokens: list of token strings (e.g. the output of preprocess_text).

    Returns:
        A list of "word1 word2" strings, one per matching pair, in order of
        appearance. Empty when there are fewer than two tokens.
    """
    pos_tagged = pos_tag(tokens)

    adverb_tags = ('RB', 'RBR', 'RBS')
    noun_tags = ('NN', 'NNS')
    verb_tags = ('VB', 'VBD', 'VBN', 'VBG')

    phrases = []

    # Visit every adjacent pair, including the last one. The final pair has
    # no following word, so its third tag is None (i.e. "not a noun").
    for i in range(len(pos_tagged) - 1):
        word1, tag1 = pos_tagged[i]
        word2, tag2 = pos_tagged[i + 1]
        tag3 = pos_tagged[i + 2][1] if i + 2 < len(pos_tagged) else None

        first_is_adjective = tag1.startswith('JJ')
        second_is_adjective = tag2.startswith('JJ')
        followed_by_noun = tag3 in noun_tags

        # Rule 1: JJ followed by NN or NNS
        if first_is_adjective and tag2 in noun_tags:
            matched = True
        # Rule 2: RB, RBR, or RBS followed by JJ, not followed by NN or NNS
        elif tag1 in adverb_tags and second_is_adjective and not followed_by_noun:
            matched = True
        # Rule 3: JJ followed by JJ, not followed by NN or NNS
        elif first_is_adjective and second_is_adjective and not followed_by_noun:
            matched = True
        # Rule 4: NN or NNS followed by JJ, not followed by NN or NNS
        elif tag1 in noun_tags and second_is_adjective and not followed_by_noun:
            matched = True
        # Rule 5: RB, RBR, or RBS followed by VB, VBD, VBN, or VBG
        elif tag1 in adverb_tags and tag2 in verb_tags:
            matched = True
        else:
            matched = False

        if matched:
            phrases.append(f'{word1} {word2}')

    return phrases

def wordnet_similarity(word1, word2):
    """
    Return the highest Wu-Palmer similarity between any synset of word1 and
    any synset of word2, or 0 when either word has no WordNet synsets.
    """
    first_synsets = wn.synsets(word1)
    second_synsets = wn.synsets(word2)

    # Nothing to compare if WordNet does not know one of the words
    if not first_synsets or not second_synsets:
        return 0

    scores = []
    for left in first_synsets:
        for right in second_synsets:
            # A falsy result (e.g. None) is counted as 0
            scores.append(left.wup_similarity(right) or 0)
    return max(scores)

# Define positive and negative reference words
positive_refs = ["delicious", "tasty", "amazing", "great", "wonderful", "fantastic", "excellent"]
negative_refs = ["disgusting", "bad", "terrible", "awful", "horrible", "inedible", "poor"]

def semantic_orientation(phrase):
    """
    Calculates the semantic orientation of a phrase by comparing it with
    multiple positive and negative reference words.

    For each word in the phrase, the best WordNet similarity to any positive
    reference word and the best similarity to any negative reference word are
    taken. The per-word best scores are summed (not averaged) over the phrase.

    Returns:
        Summed positive score minus summed negative score. A value above 0
        means the phrase is closer to the positive references. An empty
        phrase yields 0.
    """
    words = phrase.split()
    # Best match against each reference list, accumulated over the phrase's words
    pos_score = sum(max(wordnet_similarity(word, ref) for ref in positive_refs) for word in words)
    neg_score = sum(max(wordnet_similarity(word, ref) for ref in negative_refs) for word in words)
    # Return the difference to get the orientation score
    return pos_score - neg_score

def analyze_sentiment(document):
    """
    Analyzes the sentiment of a document based on its phrases' semantic orientation
    and returns both the sentiment score and the sentiment label.

    The document is preprocessed into tokens, candidate phrases are extracted,
    and the semantic orientation of every phrase is summed.

    Returns:
        (total_orientation, sentiment_label), where sentiment_label is
        'Positive' when total_orientation is greater than 0 and 'Negative'
        otherwise. A document with no extracted phrases scores 0 and is
        therefore labelled 'Negative'.
    """
    tokens = preprocess_text(document)
    phrases = extract_phrases(tokens)
    total_orientation = sum(semantic_orientation(phrase) for phrase in phrases)

    # Binary label: only a strictly positive total counts as 'Positive'
    sentiment_label = 'Positive' if total_orientation > 0 else 'Negative'

    return total_orientation, sentiment_label  # Return both the sentiment score and label

# Read the Yelp reviews into a DataFrame
data = pd.read_csv("/Users/kaykaydaou/Desktop/MMA/MMA WINTER 25/W1 TEXT ANALYTICS/Labs/lab 3/yelp_reviews.csv")

# Run analyze_sentiment on every review; wrapping each (score, label) result
# in a pd.Series makes apply return a two-column DataFrame
sentiment_results = data['Review'].apply(lambda review: pd.Series(analyze_sentiment(review)))
data[['Sentiment Score', 'Sentiment Label']] = sentiment_results

# Write the reviews together with their score and label to a new CSV file
data.to_csv("yelp_reviews_custom_sentiment.csv", index=False)



**Task 3:** Preprocess Yelp reviews and calculate the Euclidean distances between them based on their term frequencies. 
The goal is to process the text data and then compute how close or far apart the reviews are from each other based on the words they contain.


In [8]:
#Import necessary libraries 
import pandas as pd
import nltk
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import euclidean_distances

# Step 1: Build the English stop-word set and the WordNet lemmatizer
# that preprocess_text relies on
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 2: Preprocess the text
def preprocess_text(text):
    """
    Clean a review and return it as a single space-separated string.

    Every character that is not an ASCII letter or whitespace is removed, the
    text is lowercased and tokenized, stop words are dropped, and the
    remaining tokens are lemmatized.
    """
    # Strip everything except letters and whitespace
    letters_only = re.sub(r'[^a-zA-Z\s]', '', text)

    kept = []
    for token in word_tokenize(letters_only.lower()):
        if token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return ' '.join(kept)
    
# Read the Yelp reviews into a DataFrame
data = pd.read_csv("/Users/kaykaydaou/Desktop/MMA/MMA WINTER 25/W1 TEXT ANALYTICS/Labs/lab 3/yelp_reviews.csv")

# Step 3: Clean every review and keep the result in its own column
data['Cleaned_Reviews'] = data['Review'].map(preprocess_text)

# Step 4: Build the document-term count matrix from the cleaned reviews
vectorizer = CountVectorizer()
cleaned_reviews = data['Cleaned_Reviews']
term_matrix = vectorizer.fit_transform(cleaned_reviews)

# Step 5: Pairwise Euclidean distance between every pair of reviews
distances = euclidean_distances(term_matrix)

# Steps 6-7: Label rows and columns with the review index so each cell
# reads as "distance from review <row> to review <column>"
review_index = data.index
distance_df = pd.DataFrame(distances, index=review_index, columns=review_index)

# Step 8: Write the distance matrix (including its index) to a CSV file
distance_df.to_csv("yelp_reviews_euclidean_distances.csv", index=True)


**Task 4:** Preprocess Yelp reviews and calculate the cosine distances between them based on their term frequencies.
The goal is to preprocess the review text and then measure how similar or different the reviews are using Cosine distance.


In [9]:
#Import necessary libraries
import pandas as pd
import nltk
import re
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Build the English stop-word set and the WordNet lemmatizer
# that preprocess_text relies on
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Step 2: Preprocess the text
def preprocess_text(text):
    """
    Clean a review and return it as a single space-separated string.

    Non-letter, non-whitespace characters are removed, the text is lowercased
    and tokenized, stop words are filtered out, and the surviving tokens are
    lemmatized.
    """
    # Keep only letters and whitespace, then normalize case
    lowered = re.sub(r'[^a-zA-Z\s]', '', text).lower()
    content_words = (word for word in word_tokenize(lowered) if word not in stop_words)
    return ' '.join(lemmatizer.lemmatize(word) for word in content_words)

# Read the Yelp reviews into a DataFrame
data = pd.read_csv("/Users/kaykaydaou/Desktop/MMA/MMA WINTER 25/W1 TEXT ANALYTICS/Labs/lab 3/yelp_reviews.csv")

# Step 3: Clean every review and keep the result in its own column
data['Cleaned_Reviews'] = data['Review'].map(preprocess_text)

# Step 4: Build the document-term count matrix from the cleaned reviews
vectorizer = CountVectorizer()
cleaned_reviews = data['Cleaned_Reviews']
term_matrix = vectorizer.fit_transform(cleaned_reviews)

# Step 5: Pairwise cosine similarity between every pair of reviews
cosine_similarities = cosine_similarity(term_matrix)

# Step 6: Turn similarity into distance (1 - cosine similarity)
cosine_distances = 1 - cosine_similarities

# Step 7: Label rows and columns with the review index for easier viewing
review_index = data.index
cosine_distance_df = pd.DataFrame(cosine_distances, index=review_index, columns=review_index)

# Step 8: Write the distance matrix (including its index) to a CSV file
cosine_distance_df.to_csv("yelp_reviews_cosine_distances.csv", index=True)


**Task 5:** Build a k-NN (k-Nearest Neighbors) model to classify the sentiment of Yelp reviews as either positive or negative. 
The goal is to classify the sentiment of reviews using a machine learning model and evaluate its accuracy.


In [None]:
# k-NN Sentiment Classification Script

# Import necessary libraries: re for text cleaning, pandas for data handling,
# scikit-learn for vectorizing, splitting, modelling and evaluation
import re

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Step 1: Load your dataset
# Same yelp_reviews.csv location that the earlier cells read from
reviews_df = pd.read_csv("/Users/kaykaydaou/Desktop/MMA/MMA WINTER 25/W1 TEXT ANALYTICS/Labs/lab 3/yelp_reviews.csv")

# Step 2: Convert ratings to binary sentiment (0 for < 3.5, 1 for >= 3.5)
# NOTE(review): assumes the CSV has a numeric 'Rating' column -- confirm
reviews_df['Sentiment'] = reviews_df['Rating'].apply(lambda rating: 1 if rating >= 3.5 else 0)

# Step 3: Preprocess the text data (cleaning reviews)
def preprocess_text(text):
    """
    Preprocesses the input text by converting to lowercase, removing punctuation, and removing numbers.

    Returns the cleaned text as a string.
    """
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation: drop anything that is neither a word character nor whitespace
    text = re.sub(r'[^\w\s]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    return text

# Apply preprocessing to the 'Review' column
reviews_df['Cleaned_Review'] = reviews_df['Review'].apply(preprocess_text)

# Step 4: Vectorize the cleaned reviews using CountVectorizer
# stop_words='english' removes common English stop words during vectorization
vectorizer = CountVectorizer(stop_words='english')

# Fit the vectorizer to the 'Cleaned_Review' column and transform the reviews into a feature matrix
X = vectorizer.fit_transform(reviews_df['Cleaned_Review'])
y = reviews_df['Sentiment']  # Target variable (Sentiment)

# Step 5: Split the data into training and test sets (80% train, 20% test)
# random_state is fixed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 6: Apply k-NN model (5 neighbours, cosine distance between count vectors)
knn = KNeighborsClassifier(n_neighbors=5, metric='cosine')
knn.fit(X_train, y_train)

# Step 7: Evaluate the model
# Predict the sentiment on the test set
y_pred = knn.predict(X_test)

# Generate a classification report and accuracy score
classification_report_output = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

# Print the classification report and accuracy
print("Classification Report:\n", classification_report_output)
print(f"Accuracy: {accuracy}")

# Step 8: Predict sentiment for a new document
new_document = ["The product was absolutely terrible"]

# Vectorize the new document using the same (already fitted) vectorizer, so it
# is mapped onto the vocabulary learned from the reviews
new_doc_vector = vectorizer.transform(new_document)

# Predict the sentiment of the new document (array with one label: 0 or 1)
predicted_sentiment = knn.predict(new_doc_vector)
