In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk import pos_tag
from collections import Counter
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
from datetime import datetime

In [2]:
data = pd.read_json('E:\Post ADP\Capstone Project\FYP\datasets\Cleaned_data_tokenized.json')

  data = pd.read_json('E:\Post ADP\Capstone Project\FYP\datasets\Cleaned_data_tokenized.json')


In [3]:
review_text = data['Review Text']
review_title = data['Review Title']
description = data['Description']
avg_rating = data['Average Rating']
features = data['Features']
Category = data['Category']
review_rating = data['Review Rating']
review_date = data['Review Date']

In [4]:
print(review_text.head())

0    using, Ninja, Call, Pro, Plus, smartwatch, mus...
1    using, Ninja, Call, Pro, Plus, smartwatch, mus...
2    Overall, product, good, two, things, would, li...
3    Affordable, come, variety, features, including...
4    call, functionality, seamless, clear, audio, e...
Name: Review Text, dtype: object


In [None]:
# convert text into numerical form (bag of words model)
vectorizer = CountVectorizer(stop_words='english')
matrix = vectorizer.fit_transform(review_text, review_title)

extracted_features = []

for i, row in data.iterrows(): 
    review = row["Review Text"].lower()
    review_date = row['Review Date']  
    

    features = {}
    # Polarity and subjectivity analysis (TextBlob model)
    Sentiment = SentimentIntensityAnalyzer()
    sentiment_score = Sentiment.polarity_scores(review)
    features['Positive_score'] = sentiment_score['pos']
    features['negative_score'] = sentiment_score['neg']
    features['neutral_score'] = sentiment_score['neu']
    features['sentiment_score'] = sentiment_score
    testimonial = TextBlob(review)
    features['subjectivity'] = testimonial.sentiment.subjectivity
    
    # Word-level sentiment analysis
    neg_word_count = 0
    pos_word_count = 0
    words = review.split()
    for word in words:
        word_polarity = TextBlob(word).sentiment.polarity
        if word_polarity < 0:
            neg_word_count += 1
        elif word_polarity > 0:
            pos_word_count += 1
    features['Negative_Count'] = neg_word_count
    features['Positive_Count'] = pos_word_count

    #  Count the Articles and negations 
    articles = ['a', 'an', 'the']
    negations = ['no', 'not', 'none', 'nobody', 'nothing',
                 'neither', 'nowhere', 'never', 'hardly', 'barely', 'scarcely']
    
    Art = Nega = 0
    for w in words:
        if w in articles:
            Art += 1
        elif w in negations:
            Nega += 1
    features['Articles'] = Art
    features['Negations'] = Nega

    # Count the noun, adjective, verb, adverb, pronoun, unique words, word count, review length and authenticity score

    Noun = Adj = Verb = Adv = Pro = 0
    tags = pos_tag(words)
    counts = Counter(tag for word, tag in tags)

    Noun += sum([counts[i] for i in counts.keys() if 'NN' in i])
    Adj += sum([counts[i] for i in counts.keys() if 'JJ' in i])
    Verb += sum([counts[i] for i in counts.keys() if 'VB' in i])
    Adv += sum([counts[i] for i in counts.keys() if 'RB' in i])
    Pro += sum([counts[i] for i in counts.keys() if (('PRP' in i) or ('PRP$' in i) or ('WP' in i) or ('WP$' in i))])
    unique_words_count = len(set(words))
    word_count = len(words)
    review_length = len(review)

    if word_count > 0:
        authenticity_score = (Pro + unique_words_count - neg_word_count) / word_count
    else:
        authenticity_score = 0 

    features['Authenticity'] = authenticity_score
    features['Noun'] = Noun
    features['Adjectives'] = Adj
    features['Verb'] = Verb
    features['Adverb'] = Adv
    features['Pronoun'] = Pro
    features['Word_Count'] = word_count
    features['Unique_words'] = unique_words_count
    features['Review_Length'] = review_length

    features['review_date'] = datetime.strptime(review_date, "%d-%b-%y")
    extracted_features.append(features)

    # print(review_timestamp)
features_data = pd.DataFrame(extracted_features)

# measure the similarity (TF-IDF and Cosine Similarity)

def calculate_similarity(matrix1, matrix2, chunk_size=1000):
    num_rows = matrix1.shape[0]
    similarity_scores = np.zeros(num_rows)
    
    for start_idx in range(0, num_rows, chunk_size):
        end_idx = min(start_idx + chunk_size, num_rows)
        chunk_similarity = cosine_similarity(matrix1[start_idx:end_idx], matrix2).max(axis=1)
        similarity_scores[start_idx:end_idx] = chunk_similarity
    
    return similarity_scores


tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

tfidf_reviews = tfidf_vectorizer.fit_transform(review_text)
tfidf_description = tfidf_vectorizer.transform(description)
tfidf_features = tfidf_vectorizer.transform(features)
tfidf_categories = tfidf_vectorizer.transform(Category)

features_data['similarity_with_description ']= calculate_similarity(tfidf_reviews, tfidf_description)
features_data['similarity_with_features'] = calculate_similarity(tfidf_reviews, tfidf_features)
features_data['similarity_with_category'] = calculate_similarity(tfidf_reviews, tfidf_categories)
features_data['Review_Text'] = review_text


print(features_data.head())

ValueError: time data '14-Oct-24' does not match format '%d-%b-%y %H:%M:%S'

In [6]:
output_file_path = "processed_data_with_sentiment.csv"
features_data.to_csv(output_file_path)

print("Feature extraction complete! Processed file saved as:", output_file_path)

Feature extraction complete! Processed file saved as: processed_data_with_sentiment.csv
