# updating all dataset files

In [1]:
import os
# import pymongo
import json
# from nltk.sentiment import SentimentIntensityAnalyzer
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
# Download the NLTK resources the cells below rely on:
#   vader_lexicon -> SentimentIntensityAnalyzer
#   punkt         -> word_tokenize
#   stopwords     -> stopwords.words('english')
#   wordnet       -> WordNetLemmatizer
# The captured cell output shows each package reported as "already up-to-date"
# when it is present locally.
nltk.download('vader_lexicon')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\GS\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\GS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\GS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\GS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
# Load the pre-trained sentence-embedding model once at notebook scope;
# calculate_similarity() reads this global to encode both sentences.
# NOTE(review): the upstream model card for 'bert-base-nli-mean-tokens' appears to
# mark this checkpoint as deprecated — confirm whether a newer model should be used.
bert_model = SentenceTransformer('bert-base-nli-mean-tokens')



### directory path

In [3]:
# Directory that is scanned for *.json dataset files by jsonToCsv() and by the
# update loop at the end of the notebook.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
homePath = "C:/Hammad Aslam/BS IT (post ADP)/3rd Semester/Capstone Project/Project/backend/datasets/categories/allFiles"

### preprocessing and calculating the similarity

In [4]:
# Lazily-filled cache of the English stop-word set (see _english_stopwords()).
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Return the English stop words as a frozenset, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


# Function to preprocess text
def preprocess_text(text):
    """Normalise ``text`` into a space-separated string of lemmatised content words.

    Processing steps, in order:
      1. tokenize with ``word_tokenize``;
      2. keep purely alphabetic tokens and lowercase them;
      3. drop English stop words;
      4. lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives the filters.
    """
    # The stop-word list used to be re-read for every single token, which made
    # the filter cost grow with (tokens x stop words) plus one corpus load per
    # token. A cached frozenset gives O(1) membership tests and a single load.
    stop_words = _english_stopwords()
    lemmatizer = WordNetLemmatizer()

    alphabetic = (token.lower() for token in word_tokenize(text) if token.isalpha())
    content_words = (token for token in alphabetic if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in content_words)

# Function to calculate cosine similarity between two sentences
def calculate_similarity(sentence1, sentence2):
    """Return the cosine similarity of two sentences' embeddings, clamped to >= 0.

    Both sentences are cleaned with ``preprocess_text`` and encoded with the
    notebook-level ``bert_model`` before the cosine similarity is computed.

    Args:
        sentence1: First sentence; anything that is not a ``str`` yields 0.0.
        sentence2: Second sentence; anything that is not a ``str`` yields 0.0.

    Returns:
        A plain Python ``float``. It is 0.0 when either input is not a string,
        is empty, or becomes empty after preprocessing (for example text made
        only of digits, punctuation or stop words). Negative similarities are
        clamped to 0.0.
    """
    if not isinstance(sentence1, str) or not isinstance(sentence2, str):
        return 0.0  # Return zero
    if not sentence1 or not sentence2:
        return 0.0  # Return zero similarity if any sentence is empty

    # Preprocess sentences. A sentence can lose every token here, and an
    # embedding of the empty string carries no information about the text, so
    # treat that case exactly like an empty input instead of scoring it.
    preprocessed_sentence1 = preprocess_text(sentence1)
    if not preprocessed_sentence1:
        return 0.0
    preprocessed_sentence2 = preprocess_text(sentence2)
    if not preprocessed_sentence2:
        return 0.0

    # Encode sentences using BERT model
    embedding1 = bert_model.encode(preprocessed_sentence1)
    embedding2 = bert_model.encode(preprocessed_sentence2)

    # Calculate cosine similarity between embeddings
    similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]

    # float(...) keeps the return type uniform (the score itself is a NumPy
    # scalar) so the result can be passed to json.dump without conversion.
    return float(max(similarity_score, 0.0))

### updates the data with similarity scores and sentiment scores

In [8]:
def _split_categories(raw_categories):
    """Return the individual category names of a product as a list of strings.

    A sequence of names is joined with ', ' and split again, so an entry that
    itself contains ', ' contributes several categories. A plain string is
    split directly rather than being joined character by character. The result
    always has at least one element (possibly the empty string).
    """
    if isinstance(raw_categories, str):
        joined = raw_categories
    else:
        joined = ', '.join(raw_categories)
    return joined.split(', ')


def update_data(data):
    """Annotate every review in ``data`` with similarity and sentiment scores.

    For each review of each product the following keys are written in place:
      * ``similarity_text_description`` - review body vs. ``product_description``
      * ``similarity_text_features``    - review body vs. ``review_topics``
      * ``similarity_text_categories``  - mean similarity of the review body to
        each of the product's categories
      * ``sentiment_scores``            - the ``compound`` value returned by
        ``SentimentIntensityAnalyzer.polarity_scores``

    Args:
        data: Iterable of product dicts. Each needs a ``category`` entry, a
            ``reviews`` list of dicts with ``review_body`` and ``review_topics``,
            and, when it has at least one review, ``product_description``.

    Returns:
        The same ``data`` object, mutated in place.
    """
    # One analyzer for the whole run instead of a new instance per review.
    sia = SentimentIntensityAnalyzer()

    for product in data:
        categories = _split_categories(product['category'])

        for review in product['reviews']:
            review_body = review['review_body']
            # NOTE(review): a non-string review_topics (e.g. a list) scores 0.0
            # because calculate_similarity only accepts strings — confirm schema.
            features_str = review['review_topics']

            similarity_text_description = calculate_similarity(review_body, product['product_description'])
            similarity_text_features = calculate_similarity(review_body, features_str)

            # Average similarity over all categories; `categories` is never
            # empty, so the division is safe.
            category_total = sum(
                (calculate_similarity(review_body, category) for category in categories),
                0.0,
            )
            similarity_text_categories = category_total / len(categories)

            # A missing or non-string body is scored as empty text.
            sentiment_scores = sia.polarity_scores(review_body if isinstance(review_body, str) else '')

            # Store plain Python floats so the records stay JSON-serializable.
            review['similarity_text_description'] = float(similarity_text_description)
            review['similarity_text_features'] = float(similarity_text_features)
            review['similarity_text_categories'] = float(similarity_text_categories)
            review['sentiment_scores'] = sentiment_scores['compound']

    return data

### reads a file and writes the updated data back into the same file

In [7]:
def _to_json_serializable(obj):
    """Recursively replace NumPy values with built-in Python equivalents.

    NumPy scalars become Python scalars, arrays become nested lists, and lists,
    tuples and dicts are rebuilt with their contents converted. Anything else
    is returned unchanged.
    """
    if isinstance(obj, np.generic):
        return obj.item()
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    if isinstance(obj, (list, tuple)):
        return [_to_json_serializable(item) for item in obj]
    if isinstance(obj, dict):
        return {key: _to_json_serializable(value) for key, value in obj.items()}
    return obj


def readFiles(file):
    """Load a JSON dataset, score its reviews with ``update_data`` and overwrite it.

    Args:
        file: Path of the JSON file to update in place.

    Raises:
        TypeError: If the updated data still holds a value ``json`` cannot
            serialize. The file on disk is left untouched in that case.
    """
    # JSON is read and written as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    newData = update_data(data)

    # Serialize fully in memory BEFORE reopening the file for writing: opening
    # with 'w' truncates immediately, so a serialization error raised halfway
    # through json.dump would otherwise destroy the dataset.
    payload = json.dumps(_to_json_serializable(newData), indent=4)

    with open(file, 'w', encoding="utf-8") as json_file:
        json_file.write(payload)


### accessing each file exactly once in the directory

In [11]:
import csv
import json
import os

def jsonToCsv(attributes, output_file):
    """Export selected review attributes from every JSON file in ``homePath`` to one CSV.

    The CSV starts with a single header row (``attributes``) followed by one
    row per review. An attribute a review does not have is written as an empty
    field.

    Args:
        attributes: Review keys to export; also used as the header row.
        output_file: Path of the CSV file to create or overwrite.

    Returns:
        The number of review rows written (the header is not counted).
    """
    count = 0
    # UTF-8 on both sides so review text outside the platform's default code
    # page can be read and written.
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        # One writer and ONE header for the whole file. Creating them inside the
        # product loop repeated the header before every product's reviews.
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(attributes)

        # sorted() makes the row order independent of the directory listing order.
        for filename in sorted(os.listdir(homePath)):
            if not filename.endswith(".json"):
                continue

            file = homePath + "/" + filename
            with open(file, "r", encoding="utf-8") as f:
                data = json.load(f)

            # Progress: rows written so far, printed before this file's rows.
            print(f"{count} done")

            for product in data:
                reviews = product['reviews']
                # A product may carry a single review object instead of a list.
                if not isinstance(reviews, list):
                    reviews = [reviews]

                for item in reviews:
                    # Extract values for selected attributes
                    csv_writer.writerow([item.get(attr) for attr in attributes])
                    count += 1

    return count

# Review fields exported as CSV columns, in this order.
attributes_to_include = ["review_body", "review_topics", "review_votes", "sentiment", "review_helpfulness"]
# Destination CSV (absolute, machine-specific path).
# NOTE(review): the file is named "no_votes.csv", but the `review_votes == 0`
# filter inside jsonToCsv is commented out, so every review is exported.
output_filename = "C:/Hammad Aslam/BS IT (post ADP)/3rd Semester/Capstone Project/Project/backend/practice/graph/no_votes.csv"

# Run the export over every *.json file found in homePath.
jsonToCsv(attributes_to_include, output_filename)

print('done')

0 done
52 done
1790 done
2282 done
5510 done
6439 done
8571 done
10532 done
12253 done
12673 done
13389 done
14580 done
done


In [9]:
# Score the reviews of every JSON dataset in homePath in place, skipping
# "computers_laptops.json", and report progress after each processed file.
count1 = 0
for filename in os.listdir(homePath):
    # Only JSON datasets are of interest.
    if not filename.endswith(".json"):
        continue

    file = homePath + "/" + filename

    # This one dataset is deliberately left untouched.
    if filename == "computers_laptops.json":
        continue

    readFiles(file)
    count1 += 1
    print(f"{count1} done")

1 done
2 done
3 done
4 done
5 done
6 done
7 done
8 done
9 done
10 done
11 done
