In [4]:
import pandas as pd
import contractions
import re
from nltk.corpus import stopwords
from textblob import TextBlob
from nltk.tokenize import sent_tokenize
from nltk import pos_tag
from collections import Counter
import nltk

In [None]:
# Download the NLTK "popular" collection, then build the English stopword set
# that preprocess_text consults (a set keeps each membership check cheap).
nltk.download("popular")
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

In [6]:
# Preprocessing Function
def preprocess_text(review, separator=" "):
    """Normalise a raw review into a string of lowercase content words.

    Steps, in order: expand contractions, replace every non-word character
    and every digit with a space, lowercase, drop the words found in the
    module-level ``stop_words`` set, and join what is left with ``separator``.

    Args:
        review (str): Raw review text.
        separator (str): String placed between the surviving words. Defaults
            to a single space so that whitespace-based consumers such as
            ``str.split()`` get the bare words back. Pass ``', '`` to
            reproduce the earlier comma-separated layout.

    Returns:
        str: The cleaned text, or an empty string when no word survives.
    """
    # Expand contractions first, while the apostrophes are still present.
    expanded = contractions.fix(review)

    # One character class covers both clean-up passes (non-word chars, digits).
    # Note that \W does not match '_', so underscores are kept.
    stripped = re.sub(r'[\W\d]', ' ', expanded)

    # Lowercase before filtering: the stop_words membership test is
    # case-sensitive.
    lowered = stripped.lower()

    filtered_words = [word for word in lowered.split() if word not in stop_words]

    # Joining with ', ' used to glue a comma onto every token but the last,
    # so a later whitespace split saw "good," and "good" as different words
    # and over-counted unique words.
    return separator.join(filtered_words)


In [7]:
# Feature Extraction Function
def extract_features(cleaned_review):
    """Compute sentiment, vocabulary and part-of-speech statistics for a review.

    Args:
        cleaned_review (str): Review text to analyse (in this notebook, the
            output of ``preprocess_text``).

    Returns:
        dict: One value per feature, keyed by
            ``Sentiment`` / ``Subjectivity`` - TextBlob scores of the full text;
            ``Negative_Count`` - whitespace-separated tokens whose individual
            TextBlob polarity is below zero;
            ``Word_Count`` / ``Unique_Words`` - token and distinct-token counts;
            ``Noun_Count`` / ``Adjective_Count`` / ``Verb_Count`` /
            ``Adverb_Count`` - POS tag totals over all sentences;
            ``Authenticity`` - ``(Unique_Words - Negative_Count) / Word_Count``,
            or ``0.0`` when the review contains no words.
    """
    # Sentiment of the review as a whole
    review_sentiment = TextBlob(cleaned_review).sentiment
    polarity = review_sentiment.polarity
    subjectivity = review_sentiment.subjectivity

    # Negative word count: score each token on its own
    words = cleaned_review.split()
    neg = sum(1 for token in words if TextBlob(token).sentiment.polarity < 0)

    # Word statistics
    word_count = len(words)
    unique_words = len(set(words))

    # Part-of-speech tagging, accumulated sentence by sentence
    noun = adj = verb = adv = 0
    for sentence in sent_tokenize(cleaned_review):
        tag_counts = Counter(tag for _, tag in pos_tag(nltk.word_tokenize(sentence)))
        for tag, occurrences in tag_counts.items():
            # Substring matching groups the tag variants (e.g. NN/NNS/NNP);
            # it also means any tag merely containing 'RB', such as WRB,
            # counts as an adverb.
            if 'NN' in tag:
                noun += occurrences
            if 'JJ' in tag:
                adj += occurrences
            if 'VB' in tag:
                verb += occurrences
            if 'RB' in tag:
                adv += occurrences

    # Custom metric. An empty review has no words to divide by; report 0.0
    # instead of raising ZeroDivisionError.
    authenticity = (unique_words - neg) / word_count if word_count else 0.0

    return {
        'Sentiment': polarity,
        'Subjectivity': subjectivity,
        'Negative_Count': neg,
        'Word_Count': word_count,
        'Unique_Words': unique_words,
        'Noun_Count': noun,
        'Adjective_Count': adj,
        'Verb_Count': verb,
        'Adverb_Count': adv,
        'Authenticity': authenticity
    }

In [14]:
# File paths
input_file = r"E:\Post ADP\Capstone Project\Scrapping from Amazon\datasets\Cleaned_data.csv"
output_file = r"E:\Post ADP\Capstone Project\Scrapping from Amazon\datasets\Processed_data.csv"

# Read the dataset
df = pd.read_csv(input_file, encoding='latin1')

# Specify the columns to process
columns_to_process = ['Review Text', 'productTitle', 'Description', 'Features', 'Review Title', 'Category']

# Work out once which of the requested columns the file really has, instead
# of re-checking df.columns for every row.
available_columns = [col for col in columns_to_process if col in df.columns]

# Create a list to store the processed data (one dict per input row)
processed_data = []

# Process each review in the dataset
for index, row in df.iterrows():
    # Skip missing cells: str(NaN) is the literal text "nan", which would
    # otherwise be counted as a real word in every statistic.
    combined_review = " ".join(
        str(row[col]) for col in available_columns if pd.notna(row[col])
    )
    cleaned_review = preprocess_text(combined_review)

    # A row with no usable text keeps its position in the output but gets
    # empty feature cells; there is nothing to compute features from.
    features = extract_features(cleaned_review) if cleaned_review else {}
    features['Original_Review'] = combined_review
    features['Cleaned_Review'] = cleaned_review
    processed_data.append(features)

# Convert processed data into a DataFrame (keys absent from a row become NaN)
processed_df = pd.DataFrame(processed_data)

# Save the processed DataFrame to a new CSV file
processed_df.to_csv(output_file, index=False)

print(f"Processed data saved to {output_file}")

Processed data saved to E:\Post ADP\Capstone Project\Scrapping from Amazon\datasets\Processed_data.csv


In [2]:
# Single sample review (title + body) used by the Bag-of-Words demo.
data = {
    "Review Title": "Good quality",
    "Review Text": (
        "The charger is good. However, it should be narrow from the front "
        "so that it can fit in the power sockets."
    ),
}

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Two-document corpus: the review title first, then the review body
text_data = [data[field] for field in ("Review Title", "Review Text")]

# Bag of Words vectorizer with its default settings
vectorizer = CountVectorizer()

# Learn the vocabulary and count term occurrences in a single call
bow_features = vectorizer.fit_transform(text_data)

# Dense copy of the count matrix, easier to read when printed
bow_array = bow_features.toarray()

# Vocabulary terms, as reported by the fitted vectorizer
vocabulary = vectorizer.get_feature_names_out()

# Show the vocabulary, then the count matrix under its heading
print("Vocabulary:", vocabulary)
print("Bag of Words Representation:", bow_array, sep="\n")


Vocabulary: ['be' 'can' 'charger' 'fit' 'from' 'front' 'good' 'however' 'in' 'is' 'it'
 'narrow' 'power' 'quality' 'should' 'so' 'sockets' 'that' 'the']
Bag of Words Representation:
[[0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 2 1 1 0 1 1 1 1 3]]


In [None]:
import nltk
from nltk import pos_tag
from nltk.corpus import opinion_lexicon
from textblob import TextBlob

# Download required NLTK resources
nltk.download('opinion_lexicon')
nltk.download('averaged_perceptron_tagger')

# Read the cleaned reviews from the file, one review per line.
# The path is a raw string: in a normal string the backslashes before
# P, C, S and d are invalid escape sequences, which newer Python versions
# warn about and plan to reject.
with open(r"E:\Post ADP\Capstone Project\Scrapping from Amazon\datasets\Cleaned_data_tokenized.json", "r") as file:
    # Iterating the handle streams the lines; no intermediate readlines() list.
    reviews = [line.strip() for line in file]

# Opinion-lexicon word sets, filled on first use and reused afterwards.
_OPINION_WORD_SETS = {}


def _opinion_word_sets():
    """Return the (positive, negative) opinion-lexicon words as frozensets, built once."""
    if not _OPINION_WORD_SETS:
        _OPINION_WORD_SETS["positive"] = frozenset(opinion_lexicon.positive())
        _OPINION_WORD_SETS["negative"] = frozenset(opinion_lexicon.negative())
    return _OPINION_WORD_SETS["positive"], _OPINION_WORD_SETS["negative"]


# Define helper function to extract features
def extract_features_from_tokens(tokens):
    """Compute lexicon, part-of-speech and subjectivity features for one review.

    Args:
        tokens (list[str]): The words of a single review.

    Returns:
        dict: ``Unique Words``, ``Positive Words``, ``Negative Words``,
        ``Nouns``, ``Adjectives``, ``Adverbs``, ``Verbs``, ``Review Length``
        (number of tokens), ``Subjectivity`` (TextBlob score of the tokens
        joined by spaces) and ``Authenticity`` (positive minus negative word
        count).
    """
    # Part-of-speech tagging
    pos_tags = pos_tag(tokens)

    # Look the lexicons up once. Calling opinion_lexicon.positive()/negative()
    # inside the per-token condition re-fetched and scanned the whole lexicon
    # for every token.
    positive_lexicon, negative_lexicon = _opinion_word_sets()

    # Calculate features
    num_unique_words = len(set(tokens))  # Number of unique words
    positive_words = sum(1 for word in tokens if word in positive_lexicon)  # Positive words
    negative_words = sum(1 for word in tokens if word in negative_lexicon)  # Negative words
    num_nouns = sum(1 for word, pos in pos_tags if pos.startswith('NN'))  # Nouns
    num_adjectives = sum(1 for word, pos in pos_tags if pos.startswith('JJ'))  # Adjectives
    num_adverbs = sum(1 for word, pos in pos_tags if pos.startswith('RB'))  # Adverbs
    num_verbs = sum(1 for word, pos in pos_tags if pos.startswith('VB'))  # Verbs
    review_length = len(tokens)  # Length of the review
    subjectivity = TextBlob(" ".join(tokens)).sentiment.subjectivity  # Subjectivity score
    authenticity = positive_words - negative_words  # Simplified authenticity score

    # Return feature dictionary
    return {
        "Unique Words": num_unique_words,
        "Positive Words": positive_words,
        "Negative Words": negative_words,
        "Nouns": num_nouns,
        "Adjectives": num_adjectives,
        "Adverbs": num_adverbs,
        "Verbs": num_verbs,
        "Review Length": review_length,
        "Subjectivity": subjectivity,
        "Authenticity": authenticity
    }

# Imported here so this cell's script is self-contained.
import json

# Process each review: split the cleaned text into tokens, then extract features
features_list = [extract_features_from_tokens(review.split()) for review in reviews]

# Save features to a new file.
# The file is named .json, so write real JSON: a list with one object per
# review, carrying its 1-based review number. The earlier
# "Review N: {dict repr}" lines could not be read back by any JSON parser.
numbered_features = [
    {"Review": idx, **features}
    for idx, features in enumerate(features_list, start=1)
]
with open("review_features.json", "w", encoding="utf-8") as file:
    json.dump(numbered_features, file, indent=2)

# Print features
for idx, features in enumerate(features_list, start=1):
    print(f"Review {idx}: {features}")
