In [1]:
import os
import json
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import sentiwordnet as swn
from collections import Counter
from nltk import pos_tag
from sklearn.utils import shuffle
import string
import nltk
import pandas as pd
from textstat import flesch_reading_ease
import editdistance
nltk.download('sentiwordnet')

[nltk_data] Downloading package sentiwordnet to
[nltk_data]     C:\Users\jesse\AppData\Roaming\nltk_data...
[nltk_data]   Package sentiwordnet is already up-to-date!


True

In [6]:
# Move into the folder that the following cells read from and write to
# (they open 'output.json' and save the df_*.json files by relative path).
working_dir = 'C:/Users/jesse/OneDrive/Documenten/Thesis/amazon_code/dataframes_done'
os.chdir(working_dir)

In [7]:
# Read the review records from the JSON file in the working directory.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm it matches how 'output.json' was written.
with open('output.json', 'r') as review_file:
    reviews = json.load(review_file)

# Tabular view of the same records (used later for the 'Label' column)
reviews_df = pd.DataFrame(reviews)

In [22]:
# Quantity
# Per-review surface counts: words, sentences, modifiers, capitals,
# punctuation and part-of-speech totals.

# Loop-invariant vocabularies, defined once instead of on every iteration.
modifiers = ['very', 'extremely', 'quite', 'somewhat', 'really']
modifier_set = set(modifiers)
pos_tag_groups = {
    'Number_of_nouns': ('NN', 'NNS', 'NNP', 'NNPS'),
    'Number_of_verbs': ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),
    'Number_of_adjectives': ('JJ', 'JJR', 'JJS'),
    'Number_of_adverbs': ('RB', 'RBR', 'RBS'),
}

result_quantity = []

for review in reviews:
    text = review['Review_Text']
    # Number of words (whitespace-separated tokens)
    words = text.split()
    num_words = len(words)
    # Number of sentences: use the sentence tokenizer (as the complexity cell
    # does) instead of text.split('.'), which counts the empty piece after a
    # final period and splits inside decimals and ellipses.
    num_sentences = len(sent_tokenize(text))
    # Number of modifiers: compare whole tokens (case-folded, surrounding
    # punctuation stripped) so that e.g. "every" is not counted as "very".
    normalized_words = (word.strip(string.punctuation).lower() for word in words)
    num_modifiers = sum(1 for word in normalized_words if word in modifier_set)
    # Number of caps
    num_caps = sum(1 for c in text if c.isupper())
    # Number of punctuation characters
    num_punctuation = sum(1 for c in text if c in string.punctuation)
    # Part of speech: tag once, then total the tags of each group
    tag_counts = Counter(tag for _, tag in nltk.pos_tag(words))

    # linguistic features results (key order defines the column order)
    features = {
        'Number_of_words': num_words,
        'Number_of_sentences': num_sentences,
        'Number_of_modifiers': num_modifiers,
        'Number_of_caps': num_caps,
        'Number_of_punctuation': num_punctuation,
    }
    for column, tags in pos_tag_groups.items():
        features[column] = sum(tag_counts[tag] for tag in tags)
    result_quantity.append(features)

# Result dataframe
df_qua_senti = pd.DataFrame(result_quantity)

# Dataframe to Json file
df_qua_senti.to_json('df_qua_senti.json', orient='records')

In [8]:
# Complexity

# Redundancy function
def calculate_redundancy(text):
    """Return a redundancy score for ``text`` based on word edit distances.

    The text is split on whitespace. The score is
    ``1 - (mean pairwise Levenshtein distance) / (length of the longest word)``,
    where the mean is taken over all unordered pairs of word positions.

    Args:
        text: The review text to score.

    Returns:
        The redundancy score. A text with exactly one word has no pairs, so
        its mean distance is 0 and the score is 1. A text with no words at
        all returns 0.0 (previously this raised ``ValueError`` from ``max``
        on an empty list).
    """
    from itertools import combinations

    words = text.split()
    if not words:
        # Nothing to compare and no longest word to normalise by.
        return 0.0

    n = len(words)
    if n > 1:
        # Sum of Levenshtein distances over every unordered pair of words
        total_distance = sum(editdistance.eval(w1, w2) for w1, w2 in combinations(words, 2))
        # Average over the n * (n - 1) / 2 pairs
        average_distance = total_distance / (n * (n - 1) / 2)
    else:
        average_distance = 0

    # Normalise by the longest word so the ratio is comparable across texts
    longest_word_length = max(len(word) for word in words)
    return 1 - average_distance / longest_word_length

results_complexity = []

for review in reviews:
    text = review['Review_Text']
    # Whitespace-separated words and tokenizer-detected sentences
    words = text.split()
    num_words = len(words)
    sentences = sent_tokenize(text)
    num_sentences = len(sentences)
    # Average word length in characters; 0.0 for a review without words
    # (the unguarded division raised ZeroDivisionError on empty text).
    total_word_length = sum(len(word) for word in words)
    avg_word_length = total_word_length / num_words if num_words else 0.0
    # Average sentence length, measured in characters per sentence;
    # 0.0 when no sentence was found.
    total_sentence_length = sum(len(sent) for sent in sentences)
    avg_sentence_length = total_sentence_length / num_sentences if num_sentences else 0.0
    # Redundance score; skipped for word-less text, where there is nothing
    # to compare.
    redundancy = calculate_redundancy(text) if words else 0.0
    # Readability score
    # NOTE(review): behaviour of flesch_reading_ease on empty text is not
    # checked here -- confirm against the installed textstat version.
    readability_score = flesch_reading_ease(text)

    # linguistic features results
    results_complexity.append({
        'Average_word_length': avg_word_length,
        'Average_sentence_length': avg_sentence_length,
        'Redundancy_score': redundancy,
        'Readability_score': readability_score,
    })

# Result dataframe
df_com_senti = pd.DataFrame(results_complexity)

# Dataframe to Json file
df_com_senti.to_json('df_com_senti.json', orient='records')

In [71]:
# Diversity
results_diversity = []

for review in reviews:
    text = review['Review_Text']
    words = text.split()
    num_words = len(words)
    # Lexical diversity = distinct tokens / total tokens. Tokens are compared
    # exactly as split (case and attached punctuation are kept). A review
    # with no tokens gets 0.0 instead of raising ZeroDivisionError.
    unique_words = set(words)
    lexical_diversity = len(unique_words) / num_words if num_words else 0.0

    # Linguistic features results
    results_diversity.append({
        'Lexical_diversity': lexical_diversity
    })

# Results dataframe
df_div_senti = pd.DataFrame(results_diversity)

# Dataframe to Json file
df_div_senti.to_json('df_div_senti.json', orient='records')

In [9]:
# Emotion

# Sentiment score function per word
def get_sentiment(word):
    """Return the polarity of ``word`` taken from its first WordNet synset.

    The word is looked up in WordNet; when no synset is found the result is
    0.0. Otherwise the SentiWordNet entry for the first synset is fetched by
    name and its positive score minus its negative score is returned.
    """
    candidates = nltk.corpus.wordnet.synsets(word)
    if candidates:
        # Only the first listed synset is consulted
        entry = swn.senti_synset(candidates[0].name())
        return entry.pos_score() - entry.neg_score()
    return 0.0

# Sentiment score function per sentence
def get_sentiment_score(sentence):
    """Return the sum of ``get_sentiment`` over the word tokens of ``sentence``."""
    total = 0
    for token in nltk.word_tokenize(sentence):
        total += get_sentiment(token)
    return total

# Intensity score function
def get_intensity_score(sentence):
    """Return the sum of squared ``get_sentiment`` values over the tokens of ``sentence``.

    Squaring removes the sign, so positive and negative words both add to
    the total.
    """
    total = 0
    for token in nltk.word_tokenize(sentence):
        polarity = get_sentiment(token)
        total += polarity ** 2
    return total

# Ambiguity score function
def get_ambiguity_score(sentence):
    """Return how many tokens of ``sentence`` have more than one WordNet synset."""
    ambiguous = 0
    for token in nltk.word_tokenize(sentence):
        if len(nltk.corpus.wordnet.synsets(token)) > 1:
            ambiguous += 1
    return ambiguous

# Total number of positive, negative, and objective words in a sentence
def count_words(sentence):
    """Count the tokens of ``sentence`` by the sign of their sentiment.

    Returns:
        A ``(positive, negative, objective)`` tuple: tokens whose
        ``get_sentiment`` value is above zero, below zero, or neither.
    """
    tally = {'pos': 0, 'neg': 0, 'obj': 0}
    for token in nltk.word_tokenize(sentence):
        polarity = get_sentiment(token)
        if polarity > 0:
            bucket = 'pos'
        elif polarity < 0:
            bucket = 'neg'
        else:
            bucket = 'obj'
        tally[bucket] += 1
    return tally['pos'], tally['neg'], tally['obj']

# Main function
def analyze_review(reviews):
    """Compute all emotion features for one review record.

    Args:
        reviews: A single review mapping; its 'Review_Text' entry is analysed.

    Returns:
        A 6-tuple ``(sentiment_score, intensity_score, ambiguity_score,
        pos_words, neg_words, obj_words)``.
    """
    text = reviews['Review_Text']
    scores = (
        get_sentiment_score(text),
        get_intensity_score(text),
        get_ambiguity_score(text),
    )
    # count_words supplies the last three values
    return (*scores, *count_words(text))

# Output keys, in the order analyze_review returns its values.
# The trailing colons are part of the key names and are used as-is
# by the normalisation cell further down.
emotion_keys = (
    'Sentiment_score',
    'Review_intensity',
    'Review_ambiguity',
    'Number_of_positive words:',
    'Number_of_negative words:',
    'Number_of_objective words:',
)

results_emotion = []

for review in reviews:
    # Pair each key with the matching value of the 6-tuple
    results_emotion.append(dict(zip(emotion_keys, analyze_review(review))))

# Results dataframe
df_emo_senti = pd.DataFrame(results_emotion)

# Dataframe to Json file
df_emo_senti.to_json('df_emo_senti.json', orient='records')

In [10]:
# Emotion v2

# Sentiment score per word function
def calculate_sentiment_score(word):
    """Return ``(pos_score, neg_score)`` of the first SentiWordNet entry for ``word``.

    When SentiWordNet has no entry for the word, ``(0, 0)`` is returned.
    """
    matches = list(swn.senti_synsets(word))
    if matches:
        top = matches[0]
        return top.pos_score(), top.neg_score()
    return 0, 0

# Iterate through reviews and count, per review, how many words fall into
# each positive / negative score bucket.

# Score value -> column suffix, shared by both polarities. The previous
# if/elif chain compared pos_score with 0.1 instead of 1 for the last bucket,
# so 'pos_1' was never incremented; one table for both sides removes that
# mismatch.
score_buckets = {
    0.125: '125',
    0.25: '25',
    0.375: '375',
    0.5: '5',
    0.625: '625',
    0.75: '75',
    0.875: '875',
    1.0: '1',
}

results_emotion_2 = []
for review in reviews:
    text = review['Review_Text']
    # Start every bucket at zero; insertion order (pos_x, neg_x, ...) fixes
    # the column order of the resulting dataframe.
    counts = {}
    for suffix in score_buckets.values():
        counts['pos_' + suffix] = 0
        counts['neg_' + suffix] = 0

    for word in text.split():
        pos_score, neg_score = calculate_sentiment_score(word)
        # A score of 0 (or any value not in the table) is simply not counted
        for polarity, score in (('pos', pos_score), ('neg', neg_score)):
            suffix = score_buckets.get(score)
            if suffix is not None:
                counts[polarity + '_' + suffix] += 1

    results_emotion_2.append(counts)

# Results dataframe
df_emo_2_senti = pd.DataFrame(results_emotion_2)

# Dataframe to Json file
df_emo_2_senti.to_json('df_emo_2_senti.json', orient='records')

In [6]:
# Switch to the folder the saved feature tables are read from.
# NOTE(review): an earlier cell changes into '.../dataframes_done' before the
# df_*.json files are written, while this cell reads them from
# '.../dataframes' -- confirm the files in this folder are the intended ones.
os.chdir('C:/Users/jesse/OneDrive/Documenten/Thesis/amazon_code/dataframes')

# Reload each feature table from disk, in a fixed order
feature_files = (
    'df_qua_senti.json',
    'df_com_senti.json',
    'df_div_senti.json',
    'df_emo_senti.json',
    'df_emo_2_senti.json',
)
df_qua_senti, df_com_senti, df_div_senti, df_emo_senti, df_emo_2_senti = map(pd.read_json, feature_files)

# Final dataframe: the label column followed by every feature table,
# placed side by side (rows are matched on the index)
feature_tables = [reviews_df['Label'], df_qua_senti, df_com_senti, df_div_senti, df_emo_senti, df_emo_2_senti]
senti_df = pd.concat(feature_tables, axis=1)

# Dataframe to a Json file
senti_df.to_json('senti_df.json', orient='records')

Unnamed: 0,Label,Sentiment_score,Review_intensity,Review_ambiguity,Number_of_positive words:,Number_of_negative words:,Number_of_objective words:,pos_125,neg_125,pos_25,...,pos_5,neg_5,pos_625,neg_625,pos_75,neg_75,pos_875,neg_875,pos_1,neg_1
0,__label1__,1.375,0.796875,15,3,0,24,2,1,0,...,0,0,2,0,0,0,0,0,0,0
1,__label1__,1.0,1.03125,52,9,3,68,3,3,3,...,0,0,0,1,0,0,0,0,0,0
2,__label1__,4.25,2.375,38,10,0,56,1,6,3,...,3,0,1,0,1,0,1,0,0,0
3,__label1__,-0.75,0.625,30,5,5,41,2,3,2,...,0,0,0,1,0,0,0,0,0,0
4,__label1__,0.0,0.65625,44,2,1,74,1,0,0,...,1,0,0,1,0,0,0,0,0,0


In [75]:
# Give the raw label codes readable names (this overwrites the 'Label' column)
label_names = {'__label1__': 'fake', '__label2__': 'real'}
senti_df['Label'] = senti_df['Label'].replace(label_names)

# Descriptive statistics per label, transposed so each label is a column
per_label = senti_df.groupby('Label')
statistics_senti = per_label.describe().T

# Show the table and keep a copy as a spreadsheet
print(statistics_senti)

statistics_senti.to_excel('statistics_senti.xlsx')

Label                 fake          real
DOC_ID count  10500.000000  10500.000000
       mean    5250.500000  15750.500000
       std     3031.233247   3031.233247
       min        1.000000  10501.000000
       25%     2625.750000  13125.750000
...                    ...           ...
neg_1  min        0.000000      0.000000
       25%        0.000000      0.000000
       50%        0.000000      0.000000
       75%        0.000000      0.000000
       max        3.000000      2.000000

[296 rows x 2 columns]


In [6]:
from sklearn.preprocessing import MinMaxScaler

# Columns to rescale, listed by feature family; the overall order is the
# order in which they are passed to the scaler.
# NOTE(review): 'Number_of_modifiers', 'Readability_score' and
# 'Sentiment_score' are produced by earlier cells but are not listed here --
# confirm that leaving them unscaled is intended.
quantity_cols = ['Number_of_words', 'Number_of_sentences', 'Number_of_caps',
                 'Number_of_punctuation', 'Number_of_nouns', 'Number_of_verbs',
                 'Number_of_adjectives', 'Number_of_adverbs']
complexity_cols = ['Average_word_length', 'Average_sentence_length', 'Redundancy_score']
diversity_cols = ['Lexical_diversity']
emotion_cols = ['Review_intensity', 'Review_ambiguity',
                'Number_of_positive words:', 'Number_of_negative words:',
                'Number_of_objective words:']
score_bucket_cols = ['pos_125', 'neg_125', 'pos_25', 'neg_25',
                     'pos_375', 'neg_375', 'pos_5', 'neg_5',
                     'pos_625', 'neg_625', 'pos_75', 'neg_75',
                     'pos_875', 'neg_875', 'pos_1', 'neg_1']
columns_to_normalize = (quantity_cols + complexity_cols + diversity_cols
                        + emotion_cols + score_bucket_cols)

# Fit the min-max scaler on the selected columns and overwrite them in place
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(senti_df[columns_to_normalize])
senti_df[columns_to_normalize] = scaled_values

# Dataframe to Json file
senti_df.to_json('senti_df_norm.json', orient='records')
print(senti_df.head())

   Label  Number_of_words  Number_of_sentences  Number_of_caps  \
0      0         0.002822             0.010152        0.001144   
1      0         0.020106             0.015228        0.002859   
2      0         0.014109             0.030457        0.003431   
3      0         0.010582             0.020305        0.004002   
4      0         0.019400             0.015228        0.002859   

   Number_of_punctuation  Number_of_nouns  Number_of_verbs  \
0               0.005357         0.005908         0.009615   
1               0.012500         0.026588         0.025000   
2               0.017857         0.013294         0.023077   
3               0.008929         0.013294         0.019231   
4               0.010714         0.023634         0.019231   

   Number_of_adjectives  Number_of_adverbs  Average_word_length  ...  \
0              0.007663           0.009174             0.123009  ...   
1              0.038314           0.022936             0.200208  ...   
2             