In [1]:
import pandas as pd
import json
import datetime
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
import shutil #to get shell info
import sys #system interaction

In [2]:
# Download necessary NLTK data
# 'stopwords' provides the corpus read via stopwords.words('english') in the preprocessing functions.
# 'punkt' is presumably the tokenizer model behind sent_tokenize/word_tokenize -- confirm against the installed NLTK version.
# nltk.download() returns a status flag; the value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kimon\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kimon\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
def clearLine():
    """
    Blank out the current console line.

    Writes a carriage return, then twice the terminal width in spaces, then
    another carriage return, so the cursor ends up back at the start of the line.
    The width falls back to 80 columns when the terminal size cannot be determined.
    """
    width = shutil.get_terminal_size(fallback=(80,20)).columns  # only the column count matters here
    blanks = ' ' * width * 2  # deliberately two widths' worth of padding
    sys.stdout.write('\r' + blanks + '\r')
    sys.stdout.flush()  # push the padding out immediately

def logMessage(msg: str):
    """
    Print a timestamped status message that overwrites the current console line.

    The line is cleared first; the message is then printed with a trailing
    carriage return instead of a newline, so the next message replaces it.
    """
    clearLine()
    stamp = datetime.datetime.now()
    line = f"\r{stamp}\t{msg}"
    print(line, end='\r', flush=True)

In [4]:
def preprocess_text_no_sentences(text):
    """
    Preprocess a single review text into one flat, space-separated string.

    Steps, in order:
    - Lowercasing
    - Removing URLs (anything starting with "http" or "www.")
    - Removing ASCII punctuation (string.punctuation)
    - Tokenizing into sentences, then words
    - Dropping non-alphabetic tokens and English stopwords
    - Re-joining the remaining words with single spaces

    Parameters:
        text (str): raw review text.

    Returns:
        str: the cleaned text; empty string when no token survives.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs. The dot after "www" is escaped on purpose: an unescaped "."
    # matches any character, so "awww so cute" used to lose the word "so".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Remove special characters and punctuation
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)

    # Tokenize sentences. Punctuation is already gone at this point, so this
    # presumably yields a single sentence in most cases; the result is
    # re-joined below either way.
    sentences = sent_tokenize(text)

    # NLTK's English stopword list, as a set for fast membership tests
    stop_words = set(stopwords.words('english'))

    # Process each sentence
    processed_sentences = []
    for sentence in sentences:
        # Tokenize words
        words = word_tokenize(sentence)
        # Keep purely alphabetic tokens that are not stopwords
        filtered_words = [word for word in words if word.isalpha() and word not in stop_words]
        processed_sentences.append(' '.join(filtered_words))

    # Re-join sentences
    return ' '.join(processed_sentences)

In [5]:
def preprocess_text_sentences(text):
    """
    Preprocess text data at the sentence level.

    Steps, in order:
    - Lowercasing
    - Removing URLs (anything starting with "http" or "www.")
    - Tokenizing into sentences
    - Per sentence: removing ASCII punctuation, tokenizing into words,
      dropping non-alphabetic tokens and English stopwords
    - Keeping sentences separate; sentences left empty are discarded

    Parameters:
        text (str): raw review text.

    Returns:
        list[str]: one cleaned, space-joined string per non-empty sentence.
    """
    # Convert to lowercase
    text = text.lower()

    # Remove URLs. The dot after "www" is escaped on purpose: an unescaped "."
    # matches any character, so "awww so cute" used to lose the word "so".
    text = re.sub(r'http\S+|www\.\S+', '', text)

    # Tokenize into sentences (before punctuation is stripped, so sentence
    # boundaries are still visible to the tokenizer)
    sentences = sent_tokenize(text)

    # The filter below needs this set. Its definition had been commented out,
    # so the function only worked when a global `stop_words` happened to exist
    # in the kernel and raised NameError on a fresh run.
    stop_words = set(stopwords.words('english'))

    # Build the punctuation pattern once instead of once per sentence
    punctuation_pattern = f'[{re.escape(string.punctuation)}]'

    processed_sentences = []
    for sentence in sentences:
        # Remove special characters and punctuation
        sentence = re.sub(punctuation_pattern, '', sentence)

        # Tokenize words
        words = word_tokenize(sentence)

        # Keep purely alphabetic tokens that are not stopwords
        filtered_words = [word for word in words if word.isalpha() and word not in stop_words]

        # Check if sentence is not empty after preprocessing
        if filtered_words:
            processed_sentences.append(' '.join(filtered_words))

    return processed_sentences

In [6]:
# Compiled once at definition time; remove_emojis is called for every review and sentence.
# Only symbol/pictograph blocks are listed. A single catch-all range from U+24C2 to
# U+1F251 must NOT be used here: it spans CJK ideographs, kana, Hangul and fullwidth
# forms, and therefore deletes ordinary non-Latin text.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
    "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
    "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
    "\U0001FA00-\U0001FA6F"  # Chess Symbols
    "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
    "\U0001F000-\U0001F2FF"  # mahjong/dominoes/cards, enclosed alphanumerics (incl. flag letters), enclosed ideographs
    "\u24C2"                 # circled M
    "\u25A0-\u25FF"          # Geometric Shapes
    "\u2600-\u27BF"          # Miscellaneous Symbols + Dingbats
    "\u2934\u2935"           # curved arrows
    "\u2B00-\u2BFF"          # Miscellaneous Symbols and Arrows (e.g. star)
    "\u3030\u303D\u3297\u3299"  # the few emoji code points inside the CJK symbol blocks
    "\uFE00-\uFE0F"          # variation selectors (emoji presentation)
    "]+", flags=re.UNICODE)


def remove_emojis(text):
    """
    Remove emojis from the text by targeting specific Unicode blocks.

    Only symbol and pictograph ranges are stripped; letters of any script
    (including CJK, kana and Hangul) are left untouched.

    Parameters:
        text (str): input text.

    Returns:
        str: the text with every run of matched emoji characters deleted.
    """
    return _EMOJI_PATTERN.sub(r'', text)

In [7]:
def rating_to_sentiment(rating, max_rating):
    """
    Map a numeric rating to a sentiment label.

    Parameters:
        rating: anything float() accepts (number or numeric string).
        max_rating: top of the rating scale; only 5.0 and 10.0 are supported.

    Returns:
        'negative', 'neutral' or 'positive', or 'error' when the rating lies
        outside the scale or the scale is not supported.

        5-point scale:  [0, 2.5) negative, [2.5, 3.5] neutral, (3.5, 5] positive
        10-point scale: [0, 4]   negative, (4, 6)     neutral, [6, 10]  positive
    """
    score = float(rating)

    if max_rating == 5.0:
        if 0.0 <= score < 2.5:
            return 'negative'
        if 2.5 <= score <= 3.5:
            return 'neutral'
        if 3.5 < score <= 5.0:
            return 'positive'
        return 'error'

    if max_rating == 10.0:
        if 0.0 <= score <= 4.0:
            return 'negative'
        if 4.0 < score < 6.0:
            return 'neutral'
        if 6.0 <= score <= 10.0:
            return 'positive'
        return 'error'

    # Unsupported scale
    return 'error'

In [None]:
# Raw review dumps, resolved relative to the notebook's working directory.
f1=r".\\trusted_pilot_car_dealerships-20240207.json"
f2=r".\\boardgamegeek-20240211.json"
dfTP = pd.read_json(f1)
dfBG = pd.read_json(f2)

# Each row carries its reviews in the 'reviews' column; record how many there are.
for frame in (dfTP, dfBG):
    frame['review_count'] = frame['reviews'].apply(len)

tpProds = len(dfTP)  # number of TrustPilot businesses
tpRevs = len(dfBG)   # NOTE: despite the name, this is the number of BoardGameGeek games

print(f"TrustPilot\tBusinesses: {tpProds}\tTotal reviews: {dfTP['review_count'].sum()}")
print(f"BoardGameGeek\tGames: {tpRevs}\tTotal reviews: {dfBG['review_count'].sum()}")



TrustPilot	Businesses: 93	Total reviews: 225020
BoardGameGeek	Games: 250	Total reviews: 903916


In [9]:
# Build one record per TrustPilot business: sentiment tallies over all of its
# reviews plus the preprocessed variants of every review that has usable text.
tpProcessed = []
for index, row in dfTP.iterrows():
    nm = row['business_name']
    rc = row['review_count']
    rvs = row['reviews']
    rvws = []
    # Ratings that cannot be mapped to a sentiment are tallied under 'error'.
    tally = {'negative': 0, 'neutral': 0, 'positive': 0, 'error': 0}

    print(f"Processing {nm} with {rc} reviews")
    for rv in rvs:
        rvttl = rv['review_title']
        rvtxt = rv['review_text']
        rvrtg = rv['rating']
        rvsent = rating_to_sentiment(rvrtg, 5.0)  # TrustPilot ratings are scored against a 5-point scale
        if rvsent in ('negative', 'neutral', 'positive'):
            tally[rvsent] += 1
        else:
            tally['error'] += 1
            rvsent = ''  # unmapped ratings are stored with an empty sentiment

        # Fall back to the title when the review has no body text
        if not rvtxt:
            rvtxt = rvttl

        # Nothing to preprocess when both body and title are empty
        if not rvtxt:
            continue

        txt_no_sntc = preprocess_text_no_sentences(rvtxt)
        txt_no_sntc_no_emoji = remove_emojis(txt_no_sntc)
        txt_sntc = preprocess_text_sentences(rvtxt)
        txt_sntc_no_emoji = [remove_emojis(sentence) for sentence in txt_sntc]
        rvws.append({
            'review_rating': rvrtg,
            'review_title': rvttl,
            'review_text_raw': rvtxt,
            'review_sentiment': rvsent,
            'review_text_preprocessed': txt_no_sntc,
            'review_text_preprocessed_noemoji': txt_no_sntc_no_emoji,
            'review_text_preprocessed_sentence': txt_sntc,
            'review_text_preprocessed_sentence_noemoji': txt_sntc_no_emoji,
        })
        logMessage(f"Processing {len(rvws)} of {rc}. Rating of {rvrtg} translated to {rvsent}")

    clearLine()
    tpProcessed.append({
        'name': nm,
        'review_count': rc,
        'negative_reviews': tally['negative'],
        'neutral_reviews': tally['neutral'],
        'positive_reviews': tally['positive'],
        'reviews': rvws,
    })



Processing Impex Auto Sales with 1 reviews
Processing Beep Auto with 1 reviews                                                                                                                             
Processing Len Lyall Chevrolet  with 1 reviews                                                                                                                  
Processing Princeton Porsche with 1 reviews                                                                                                                     
Processing CAS Auto LLC with 1 reviews                                                                                                                          
Processing eCarsCash with 1 reviews                                                                                                                             
Processing Cash For Cars - We Come To You LLC with 1 reviews                                                                                            

In [10]:
# Sanity check: the processing loop appends one record per row of dfTP, so this is the number of businesses processed.
print(len(tpProcessed))

93


In [11]:
# Width of the longest business name, used to align the columns (0 when there are no records).
maxlen = max((len(rec['name']) for rec in tpProcessed), default=0)

# One aligned summary line per business.
for r in tpProcessed:
    name = r['name']
    pad = ' ' * (maxlen - len(name))
    print(f"name: {name}{pad}\treviews: {r['review_count']}\tnegatives: {r['negative_reviews']}\tneutrals: {r['neutral_reviews']}\tpositives: {r['positive_reviews']}")

name: Impex Auto Sales                        	reviews: 1	negatives: 1	neutrals: 0	positives: 0
name: Beep Auto                               	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Len Lyall Chevrolet                     	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Princeton Porsche                       	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: CAS Auto LLC                            	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: eCarsCash                               	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Cash For Cars - We Come To You LLC      	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Kelly Nissan of Lynnfield               	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Columbine Ford                          	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Auto Mart                               	reviews: 1	negatives: 0	neutrals: 0	positives: 1
name: Southern Auto Mart                

In [12]:
# Persist the processed TrustPilot records as pretty-printed UTF-8 JSON;
# the file name carries today's date as YYYYMMDD.
stamp = datetime.datetime.now().strftime('%Y%m%d')
filename = f".\\trust_pilot_car_dealership_preprocessed-{stamp}.json"
with open(filename, 'w', encoding='utf-8', newline='\n') as file:
    # ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them
    json.dump(tpProcessed, file, indent=4, ensure_ascii=False)

In [13]:
# Drop the references to the TrustPilot data by rebinding both names to empty lists.
# Note that dfTP is no longer a DataFrame after this cell runs.
tpProcessed, dfTP = [], []

In [14]:
# Build one record per BoardGameGeek game: sentiment tallies over all of its
# reviews plus the preprocessed variants of every review that has text.
bggProcessed = []
for index, row in dfBG.iterrows():
    nm = row['game_name'][0]  # first element of 'game_name'; presumably a list of titles -- confirm against the dump
    rc = row['review_count']
    rvs = row['reviews']
    rvws = []
    # Ratings that cannot be mapped to a sentiment are tallied under 'error'.
    tally = {'negative': 0, 'neutral': 0, 'positive': 0, 'error': 0}

    print(f"Processing {nm} with {rc} reviews")
    for rv in rvs:
        rvttl = ''  # no title is read for these reviews; the field is kept for a uniform schema
        rvtxt = rv['review_text']
        rvrtg = rv['rating']
        rvsent = rating_to_sentiment(rvrtg, 10.0)  # BoardGameGeek ratings are scored against a 10-point scale
        if rvsent in ('negative', 'neutral', 'positive'):
            tally[rvsent] += 1
        else:
            tally['error'] += 1
            rvsent = ''  # unmapped ratings are stored with an empty sentiment

        # Reviews without text still count towards the tallies but are not stored
        if not rvtxt:
            continue

        txt_no_sntc = preprocess_text_no_sentences(rvtxt)
        txt_no_sntc_no_emoji = remove_emojis(txt_no_sntc)
        txt_sntc = preprocess_text_sentences(rvtxt)
        txt_sntc_no_emoji = [remove_emojis(sentence) for sentence in txt_sntc]
        rvws.append({
            'review_rating': rvrtg,
            'review_title': rvttl,
            'review_text_raw': rvtxt,
            'review_sentiment': rvsent,
            'review_text_preprocessed': txt_no_sntc,
            'review_text_preprocessed_noemoji': txt_no_sntc_no_emoji,
            'review_text_preprocessed_sentence': txt_sntc,
            'review_text_preprocessed_sentence_noemoji': txt_sntc_no_emoji,
        })
        logMessage(f"Processing {len(rvws)} of {rc}. Rating of {rvrtg} translated to {rvsent}")

    clearLine()
    bggProcessed.append({
        'name': nm,
        'review_count': rc,
        'negative_reviews': tally['negative'],
        'neutral_reviews': tally['neutral'],
        'positive_reviews': tally['positive'],
        'reviews': rvws,
    })



Processing CATAN with 15659 reviews
Processing Carcassonne with 15182 reviews                                                                                                                       
Processing Pandemic with 13446 reviews                                                                                                                          
Processing 7 Wonders with 11134 reviews                                                                                                                         
Processing Terraforming Mars with 8799 reviews                                                                                                                  
Processing 7 Wonders Duel with 7478 reviews                                                                                                                     
Processing Dominion with 10895 reviews                                                                                                                         

In [15]:

# Sanity check: the processing loop appends one record per row of dfBG, so this is the number of games processed.
print(len(bggProcessed))


250


In [16]:
# Width of the longest game name, used to align the columns (0 when there are no records).
maxlen = max((len(rec['name']) for rec in bggProcessed), default=0)

# One aligned summary line per game.
for r in bggProcessed:
    name = r['name']
    pad = ' ' * (maxlen - len(name))
    print(f"name: {name}{pad}\treviews: {r['review_count']}\tnegatives: {r['negative_reviews']}\tneutrals: {r['neutral_reviews']}\tpositives: {r['positive_reviews']}")

name: CATAN                                                                 	reviews: 15659	negatives: 1204	neutrals: 1360	positives: 13095
name: Carcassonne                                                           	reviews: 15182	negatives: 517	neutrals: 793	positives: 13872
name: Pandemic                                                              	reviews: 13446	negatives: 557	neutrals: 660	positives: 12229
name: 7 Wonders                                                             	reviews: 11134	negatives: 413	neutrals: 494	positives: 10227
name: Terraforming Mars                                                     	reviews: 8799	negatives: 468	neutrals: 346	positives: 7985
name: 7 Wonders Duel                                                        	reviews: 7478	negatives: 186	neutrals: 200	positives: 7092
name: Dominion                                                              	reviews: 10895	negatives: 551	neutrals: 635	positives: 9709
name: Wingspan                       

In [17]:
# Persist the processed BoardGameGeek records as pretty-printed UTF-8 JSON;
# the file name carries today's date as YYYYMMDD.
stamp = datetime.datetime.now().strftime('%Y%m%d')
filename = f".\\boardgamegeek_preprocessed-{stamp}.json"
with open(filename, 'w', encoding='utf-8', newline='\n') as file:
    # ensure_ascii=False keeps non-ASCII characters readable instead of \u-escaping them
    json.dump(bggProcessed, file, indent=4, ensure_ascii=False)

In [None]:
# Drop the references to the BoardGameGeek data by rebinding both names to empty lists.
# Note that dfBG is no longer a DataFrame after this cell runs.
bggProcessed, dfBG = [], []