In [9]:
# Import libraries

import os
# Switch the working directory to the project root before anything else runs.
# NOTE(review): this absolute path is machine-specific; presumably the
# src.* imports below and the relative data/ paths rely on it - confirm.
os.chdir("D:/amazon-reviews") # Run always on the root to avoid problems

from importlib import reload
import pyarrow.parquet as pq
from pathlib import Path 
import spacy
import gc
import src.text.combine_columns as cc
import src.text.spacy_process as sp
import src.pipeline.spacy_for_embedding as sfe
from src.pipeline import merge_parquet
# Re-import the project modules so edits made on disk are picked up without
# restarting the kernel.
# NOTE(review): only cc and sfe are reloaded; sp and merge_parquet are not -
# confirm that is intended.
reload(cc)
reload(sfe)


<module 'src.pipeline.spacy_for_embedding' from 'D:\\amazon-reviews\\src\\pipeline\\spacy_for_embedding.py'>

In [2]:
# Force a full garbage-collection pass; the value displayed below the cell is
# the number of unreachable objects the collector found
gc.collect()

317

In [3]:
# Step 1 - Check final file

path = Path("data/processed/dataset_embedding.parquet")
# Open the file lazily: no row data is loaded until a row group is read
parquet_file = pq.ParquetFile(path)

print("File found!")

# Take the row count from the file metadata. scan_contents() returns the same
# number but has to read the contents of the whole file to produce it.
info = parquet_file.metadata.num_rows

# Read only the first row group (index of the row group, columns to read)
sample = parquet_file.read_row_group(0, columns=None)

# Slice the Arrow table first so only five rows are converted to pandas,
# instead of converting the whole row group and then discarding most of it
df = sample.slice(0, 5).to_pandas()


print(f"Number of rows: {info}")
print("Columns detected:", sample.column_names)
print("\nFirst five rows:")
df


File found!
Number of rows: 44235771
Columns detected: ['clean_review', 'clean_summary', 'reviewText', 'asin', 'overall', 'unixReviewTime', 'source', '__index_level_0__']

First five rows:


Unnamed: 0,clean_review,clean_summary,reviewText,asin,overall,unixReviewTime,source
0,this was the first time i read garcia-aguilera...,hit the spot,This was the first time I read Garcia-Aguilera...,60009810,5,1026864000,Electronics.json
1,as with all of ms garcia-aguilera's books i th...,one hot summer is hot hot hot,"As with all of Ms. Garcia-Aguilera's books, I ...",60009810,5,1025913600,Electronics.json
2,i've not read any of ms aguilera's works befor...,one hot summer,I've not read any of Ms Aguilera's works befor...,60009810,5,1025654400,Electronics.json
3,this romance novel is right up there with the ...,i love this book,This romance novel is right up there with the ...,60009810,4,1025395200,Electronics.json
4,carolina garcia aguilera has done it again she...,one hot book,Carolina Garcia Aguilera has done it again. S...,60009810,5,1025222400,Electronics.json


In [4]:
# Step 2 - Try Spacy

# Using spacy efficiency mode (sm according to documentation)
# Excluding pipeline components not needed for embeddings based on documentation
nlp = spacy.load("en_core_web_sm", disable=["ner", "parser", "textcat"])
print("SpaCy loaded successfully.")


# Just read the columns that are needed for embeddings
sample_2 = parquet_file.read_row_group(0, columns=["clean_summary", "clean_review"])
# Slice before converting so only five rows become pandas objects;
# sample_2 itself still holds the full row group for later cells
df_2 = sample_2.slice(0, 5).to_pandas()

print("\nSpacy document loading:")
df_2


SpaCy loaded successfully.

Spacy document loading:


Unnamed: 0,clean_summary,clean_review
0,hit the spot,this was the first time i read garcia-aguilera...
1,one hot summer is hot hot hot,as with all of ms garcia-aguilera's books i th...
2,one hot summer,i've not read any of ms aguilera's works befor...
3,i love this book,this romance novel is right up there with the ...
4,one hot book,carolina garcia aguilera has done it again she...


In [14]:
# Step 3 - Join review text and summary to avoid doing double processing

# function to join summary and review
def join_summary_review(summary, reviewText):
    """Join each summary with its review text into a single string per row.

    The two inputs are walked in parallel (zip), so the result has as many
    entries as the shorter input.

    Args:
        summary: iterable of summary texts (e.g. a pandas Series or a list).
        reviewText: iterable of review texts, aligned with ``summary``.

    Returns:
        list[str]: for each row, ``"<summary> <review>"`` when both have
        text, the single non-empty text when only one has text, and ``""``
        when neither has text.

    Notes:
        Values that are not strings (``None`` from Arrow nulls, ``NaN`` from
        pandas) are treated as missing text instead of raising ``TypeError``
        or ending up in the output.
    """
    combined = []

    for s, r in zip(summary, reviewText):
        # Keep only real, non-empty text, preserving summary-then-review order
        parts = [text for text in (s, r) if isinstance(text, str) and text != ""]

        # Zero parts -> "", one part -> that text, two parts -> "summary review"
        combined.append(" ".join(parts))

    return combined

In [26]:
# Step 3 - Test join function and Spacy processing

# Negation words that must survive stopword removal
stopwords_keep = ["no", "not", "never"]
# Symbols that are dropped from the output entirely
remove_symbs = ["-", "(", ")"]

# Summary and review are merged first so each row is processed only once
texts = join_summary_review(df_2['clean_summary'], df_2['clean_review'])

# One processed string per input row
outputs = []

for doc in nlp.pipe(texts):  # each doc is the result of processing one text

    # Tokens kept for this row, in order
    kept = []

    for token in doc:
        if "n't" in token.text:
            # Contracted negation (as in don't): emit "not" in place of the token
            kept.append("not")
        elif token.text in remove_symbs:
            # Unwanted symbol: contributes nothing
            pass
        elif token.is_stop and token.lemma_ not in stopwords_keep:
            # Stopword that is not on the keep list: contributes nothing
            pass
        else:
            # Everything else is stored as its lemma
            kept.append(token.lemma_)

    # Collapse the row's tokens into a single space-separated string
    outputs.append(" ".join(kept))


for output in outputs:
    print(output)



hit spot time read garcia aguilera come book live regis kelly book exactly look hit spot enjoy book write start book keep come culture family friendship romance look little romance pick book end turn right love main chartachter margarita aka daisy never miami way daisy tell story certainly feel go daisy peril close book feeling grow emotionally
hot summer hot hot hot ms garcia aguilera book think read impossible successful deviation past lupe solano series capture essence excitement local color diverse fabric miami sensual culturally enlighten
hot summer not read ms aguilera work have finish hot summer go check lupe solano series hear hot summer sooo steamy want miami not book
love book romance novel right rest amazing mystery novel guy little hesitant read romance novel book shot huge fan garcia aguilera book honest absolutely love book love way present funky miami crazy cubans not book book garcia aguilera superb job book not wait till book get to read book
hot book carolina garcia a

In [34]:
# Step 4 - Test join function from combine_columns module

# Same two columns as the notebook-local join, now through the version in cc
t = cc.join_summary_review(df_2["clean_summary"], df_2["clean_review"])

# Display the joined texts
t

["hit the spot this was the first time i read garcia-aguilera i came upon the name of this book on live with regis and kelly this book was exactly what i was looking for it hit the spot i really enjoyed this book because it was well written once i started this book it kept me coming back for more it had culture family friendship and romance i was looking for a little more romance when i picked this book but in the end it turned out to be just right i love the main chartachter margarita (aka daisy) i've never been to miami but the way daisy told the story i certainly felt i'd been there also after going through all of daisy's perils i closed the book with a feeling i had grown emotionally as well",
 "one hot summer is hot hot hot as with all of ms garcia-aguilera's books i think this is a must read impossible to put down successful deviation from past lupe solano series-captures the very essence of the excitement local color and diverse fabric of miami sensual and culturally enlightened

In [None]:
# Step 5 - Test spacy_processing function 
# Feed the joined texts from Step 4 (t) to the module's processing function
spacy_outputs = sp.spacy_processing(t)

# Display the processed texts
spacy_outputs

['hit spot time read garcia aguilera come book live regis kelly book exactly look hit spot enjoy book write start book keep come culture family friendship romance look little romance pick book end turn right love main chartachter margarita aka daisy never miami way daisy tell story certainly feel go daisy peril close book feeling grow emotionally',
 'hot summer hot hot hot ms garcia aguilera book think read impossible successful deviation past lupe solano series capture essence excitement local color diverse fabric miami sensual culturally enlighten',
 'hot summer not read ms aguilera work have finish hot summer go check lupe solano series hear hot summer sooo steamy want miami not book',
 'love book romance novel right rest amazing mystery novel guy little hesitant read romance novel book shot huge fan garcia aguilera book honest absolutely love book love way present funky miami crazy cubans not book book garcia aguilera superb job book not wait till book get to read book',
 'hot book

In [63]:
# Step 6 - Test the join reading straight from Arrow, without pandas, to avoid memory issues

# Use sample_2 for reference; to_pylist() turns each column into a plain Python list
# (sample_2 is the whole first row group, not just five rows)
sum_parquet= (sample_2["clean_summary"]).to_pylist()
rev_parquet= (sample_2["clean_review"]).to_pylist()

# Join columns to process together
testing = cc.join_summary_review(sum_parquet, rev_parquet)
testing


["hit the spot this was the first time i read garcia-aguilera i came upon the name of this book on live with regis and kelly this book was exactly what i was looking for it hit the spot i really enjoyed this book because it was well written once i started this book it kept me coming back for more it had culture family friendship and romance i was looking for a little more romance when i picked this book but in the end it turned out to be just right i love the main chartachter margarita (aka daisy) i've never been to miami but the way daisy told the story i certainly felt i'd been there also after going through all of daisy's perils i closed the book with a feeling i had grown emotionally as well",
 "one hot summer is hot hot hot as with all of ms garcia-aguilera's books i think this is a must read impossible to put down successful deviation from past lupe solano series-captures the very essence of the excitement local color and diverse fabric of miami sensual and culturally enlightened

In [None]:
# Step 6 - Test spacy_for_embeddings function
# Called with no arguments; the log it prints shows it searching for parquet files to process
sfe.embedding_parquet()

Looking for parquet files to process...
5 files found.


In [8]:
# Step 7 - Review output of embedding_parquet function with just one chunk

path2 = Path("data/processed/spacy/Electronics_spacy.parquet")
parquet_file2 = pq.ParquetFile(path2)
test_1 = parquet_file2.read_row_group(0)

# Preview only the first five rows: converting and printing the whole row
# group floods the notebook output and wastes memory
preview = test_1.slice(0, 5)
df_3 = preview.to_pandas()

# Processed text column as plain Python strings, for easy reading
check_test = preview["clean_embedding_text"].to_pylist()

check_test

['hit spot time read garcia   aguilera came book live regis kelly book exactly looking hit spot enjoyed book written started book kept coming culture family friendship romance looking little romance picked book end turned right love main chartachter margarita   aka daisy   never miami way daisy told story certainly felt going daisy perils closed book feeling grown emotionally',
 'hot summer hot hot hot ms garcia   aguilera books think read impossible successful deviation past lupe solano series   captures essence excitement local color diverse fabric miami sensual culturally enlightened',
 'hot summer not read ms aguilera works having finished hot summer going check lupe solano series heard hot summer sooo steamy want miami not book',
 'love book romance novel right rest amazing mystery novels guy little hesitant reading romance novel book shot huge fan garcia   aguilera books honest absolutely loved book love way presents funky miami crazy cubans not book books garcia   aguilera super

In [None]:
# Step 8 - Now that the files are processed, merge them into one parquet

merge_parquet.merge_par()
# It took 3 min

Creating file: dataset_embedding_spacy.parquet
Saving to parquet succesful
