In [2]:
# Test file explore_data
import os
# Switch the working directory to the project root before anything else runs:
# every data path used below ("data/raw/...") is relative to it, and presumably
# the `src.*` imports are resolved from it as well.
# NOTE(review): this is a hard-coded absolute Windows path, so the notebook only
# runs on a machine that has this exact directory — consider making it configurable.
os.chdir("D:/amazon-reviews") # Run always on the root to avoid problems


from importlib import reload
from pathlib import Path

import pandas as pd
from langdetect import DetectorFactory, detect

from src.features.select_columns import split_columns  # import function
from src.text.lang_detection import detect_lan, is_english
import src.text.clean_text as ct
# Re-import src.text.clean_text so edits saved to the module file take effect
# in this kernel without a restart.
reload(ct)

<module 'src.text.clean_text' from 'D:\\amazon-reviews\\src\\text\\clean_text.py'>

In [3]:
# Step 1: Function to load only the first few rows of a file, so the data set can be explored without reading it all

def preview_columns(file_path, n=200):
    """
    Read only the first ``n`` records of a JSON-lines file.

    Reading a small slice instead of the whole file avoids memory errors
    on large datasets while still showing the columns and sample values.

    Parameters
    ----------
    file_path : str or Path
        Location of the JSON-lines file (one JSON object per line).
    n : int, default 200
        Number of rows to read from the top of the file.

    Returns
    -------
    pandas.DataFrame or None
        The first ``n`` rows, or ``None`` (after printing a message) when
        ``file_path`` does not exist.
    """
    source = Path(file_path)

    if source.exists():
        print(f"Previewing first {n} rows from: {source.name}")
        return pd.read_json(source, lines=True, nrows=n)

    # Missing file: report it and let the caller decide what to do.
    print(f"Path {source} not found.")
    return None


df_preview = preview_columns("data/raw/Electronics.json.gz")

# preview_columns() reports a missing file by returning None. Stop here with an
# explicit error rather than letting `.head()` (and the later language-detection
# step, which reuses df_preview) fail with an unrelated AttributeError.
if df_preview is None:
    raise FileNotFoundError("data/raw/Electronics.json.gz")

# Last expression of the cell: rendered as a table by the notebook.
df_preview.head()

#print(df_preview.columns)

Previewing first 200 rows from: Electronics.json.gz


Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote
0,5,True,"07 17, 2002",A1N070NS9CJQ2I,60009810,{'Format:': ' Hardcover'},Teri Adams,This was the first time I read Garcia-Aguilera...,Hit The Spot!,1026864000,
1,5,False,"07 6, 2002",A3P0KRKOBQK1KN,60009810,{'Format:': ' Hardcover'},Willa C.,"As with all of Ms. Garcia-Aguilera's books, I ...",one hot summer is HOT HOT HOT!,1025913600,
2,5,False,"07 3, 2002",A192HO2ICJ75VU,60009810,{'Format:': ' Hardcover'},Kit,I've not read any of Ms Aguilera's works befor...,One Hot Summer,1025654400,2.0
3,4,False,"06 30, 2002",A2T278FKFL3BLT,60009810,{'Format:': ' Hardcover'},Andres,This romance novel is right up there with the ...,I love this book!,1025395200,3.0
4,5,False,"06 28, 2002",A2ZUXVTW8RXBXW,60009810,{'Format:': ' Hardcover'},John,Carolina Garcia Aguilera has done it again. S...,One Hot Book,1025222400,


In [4]:
# Step 2: list the column names of every raw file (only a few rows of each are read)

data_dir = Path("data/raw")

# Path.glob() yields matches lazily; sorted() turns each group into an ordered
# list. Compressed files come first, then plain .json files.
files = [
    match
    for pattern in ("*.json.gz", "*.json")
    for match in sorted(data_dir.glob(pattern))
]

print(f"{len(files)} files found\n")

for raw_file in files:
    # 50 rows are enough to discover the column names without loading the file.
    head_rows = pd.read_json(raw_file, lines=True, nrows=50)
    print(f"{raw_file.name} has {head_rows.shape[1]} columns")
    print(sorted(head_rows.columns))
    print(" ")



4 files found

Electronics.json.gz has 11 columns
['asin', 'overall', 'reviewText', 'reviewTime', 'reviewerID', 'reviewerName', 'style', 'summary', 'unixReviewTime', 'verified', 'vote']
 
Industrial_and_Scientific.json.gz has 10 columns
['asin', 'overall', 'reviewText', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime', 'verified', 'vote']
 
Musical_Instruments.json.gz has 11 columns
['asin', 'overall', 'reviewText', 'reviewTime', 'reviewerID', 'reviewerName', 'style', 'summary', 'unixReviewTime', 'verified', 'vote']
 
Video_Games.json.gz has 10 columns
['asin', 'overall', 'reviewText', 'reviewTime', 'reviewerID', 'reviewerName', 'summary', 'unixReviewTime', 'verified', 'vote']
 


In [5]:
# Step 3: test the column separation function

# Read a small sample of one dataset and keep its column index
sample_file = "data/raw/Musical_Instruments.json.gz"
df_test = pd.read_json(sample_file, lines=True, nrows=50)
cols = df_test.columns

# Columns exactly as they appear in the file
print("Dataset columns:", cols, sep="\n")

# Same columns after grouping them with split_columns()
print("\nFunction split_columns():")
print(split_columns(cols))


Dataset columns:
Index(['overall', 'vote', 'verified', 'reviewTime', 'reviewerID', 'asin',
       'style', 'reviewerName', 'reviewText', 'summary', 'unixReviewTime'],
      dtype='object')

Function split_columns():
{'text': ['reviewText', 'summary'], 'context': ['asin', 'overall', 'reviewTime', 'unixReviewTime', 'reviewerID', 'reviewerName', 'verified', 'vote', 'style']}


In [6]:
# Step 4: Test data cleaning

# Inputs covering extra whitespace, upper case, punctuation runs, emojis,
# a URL, a missing value (None) and text that needs almost no cleaning.
test_cases = [
    "  EASY TO UNDERSTAND AND A PROMPT    SERVICE TOO	 !!!!!!!!!!! 😍😍 ",
    "💀💀 URL TeStInG: www.ilovepdf.com  i love it 🐍 so much let's test it out ",
    None,
    "  Already clean text ❌✅🎯",
    "🧠🎤Arrived as 🎮🔖 described. Very happy!.",
]

# Show each input next to the result of ct.clean() for a visual comparison.
for raw_text in test_cases:
    print(f"Original: {raw_text}")
    cleaned_text = ct.clean(raw_text)
    print(f"Cleaned:  {cleaned_text}")
    print(" ")

Original:   EASY TO UNDERSTAND AND A PROMPT    SERVICE TOO	 !!!!!!!!!!! 😍😍 
Cleaned:  easy to understand and a prompt service too
 
Original: 💀💀 URL TeStInG: www.ilovepdf.com  i love it 🐍 so much let's test it out 
Cleaned:  url testing i love it so much let's test it out
 
Original: None
Cleaned:  
 
Original:   Already clean text ❌✅🎯
Cleaned:  already clean text
 
Original: 🧠🎤Arrived as 🎮🔖 described. Very happy!.
Cleaned:  arrived as described very happy
 


In [8]:
# Step 5 - Try language detection and verify that English is the most common language

# langdetect is non-deterministic by default (short or ambiguous texts can get a
# different label on each run). Fixing the seed makes the counts reproducible.
DetectorFactory.seed = 0

# Reuse the preview loaded in step 1: drop missing reviews and keep at most 200 texts
samples = df_preview["reviewText"].dropna().head(200).tolist()

lang_count = {}  # language code -> number of reviews detected with that language

for review in samples:
    try:
        lang = detect(review)
    except Exception:
        # Best effort: a review that cannot be classified is counted as "unknown"
        # rather than aborting the whole tally. `Exception` (instead of a bare
        # `except:`) keeps KeyboardInterrupt / SystemExit working.
        lang = "unknown"

    lang_count[lang] = lang_count.get(lang, 0) + 1

print("This result is for the file: Electronics")
lang_count

This result is for the file: Electronics


{'en': 198, 'id': 1, 'tl': 1}

In [None]:
# Step 6 - Try the project's language-detection module

# Run it on the same review texts sampled from the dataset in step 5
english_count = 0

for text in samples:
    # Called only to check that detect_lan() runs on real reviews; its result
    # is not needed for the count below.
    detect_lan(text)

    # NOTE(review): assumes is_english() returns a bool — confirm in
    # src/text/lang_detection.py.
    if is_english(text):
        english_count += 1

# Report how many of the sampled reviews were classified as English
print(f"Total samples in english: {english_count} of {len(samples)}")


Total samples in english: 191 of 200
