<a href="https://colab.research.google.com/github/lebe1/text-oriented-data-science-project/blob/main/Data_Preparation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Loading and Preparation

## Connect to Google Drive

In [None]:
# Mount Google Drive at /content/drive so the data folder below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive directory that holds the raw JSON dumps and receives the generated CSV files.
folder_path = '/content/drive/MyDrive/DOPP_Ex2_data/'

## Imports

In [None]:
# Uncomment to install stanza if it is missing from the runtime.
#!pip install stanza

In [None]:
import pandas as pd
import json
import os
import random
import stanza
import nltk
from nltk.corpus import stopwords
import re
from tqdm import tqdm
import string

## JSON to CSV Conversion

In [None]:
# Review dumps (one JSON object per line, located in folder_path) to convert to CSV.
json_files = [
    "Clothing_Shoes_and_Jewelry_5.json",
    "All_Beauty_5.json",
    "AMAZON_FASHION_5.json",
    "Luxury_Beauty_5.json"
]

def json_to_csv(json_path, csv_path, line_limit=30000, sample_size=3000, seed=42):
    """Convert the head of a JSON-lines file into a randomly sampled CSV.

    Reads at most ``line_limit`` lines from ``json_path`` (one JSON object per
    line), keeps a random sample of ``sample_size`` records when more than that
    were parsed, and writes the result to ``csv_path`` without the index column.
    A one-line summary is printed when the file has been written.

    Args:
        json_path: Path of the JSON-lines input file.
        csv_path: Path of the CSV file to write.
        line_limit: Maximum number of input lines to consume. Blank lines count
            towards this limit but do not produce a record.
        sample_size: Maximum number of records kept in the output.
        seed: Seed for the sampling, so repeated runs select the same records.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # A private generator draws the same sample as seeding the module-level
    # one would, but leaves the notebook's global random state untouched.
    rng = random.Random(seed)
    data = []

    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(json_path, 'r', encoding='utf-8') as file:
        for i, line in enumerate(file):
            if i >= line_limit:
                break
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            data.append(json.loads(line))

    if len(data) > sample_size:
        data = rng.sample(data, sample_size)

    df = pd.DataFrame(data)
    df.to_csv(csv_path, index=False)
    print(f"Converted {json_path} to {csv_path} with {len(df)} samples.")

In [None]:
# Convert each raw dump into a sampled CSV stored next to it in folder_path.
for json_file in json_files:
    csv_file = json_file.replace('.json', '.csv')
    json_path = os.path.join(folder_path, json_file)
    csv_path = os.path.join(folder_path, csv_file)
    json_to_csv(
        json_path,
        csv_path,
        line_limit=30000,
        sample_size=3000,
        seed=42,
    )

Converted /content/drive/MyDrive/DOPP_Ex2_data/Clothing_Shoes_and_Jewelry_5.json to /content/drive/MyDrive/DOPP_Ex2_data/Clothing_Shoes_and_Jewelry_5.csv with 3000 samples.
Converted /content/drive/MyDrive/DOPP_Ex2_data/All_Beauty_5.json to /content/drive/MyDrive/DOPP_Ex2_data/All_Beauty_5.csv with 3000 samples.
Converted /content/drive/MyDrive/DOPP_Ex2_data/AMAZON_FASHION_5.json to /content/drive/MyDrive/DOPP_Ex2_data/AMAZON_FASHION_5.csv with 3000 samples.
Converted /content/drive/MyDrive/DOPP_Ex2_data/Luxury_Beauty_5.json to /content/drive/MyDrive/DOPP_Ex2_data/Luxury_Beauty_5.csv with 3000 samples.


## Importing CSV Files

In [None]:
csv_files = [
    "All_Beauty_5.csv",
    "AMAZON_FASHION_5.csv",
    "Clothing_Shoes_and_Jewelry_5.csv",
    "Luxury_Beauty_5.csv"
]

# Read every sampled CSV into a dict keyed by the file name without ".csv".
dataframes = {}
for csv_file in csv_files:
    df_name = csv_file.replace('.csv', '')
    csv_path = os.path.join(folder_path, csv_file)
    dataframes[df_name] = pd.read_csv(csv_path)
    print(f"Loaded {csv_file} into dataframe '{df_name}' with {len(dataframes[df_name])} rows.")

# One convenience handle per category frame.
(
    All_Beauty_df,
    Amazon_Fashion_df,
    Clothing_Shoes_and_Jewelry_df,
    Luxury_Beauty_df,
) = (
    dataframes["All_Beauty_5"],
    dataframes["AMAZON_FASHION_5"],
    dataframes["Clothing_Shoes_and_Jewelry_5"],
    dataframes["Luxury_Beauty_5"],
)

Loaded All_Beauty_5.csv into dataframe 'All_Beauty_5' with 3000 rows.
Loaded AMAZON_FASHION_5.csv into dataframe 'AMAZON_FASHION_5' with 3000 rows.
Loaded Clothing_Shoes_and_Jewelry_5.csv into dataframe 'Clothing_Shoes_and_Jewelry_5' with 3000 rows.
Loaded Luxury_Beauty_5.csv into dataframe 'Luxury_Beauty_5' with 3000 rows.


In [None]:
# Preview the first rows of the All_Beauty sample with all of its columns.
All_Beauty_df.head()

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,reviewerName,reviewText,summary,unixReviewTime,style,vote,image
0,5.0,True,"04 26, 2016",A1ZH9XEBQIPLWT,B00UWB35UY,JOYCE SCOTT,love them,Five Stars,1461628800,,,
1,5.0,True,"03 23, 2017",A2RFDGEW20UK6W,B000FI4S1E,Betty,I wish they continue with this fragrance,I love the smell of it.,1490227200,,,
2,5.0,False,"05 5, 2016",A3ETTJOVI6C9V5,B00006L9LC,W. Keane,"I LOVE the smell, the texture, everything abou...",I LOVE the smell,1462406400,{'Size:': ' 586'},,
3,5.0,True,"04 25, 2018",A1118RD3AJD5KH,B0012Y0ZG2,DL,works great,Five Stars,1524614400,{'Size:': ' 511'},,
4,5.0,True,"03 27, 2014",AGOH8N902URMW,B000URXP6E,Zeb,This gel is a genuine imported product from Fr...,My wife loves this product,1395878400,{'Size:': ' 10.2 oz'},2.0,


In [None]:
# Columns carried forward into the combined dataset; everything else is dropped.
columns_to_keep = ["overall", "reviewTime", "reviewerID", "reviewText", "summary", "unixReviewTime"]


# Filtered frames keyed by category name (file name without the "_5.csv" suffix).
filtered_dataframes = {}

for csv_file in csv_files:
    # Reuse the frame already loaded into `dataframes` rather than reading
    # the same CSV from Drive a second time.
    df = dataframes[csv_file.replace('.csv', '')]

    # Normalise the one all-caps file name so category labels share a style.
    category_name = csv_file.replace("_5.csv", "").replace("AMAZON_FASHION", "Amazon_Fashion")

    # Selecting columns then .assign() yields a new frame, so the loaded
    # frame in `dataframes` is left unmodified.
    df_filtered = df[columns_to_keep].assign(category=category_name)

    filtered_dataframes[category_name] = df_filtered

In [None]:
# Rebind the per-category names to the filtered frames; from here on they no
# longer refer to the full-column frames assigned in the loading cell.
All_Beauty_df = filtered_dataframes["All_Beauty"]
Amazon_Fashion_df = filtered_dataframes["Amazon_Fashion"]
Clothing_Shoes_and_Jewelry_df = filtered_dataframes["Clothing_Shoes_and_Jewelry"]
Luxury_Beauty_df = filtered_dataframes["Luxury_Beauty"]

In [None]:
# Preview the filtered All_Beauty frame (kept columns plus the category label).
All_Beauty_df.head()

Unnamed: 0,overall,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category
0,5.0,"04 26, 2016",A1ZH9XEBQIPLWT,love them,Five Stars,1461628800,All_Beauty
1,5.0,"03 23, 2017",A2RFDGEW20UK6W,I wish they continue with this fragrance,I love the smell of it.,1490227200,All_Beauty
2,5.0,"05 5, 2016",A3ETTJOVI6C9V5,"I LOVE the smell, the texture, everything abou...",I LOVE the smell,1462406400,All_Beauty
3,5.0,"04 25, 2018",A1118RD3AJD5KH,works great,Five Stars,1524614400,All_Beauty
4,5.0,"03 27, 2014",AGOH8N902URMW,This gel is a genuine imported product from Fr...,My wife loves this product,1395878400,All_Beauty


In [None]:
# Stack the four category frames, shuffle the rows reproducibly, renumber the
# index, and expose the star rating under the name "rating".
combined_df = (
    pd.concat(
        [
            All_Beauty_df,
            Amazon_Fashion_df,
            Clothing_Shoes_and_Jewelry_df,
            Luxury_Beauty_df,
        ],
        ignore_index=True,
    )
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
    .rename(columns={"overall": "rating"})
)

combined_df.head(15)

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category
0,5.0,"01 16, 2017",ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty
1,5.0,"12 8, 2008",A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry
2,5.0,"02 8, 2013",A1D18EJF6LHYDV,I was nervousness about the scent because IVe ...,Smells great,1360281600,All_Beauty
3,5.0,"02 15, 2018",A25EOTX5I354I2,"I LOVE the smell. A bit expensive, so I cant b...",Five Stars,1518652800,Luxury_Beauty
4,5.0,"11 11, 2013",A1DFZPQPCHBYTY,Found this stuff in Japan and wondered if I co...,Super lathery nice soap!,1384128000,All_Beauty
5,5.0,"10 25, 2016",A2VBBEPR330C5C,I had never used a tinted sunscreen before and...,I had never used a tinted sunscreen before and...,1477353600,Luxury_Beauty
6,5.0,"06 19, 2017",ADTQ22MUSQFIR,Quick tranasaction!!! Loved the shoes!!! Wou...,Awesome shoes!!!,1497830400,Amazon_Fashion
7,5.0,"02 13, 2016",A34H7IZZ7SYL1E,My husband wore his first pair to death. They ...,Husbands fave shoe!,1455321600,Clothing_Shoes_and_Jewelry
8,5.0,"08 21, 2015",A1A7LP8GUKEPZM,"Great Product, Great Price!",Five Stars,1440115200,All_Beauty
9,5.0,"04 11, 2017",AT72GRKOXVE25,just do it :),Five Stars,1491868800,Amazon_Fashion


## NLP pipeline

In [None]:
# Fetch the NLTK and Stanza resources needed by the NLP cells below.
# NOTE(review): no visible cell uses 'punkt' directly — confirm it is still needed.
nltk.download('punkt')
nltk.download('stopwords')  # word lists read via stopwords.words('english')
stanza.download('en')  # English models loaded by stanza.Pipeline('en', ...)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: en (English) ...
INFO:stanza:File exists: /root/stanza_resources/en/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources


In [None]:
# English Stanza pipeline limited to the tokenize and lemma processors;
# process_text() calls it once per review.
nlp_pipeline = stanza.Pipeline('en', processors='tokenize,lemma')

INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json:   0%|   …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| lemma     | combined_nocharlm |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
INFO:stanza:Loading: mwt
INFO:stanza:Loading: lemma
INFO:stanza:Done loading processors!


In [None]:
# Registers DataFrame/Series.progress_apply, used to run process_text with a progress bar.
tqdm.pandas()
stop_words = set(stopwords.words('english'))
# Built once instead of on every call: deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)

def process_text(text):
    """Turn one review text into a list of lower-cased, stop-word-free lemmas.

    Punctuation from ``string.punctuation`` is removed first, the remainder is
    run through ``nlp_pipeline``, and lemmas found in ``stop_words`` (compared
    in lower case) are discarded.

    Args:
        text: The review text. Non-string values are converted with ``str()``;
            ``None`` and NaN are treated as a missing review.

    Returns:
        A list of lower-cased lemma strings; empty when the review is missing
        or contains nothing but punctuation/whitespace.
    """
    # Missing reviews arrive from pandas as NaN (or None); str() would turn
    # them into the bogus word "nan"/"None" and feed that to the pipeline.
    if text is None or (isinstance(text, float) and text != text):
        return []

    text = str(text).translate(_PUNCT_TABLE)
    # Nothing left to tokenize: skip the (comparatively slow) pipeline call.
    if not text.strip():
        return []

    doc = nlp_pipeline(text)
    # NOTE(review): the truthiness check guards against a word without a
    # lemma — confirm whether the lemma processor can actually yield None.
    tokens = [
        word.lemma.lower()
        for sentence in doc.sentences
        for word in sentence.words
        if word.lemma
    ]
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [None]:
# Lemmatize every review with a progress bar (progress_apply is provided by
# tqdm.pandas()); the recorded run took about 6 minutes for 12,000 rows.
combined_df['reviewToken'] = combined_df['reviewText'].progress_apply(process_text)

100%|██████████| 12000/12000 [06:14<00:00, 32.02it/s]


In [None]:
# Spot-check the token lists produced for the first reviews.
combined_df['reviewToken'].head(10)

Unnamed: 0,reviewToken
0,"[great, product, want, works, great, stylish]"
1,"[see, popularity, shoe, decide, test, impresse..."
2,"[nervousness, scent, ive, never, try, love, pa..."
3,"[love, smell, bit, expensive, buy, often, woul..."
4,"[found, stuff, japan, wonder, could, find, 3, ..."
...,...
11995,[work]
11996,"[always, buy, size, one, come, large]"
11997,"[love, super, comfortable, nice, get, expect, ..."
11998,[excellent]


## Save df as csv

In [None]:
# Overwrite the textual reviewTime column (e.g. "04 26, 2016") with datetimes
# derived from the Unix timestamps, which are interpreted as seconds.
combined_df['reviewTime'] = pd.to_datetime(combined_df['unixReviewTime'], unit='s')

In [None]:
# Final look at the frame before it is written to disk.
combined_df.head(5)

Unnamed: 0,rating,reviewTime,reviewerID,reviewText,summary,unixReviewTime,category,reviewToken
0,5.0,2017-01-16,ASWLL1VJA7WOG,Great product... just what I wanted. Works gr...,Five Stars,1484524800,All_Beauty,"[great, product, want, works, great, stylish]"
1,5.0,2008-12-08,A265K3A7V83112,"After seeing the popularity of this shoe, I de...",What can i say? chucks rock,1228694400,Clothing_Shoes_and_Jewelry,"[see, popularity, shoe, decide, test, impresse..."
2,5.0,2013-02-08,A1D18EJF6LHYDV,I was nervousness about the scent because IVe ...,Smells great,1360281600,All_Beauty,"[nervousness, scent, ive, never, try, love, pa..."
3,5.0,2018-02-15,A25EOTX5I354I2,"I LOVE the smell. A bit expensive, so I cant b...",Five Stars,1518652800,Luxury_Beauty,"[love, smell, bit, expensive, buy, often, woul..."
4,5.0,2013-11-11,A1DFZPQPCHBYTY,Found this stuff in Japan and wondered if I co...,Super lathery nice soap!,1384128000,All_Beauty,"[found, stuff, japan, wonder, could, find, 3, ..."


In [None]:
# Derive the output location from folder_path (as the other cells do) so that
# changing the data folder in one place also moves the combined file.
output_path = os.path.join(folder_path, 'combined_reviews.csv')

# The list-valued reviewToken cells are written as their string form; readers
# of this CSV have to parse them back into lists themselves.
combined_df.to_csv(output_path, index=False)