In [29]:
import pandas as pd
import re

# Read the data

In [62]:
data = pd.read_csv('Amazon.csv')
data.columns

Index(['product_id', 'product_name', 'category', 'discounted_price',
       'actual_price', 'discount_percentage', 'rating', 'rating_count',
       'about_product', 'user_id', 'user_name', 'review_id', 'review_title',
       'review_content', 'img_link', 'product_link'],
      dtype='object')

# Remove the columns that are not needed

In [63]:
data = data[['product_name', 'about_product', 'review_title', 'review_content', 'img_link']]
data.columns

Index(['product_name', 'about_product', 'review_title', 'review_content',
       'img_link'],
      dtype='object')

In [44]:
data.head()

Unnamed: 0,product_name,about_product,review_title,review_content,img_link
0,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Connects your computer to a high-speed wireles...,"good tool to use for,Brand is always good,Over...",good quality tool from d linkWiFi signal is go...,https://m.media-amazon.com/images/I/31+NwZ8gb1...
1,D-Link DWA-131 300 Mbps Wireless Nano USB Adap...,Connects your computer to a high-speed wireles...,"good tool to use for,Brand is always good,Over...",good quality tool from d linkWiFi signal is go...,https://m.media-amazon.com/images/W/WEBP_40237...
2,TP-Link Nano USB WiFi Dongle 150Mbps High Gain...,150 Mbps Wi-Fi —— Exceptional wireless speed u...,Works on linux for me. Get the model with ante...,I use this to connect an old PC to internet. I...,https://m.media-amazon.com/images/I/31Wb+A3VVd...
3,Duracell Plus AAA Rechargeable Batteries (750 ...,Duracell Rechargeable AAA 750mAh batteries sta...,"Works Good,Perfect replacement cell for trimme...","Works good,Bought it to replace my Phillips QT...",https://m.media-amazon.com/images/I/418YrbHVLC...
4,"Logitech B100 Wired USB Mouse, 3 yr Warranty, ...","A comfortable, ambidextrous shape feels good i...","Handy Mouse,Good quality mouse,Good one.,Good,...","Liked this Product,https://m.media-amazon.com/...",https://m.media-amazon.com/images/I/31iFF1Kbkp...


# Merge Amazon Scrap data

In [45]:
!wget -O amazon-scrap.csv https://comp576-amazon-raw-files.s3.amazonaws.com/amazon-scrap.csv

--2023-11-28 13:46:40--  https://comp576-amazon-raw-files.s3.amazonaws.com/amazon-scrap.csv
正在查找主機 comp576-amazon-raw-files.s3.amazonaws.com (comp576-amazon-raw-files.s3.amazonaws.com)... 3.5.11.119, 52.216.216.209, 52.217.104.156, ...
正在連接 comp576-amazon-raw-files.s3.amazonaws.com (comp576-amazon-raw-files.s3.amazonaws.com)|3.5.11.119|:443... 連上了。
已送出 HTTP 要求，正在等候回應... 200 OK
長度: 1951873 (1.9M) [text/csv]
儲存到：「amazon-scrap.csv」


2023-11-28 13:46:41 (5.57 MB/s) - 已儲存 「amazon-scrap.csv」 [1951873/1951873]



In [64]:
df_scrap = pd.read_csv('amazon-scrap.csv')

data = pd.concat([data, df_scrap], ignore_index=True)

# Remove the text in brackets in product_name

In [65]:
data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "")

  data['product_name'] = data['product_name'].str.replace(r"\(.*?\)", "")


In [66]:
from cleantext import clean

def clean_text(text, remove_comma=False):

    # Remove non-English words but keep '.', ',', '!'
    text = re.sub('[^a-zA-Z0-9 \n.,!-]', '', text)

    # Remove comma if nothing follows
    text = ','.join([word for word in text.split(',') if word.strip()])

    # # Add space to number and unit, ex: 150ft -> 150 ft
    text = re.sub(r'(\d+)([A-Za-z]+)', r'\1 \2', text)

    # Remove all text after ','
    if remove_comma:
        text = re.sub(r'[\,].*', '', text)

    # # Remove quote
    text = text.replace('"', '').replace("'", '')

    text = clean(text,
              fix_unicode=True,               # fix various unicode errors
              to_ascii=True,                  # transliterate to closest ASCII representation
              lower=True,                     # lowercase text
              no_line_breaks=False,           # fully strip line breaks as opposed to only normalizing them
              no_urls=True,                   # replace all URLs with a special token
              no_emails=False,                # replace all email addresses with a special token
              no_phone_numbers=False,         # replace all phone numbers with a special token
              no_numbers=False,               # replace all numbers with a special token
              no_digits=False,                # replace all digits with a special token
              no_currency_symbols=False,      # replace all currency symbols with a special token
              no_punct=False,                 # remove punctuations
              no_emoji=True,                  # remove emojis
              replace_with_punct="",          # instead of removing punctuations you may replace them
              replace_with_url="",
              replace_with_email="",
              replace_with_phone_number="",
              replace_with_number="<NUMBER>",
              replace_with_currency_symbol="",
              lang="en"                       # set to 'de' for German special handling
              )
    return text

# Remove URLs and non-English words

Remove row if column is nan


In [67]:
data = data[data['review_content'].notna()]

In [70]:
data['product_name'] = data['product_name'].apply(clean_text, remove_comma=True)
# data['about_product'] = data['about_product'].apply(clean_text)
data['review_title'] = data['review_title'].apply(clean_text)
data['review_content'] = data['review_content'].apply(clean_text)

# Download the images

In [87]:
!mkdir image

In [11]:
data['img_link'].head()

0    https://m.media-amazon.com/images/I/31+NwZ8gb1...
1    https://m.media-amazon.com/images/W/WEBP_40237...
2    https://m.media-amazon.com/images/I/31Wb+A3VVd...
3    https://m.media-amazon.com/images/I/418YrbHVLC...
4    https://m.media-amazon.com/images/I/31iFF1Kbkp...
Name: img_link, dtype: object

In [51]:
import requests
from PIL import Image

def download_image(url, file_name, index):
    r = requests.get(url, stream=True)

    # First check if the image is already downloaded
    try:
        img = Image.open('image/' + str(index) + '.jpg')

        return True
    except:
        # If not, download it
        if r.status_code == 200:
            with open(file_name, 'wb') as f:
                for chunk in r:
                    f.write(chunk)

            return True

        else:
            print('Image couldn\'t be retrieved for ' + file_name, url)
            return False

In [52]:
skipped_index = set()

for i, row in data.iterrows():
    # if image exists, skip
    exist = download_image(row['img_link'], 'image/' + str(i) + '.jpg', i)

    if not exist:
        skipped_index.add(i)

Image couldn't be retrieved for image/9.jpg https://m.media-amazon.com/images/W/WEBP_402378-T2/images/I/41vJcrdr5mL._SY300_SX300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/10.jpg https://m.media-amazon.com/images/W/WEBP_402378-T2/images/I/313jBpnrJVL._SX300_SY300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/11.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/21m+6LxEnOL._SY300_SX300_.jpg
Image couldn't be retrieved for image/12.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/31c6zDmtEnL._SY300_SX300_QL70_FMwebp_.jpg
Image couldn't be retrieved for image/19.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/31M+TYWPdQL._SY300_SX300_.jpg
Image couldn't be retrieved for image/20.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/316Q0fvU+2L._SY300_SX300_.jpg
Image couldn't be retrieved for image/21.jpg https://m.media-amazon.com/images/W/WEBP_402378-T1/images/I/41Xp77o+-YL._SX300_SY300_.jpg
Image couldn't be re

In [71]:
len(skipped_index)

312

In [72]:
# Drop the rows that don't have images
data = data.drop(skipped_index)

In [73]:
len(data)

5998

# Save the data

In [74]:
data.to_csv('Amazon_cleaned.csv', index_label='index')

In [59]:
from PIL import Image

# check if every image is downloaded

for i, row in data.iterrows():
    try:
        img = Image.open('image/' + str(i) + '.jpg')
    except:
        print(i)

In [75]:
!zip -r -qq image.zip image

In [None]:
!aws s3 ls --profile iamadmin-general

2022-02-12 05:33:10 cf-templates-7x4o2u7ssxah-us-east-1
2023-11-16 13:07:55 comp576-image-data


In [76]:
!aws s3 cp image.zip s3://comp576-image-data/ --acl public-read

zsh:1: command not found: aws


In [None]:
!aws s3 cp image.zip s3://comp576-image-data/ --acl public-read --profile iamadmin-general

upload: ./image.zip to s3://comp576-image-data/image.zip          


In [None]:
!aws s3 ls s3://comp576-image-data/ --profile iamadmin-general

2023-11-20 18:18:00    7369597 image.zip


# Object url: https://comp576-image-data.s3.us-east-2.amazonaws.com/image.zip