### Basic library imports

In [1]:
import os
import pandas as pd

### Read Dataset

In [2]:
# All four CSV files sit in the same folder; read them in one pass, in the
# order train -> test -> sample_test -> sample_test_out.
DATASET_FOLDER = '../dataset/'
train, test, sample_test, sample_test_out = (
    pd.read_csv(os.path.join(DATASET_FOLDER, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_test.csv', 'sample_test_out.csv')
)

### Run Sanity check using src/sanity.py

In [3]:
# Run sanity.py with sample_test.csv as the test file and sample_test_out.csv as the output file to check.
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [4]:
# Same check, but with sample_test_out_fail.csv as the output file; the recorded run reports an invalid unit.
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'metre', 'pint', 'yard', 'volt', 'millivolt', 'millilitre', 'gallon', 'foot', 'microgram', 'kilovolt', 'decilitre', 'kilogram', 'pound', 'litre', 'inch', 'fluid ounce', 'gram', 'cubic inch', 'microlitre', 'millimetre', 'kilowatt', 'milligram', 'centimetre', 'imperial gallon', 'watt', 'cubic foot', 'cup', 'quart', 'ton', 'ounce', 'centilitre'}


In [5]:
# Work on the first `batch_size` rows of the training frame only.
batch_size = 5000
batch = train.iloc[:batch_size]

### Download images

In [6]:
# download_images comes from the project's utils module. It is called with the
# batch's image_link column and '../images' (presumably the destination
# folder -- confirm against utils.download_images).
from utils import download_images
download_images(batch['image_link'], '../images')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5000/5000 [01:45<00:00, 47.25it/s]


In [34]:
# An empty directory listing is falsy, so this fails when '../images' has no entries.
assert os.listdir('../images')

In [4]:
# Delete the images directory. shutil.rmtree replaces the bare `rm -rf`, which
# only ran through IPython's automagic shell alias and is unavailable on
# platforms without `rm`. ignore_errors=True mirrors `-f`: a missing directory
# is not treated as an error.
import shutil

shutil.rmtree('../images', ignore_errors=True)

## Let's understand the dataset first

In [8]:
# Show the training DataFrame as the cell output.
train

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [9]:
# Print the index range, columns, non-null counts, dtypes and memory usage.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263859 entries, 0 to 263858
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   image_link    263859 non-null  object
 1   group_id      263859 non-null  int64 
 2   entity_name   263859 non-null  object
 3   entity_value  263859 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.1+ MB


In [10]:
# Number of distinct values in each column.
train.nunique()

image_link      255906
group_id           750
entity_name          8
entity_value     16405
dtype: int64

In [11]:
# Descriptive statistics; in the recorded output only group_id is summarised.
train.describe()

Unnamed: 0,group_id
count,263859.0
mean,545809.847525
std,249488.149296
min,101697.0
25%,311997.0
50%,524635.0
75%,752266.0
max,998545.0


In [12]:
# Count missing values per column.
train.isna().sum()

image_link      0
group_id        0
entity_name     0
entity_value    0
dtype: int64

## No Null Values

In [13]:
# Count rows that repeat an earlier row across all columns.
print("Duplicate Rows : ", train.duplicated().sum())

Duplicate Rows :  0


In [14]:
# Keep every row whose `entity_name` already occurred in an earlier row
# (DataFrame.duplicated flags all but the first occurrence by default).
duplicates = train.loc[train.duplicated(subset=['entity_name'])]
duplicates

Unnamed: 0,image_link,group_id,entity_name,entity_value
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram
5,https://m.media-amazon.com/images/I/61QsBSE7jg...,731432,item_weight,1400 milligram
6,https://m.media-amazon.com/images/I/81xsq6vf2q...,731432,item_weight,1400 milligram
...,...,...,...,...
263854,https://m.media-amazon.com/images/I/612J1R1xHl...,558806,height,5.0 centimetre
263855,https://m.media-amazon.com/images/I/61Blzh2+28...,470067,height,8.5 inch
263856,https://m.media-amazon.com/images/I/51MsegDL9V...,204245,height,43.2 centimetre
263857,https://m.media-amazon.com/images/I/510KhVw4VS...,752266,height,9.1 centimetre


In [15]:
!pip install pytesseract easyocr paddleocr google-cloud-vision

Collecting pytesseract
  Using cached pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl.metadata (11 kB)
Collecting paddleocr
  Downloading paddleocr-2.8.1-py3-none-any.whl.metadata (19 kB)
Collecting google-cloud-vision
  Downloading google_cloud_vision-3.7.4-py2.py3-none-any.whl.metadata (5.2 kB)
Collecting torchvision>=0.5 (from easyocr)
  Using cached torchvision-0.19.1-cp312-cp312-macosx_11_0_arm64.whl.metadata (6.0 kB)
Collecting opencv-python-headless (from easyocr)
  Downloading opencv_python_headless-4.10.0.84-cp37-abi3-macosx_11_0_arm64.whl.metadata (20 kB)
Collecting scikit-image (from easyocr)
  Downloading scikit_image-0.24.0-cp312-cp312-macosx_12_0_arm64.whl.metadata (14 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.0-cp312-cp312-macosx_11_0_arm64.whl.metadata (4.6 kB)
Collecting Shapely (from easyocr)
  Downloading shapely-2.0.6-cp312-cp312-macosx_11_0_arm64.whl.metadata (7.0 kB)

In [16]:
!pip install pytesseract



In [18]:
import pytesseract
from PIL import Image
def tesseract_ocr(img):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        img: Anything accepted by ``PIL.Image.open`` (the notebook passes a
            file path).

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the opened image.
    """
    # Open the image in a ``with`` block so its file handle is released as
    # soon as OCR finishes instead of lingering until garbage collection.
    with Image.open(img) as image:
        return pytesseract.image_to_string(image)

   # Usage
# Relative path into the '../images' folder this notebook downloads to, so the
# cell is not tied to one user's home directory.
text = tesseract_ocr('../images/81e2YtCOKvL.jpg')
print(text)

ModuleNotFoundError: No module named 'pytesseract'

In [24]:
import easyocr

def easy_ocr(image_path, reader=None):
    """Read the text in an image with EasyOCR and return it as one string.

    Args:
        image_path: Image to read; passed straight to ``reader.readtext``.
        reader: Optional object exposing ``readtext`` (e.g. an
            ``easyocr.Reader``) to reuse across calls. When omitted, a new
            English reader is created on every call; pass one in when
            processing many images to avoid rebuilding it each time.

    Returns:
        The second field of every ``readtext`` result, joined by single
        spaces (an empty string when nothing was detected).
    """
    if reader is None:
        reader = easyocr.Reader(['en'])  # Initialize for English
    results = reader.readtext(image_path)
    return ' '.join(result[1] for result in results)

# Usage
# Relative path into the '../images' folder this notebook downloads to, so the
# cell is not tied to one user's home directory.
text = easy_ocr('../images/81e2YtCOKvL.jpg')
print(text)

ModuleNotFoundError: No module named 'easyocr'

In [23]:
!pip install easyocr==1.1.4



In [25]:
import easyocr

def easy_ocr(image_path, reader=None):
    """Read the text in an image with EasyOCR and return it as one string.

    Args:
        image_path: Image to read; passed straight to ``reader.readtext``.
        reader: Optional object exposing ``readtext`` (e.g. an
            ``easyocr.Reader``) to reuse across calls. When omitted, a new
            English reader is created on every call; pass one in when
            processing many images to avoid rebuilding it each time.

    Returns:
        The second field of every ``readtext`` result, joined by single
        spaces (an empty string when nothing was detected).
    """
    if reader is None:
        reader = easyocr.Reader(['en'])  # Initialize for English
    results = reader.readtext(image_path)
    return ' '.join(result[1] for result in results)

# Usage
# Relative path into the '../images' folder this notebook downloads to, so the
# cell is not tied to one user's home directory.
text = easy_ocr('../images/81e2YtCOKvL.jpg')
print(text)

ModuleNotFoundError: No module named 'easyocr'

In [26]:
import pandas as pd 

In [None]:
# Call read_csv through the pandas namespace (no bare `read_csv` name is
# imported in this notebook) and build the path from DATASET_FOLDER rather
# than a machine-specific absolute path.
df = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))

In [None]:
# Show the `train` DataFrame as the cell output.
train

In [3]:
import pandas as pd

# Step 2: Read the original CSV file
# Paths are built from DATASET_FOLDER instead of an absolute home-directory
# path, so the cell runs on any checkout of the project.
original_csv = os.path.join(DATASET_FOLDER, 'train.csv')
df = pd.read_csv(original_csv)

# Step 3: Select the first 100,000 rows
df_first_100k = df.head(100_000)

# Step 4: Save the selected rows into a new CSV file
# index=False keeps the DataFrame index out of the written file.
new_csv = os.path.join(DATASET_FOLDER, 'claude.csv')
df_first_100k.to_csv(new_csv, index=False)