In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the electronics sample from the parquet file (relative path).
df = pd.read_parquet(
    '../data/electronics_sample_2M.parquet',
)

In [3]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() -- doing so appends a stray "None" line.
df.info()
# Last expression of the cell: preview of the first rows.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 7 columns):
 #   Column          Dtype  
---  ------          -----  
 0   reviewerID      object 
 1   asin            object 
 2   overall         float32
 3   unixReviewTime  int64  
 4   title           object 
 5   brand           object 
 6   categories      object 
dtypes: float32(1), int64(1), object(5)
memory usage: 99.2+ MB
None


Unnamed: 0,reviewerID,asin,overall,unixReviewTime,title,brand,categories
0,AHAGVEUWD2IRJFPCJLOAYSJ7XNLQ,B008YQAG5Q,5.0,1393181267000,HP BD-R DL 6X 50GB Double Layer 10 Pack in Spi...,HP,"[Electronics, Computers & Accessories, Compute..."
1,AHYKM5KHAJNN3KQ2EXO25LCCKDFA,B0711V1WXC,2.0,1524659491257,Fintie Slim Case for Amazon Fire 7 Tablet (Pre...,Fintie,"[Electronics, Computers & Accessories, Tablet ..."
2,AHZC5B3S7CRPM5S476EIQJKWUS6Q,B00DOHVUIM,5.0,1414421305000,Kuzy Compatible with MacBook Keyboard Cover fo...,Kuzy,"[Electronics, Computers & Accessories, Compute..."
3,AGRTA2KJTZWKSFVLMSK2K7RGAKMQ,B000BNY64C,4.0,1376084786000,STK BP-511a BP-511 2 Pack Battery for Canon Re...,SterlingTek,"[Electronics, Camera & Photo, Accessories, Bat..."
4,AEYUTNEKMD56ZZQ3ZC7XOJDHGRLQ,B0BT9MK3XK,5.0,1613463323771,"Nulaxy Tablet Stand, Fully Adjustable Foldable...",Nulaxy,"[Electronics, Computers & Accessories, Tablet ..."


## Target Variable (binary)

In [4]:
# Binary target: 1 where the 'overall' rating is at least 4.0, otherwise 0.
df['positive'] = df['overall'].ge(4.0).astype(int)

# Report the share of each class (fractions, not raw counts).
print("\nTarget variable distribution:")
class_shares = df['positive'].value_counts(normalize=True)
print(class_shares)


Target variable distribution:
positive
1    0.800894
0    0.199106
Name: proportion, dtype: float64


## Light Feature Cleaning

In [7]:
# Number of missing entries in each of the three feature columns.
missing_title = df['title'].isna().sum()
missing_brand = df['brand'].isna().sum()
null_categories = df['categories'].isna().sum()

# One count per line, in the order: title, brand, categories.
print(missing_title, missing_brand, null_categories, sep="\n")

0
1645
0


In [8]:
# Replace missing 'brand' values with the placeholder 'Unknown'.
print("\nChecking and filling null values...")
brand_filled = df['brand'].fillna(value='Unknown')
df['brand'] = brand_filled

print("Feature cleaning complete.")


Checking and filling null values...
Feature cleaning complete.


In [9]:
# Re-count missing 'brand' entries and print the result.
missing_brand = df['brand'].isna().sum()
print(missing_brand)

0


In [10]:
# Print the frame summary (index, columns, dtypes, memory usage); info() writes to stdout itself.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000000 entries, 0 to 1999999
Data columns (total 8 columns):
 #   Column          Dtype  
---  ------          -----  
 0   reviewerID      object 
 1   asin            object 
 2   overall         float32
 3   unixReviewTime  int64  
 4   title           object 
 5   brand           object 
 6   categories      object 
 7   positive        int64  
dtypes: float32(1), int64(2), object(5)
memory usage: 114.4+ MB


In [11]:
# Preview the first rows of the frame, now including the 'positive' column.
df.head()

Unnamed: 0,reviewerID,asin,overall,unixReviewTime,title,brand,categories,positive
0,AHAGVEUWD2IRJFPCJLOAYSJ7XNLQ,B008YQAG5Q,5.0,1393181267000,HP BD-R DL 6X 50GB Double Layer 10 Pack in Spi...,HP,"[Electronics, Computers & Accessories, Compute...",1
1,AHYKM5KHAJNN3KQ2EXO25LCCKDFA,B0711V1WXC,2.0,1524659491257,Fintie Slim Case for Amazon Fire 7 Tablet (Pre...,Fintie,"[Electronics, Computers & Accessories, Tablet ...",0
2,AHZC5B3S7CRPM5S476EIQJKWUS6Q,B00DOHVUIM,5.0,1414421305000,Kuzy Compatible with MacBook Keyboard Cover fo...,Kuzy,"[Electronics, Computers & Accessories, Compute...",1
3,AGRTA2KJTZWKSFVLMSK2K7RGAKMQ,B000BNY64C,4.0,1376084786000,STK BP-511a BP-511 2 Pack Battery for Canon Re...,SterlingTek,"[Electronics, Camera & Photo, Accessories, Bat...",1
4,AEYUTNEKMD56ZZQ3ZC7XOJDHGRLQ,B0BT9MK3XK,5.0,1613463323771,"Nulaxy Tablet Stand, Fully Adjustable Foldable...",Nulaxy,"[Electronics, Computers & Accessories, Tablet ...",1
