In [180]:
import os
from pathlib import Path

import pandas as pd

# Folder that holds the three take-home CSV files. The default is the original
# local location; set the FETCH_DATA_DIR environment variable to run this
# notebook on another machine without editing the cell.
DATA_DIR = Path(os.environ.get("FETCH_DATA_DIR", "/Users/nani/Desktop/Fetch"))

try:
    products = pd.read_csv(DATA_DIR / 'PRODUCTS_TAKEHOME.csv')
    transactions = pd.read_csv(DATA_DIR / 'TRANSACTION_TAKEHOME.csv')
    users = pd.read_csv(DATA_DIR / 'USER_TAKEHOME.csv')
except FileNotFoundError:
    # Re-raise after the friendly message: if the load is swallowed, the three
    # frames stay undefined and every later cell fails with a NameError that
    # hides the real cause.
    print("Error: One or more CSV files not found.")
    raise
except Exception as e:
    print(f"An error occurred: {e}")
    raise
else:
    # Quick schema check of what was just loaded.
    print(f"Products columns: {products.columns}")
    print(f"Transactions columns: {transactions.columns}")
    print(f"Users columns: {users.columns}")

Products columns: Index(['CATEGORY_1', 'CATEGORY_2', 'CATEGORY_3', 'CATEGORY_4', 'MANUFACTURER',
       'BRAND', 'BARCODE'],
      dtype='object')
Transactions columns: Index(['RECEIPT_ID', 'PURCHASE_DATE', 'SCAN_DATE', 'STORE_NAME', 'USER_ID',
       'BARCODE', 'FINAL_QUANTITY', 'FINAL_SALE'],
      dtype='object')
Users columns: Index(['ID', 'CREATED_DATE', 'BIRTH_DATE', 'STATE', 'LANGUAGE', 'GENDER'], dtype='object')


In [181]:
# Data-quality overview of the products table: shape, per-column null counts,
# fully duplicated rows, then one line per field of interest.
product_null_counts = products.isnull().sum()  # computed once, reused below

print("\n PRODUCTS_TAKEHOME.csv ")
print(" (rows, columns):", products.shape)
print("Missing values:\n", product_null_counts, "\n")
print("Duplicate rows:", products.duplicated().sum())

product_fields = ["CATEGORY_1","CATEGORY_2","CATEGORY_3", "CATEGORY_4", "MANUFACTURER", "BRAND", "BARCODE"]
# Select the fields in the listed order and report each count individually.
for col, missing in product_null_counts.loc[product_fields].items():
    print(f"Column '{col}' missing entries: {missing}")


 PRODUCTS_TAKEHOME.csv 
 (rows, columns): (845552, 7)
Missing values:
 CATEGORY_1         111
CATEGORY_2        1424
CATEGORY_3       60566
CATEGORY_4      778093
MANUFACTURER    226474
BRAND           226472
BARCODE           4025
dtype: int64 

Duplicate rows: 215
Column 'CATEGORY_1' missing entries: 111
Column 'CATEGORY_2' missing entries: 1424
Column 'CATEGORY_3' missing entries: 60566
Column 'CATEGORY_4' missing entries: 778093
Column 'MANUFACTURER' missing entries: 226474
Column 'BRAND' missing entries: 226472
Column 'BARCODE' missing entries: 4025


In [182]:
# Data-quality overview of the transactions table: shape, null counts,
# duplicates, then a closer look at the two FINAL_* columns.
txn_null_counts = transactions.isnull().sum()  # computed once, reused below

print("\n TRANSACTION_TAKEHOME.csv ")
print(" (rows, columns):", transactions.shape)
print("Missing values:\n", txn_null_counts, "\n")
print("Duplicate rows:", transactions.duplicated().sum())

print("BARCODE missing entries:", txn_null_counts["BARCODE"])

# Both columns are inspected the same way: dtype first, then the raw unique
# values, to reveal any non-numeric tokens mixed into the data.
final_columns = ("FINAL_SALE", "FINAL_QUANTITY")

for final_col in final_columns:
    print(f"Data type of '{final_col}':", transactions[final_col].dtype)

for final_col in final_columns:
    print(f"Unique values in '{final_col}':", transactions[final_col].unique())



 TRANSACTION_TAKEHOME.csv 
 (rows, columns): (50000, 8)
Missing values:
 RECEIPT_ID           0
PURCHASE_DATE        0
SCAN_DATE            0
STORE_NAME           0
USER_ID              0
BARCODE           5762
FINAL_QUANTITY       0
FINAL_SALE           0
dtype: int64 

Duplicate rows: 171
BARCODE missing entries: 5762
Data type of 'FINAL_SALE': object
Data type of 'FINAL_QUANTITY': object
Unique values in 'FINAL_SALE': [' ' '1.49' '3.49' ... '11.02' '20.17' '42.38']
Unique values in 'FINAL_QUANTITY': ['1.00' 'zero' '2.00' '3.00' '4.00' '4.55' '2.83' '2.34' '0.46' '7.00'
 '18.00' '12.00' '5.00' '2.17' '0.23' '8.00' '1.35' '0.09' '2.58' '1.47'
 '16.00' '0.62' '1.24' '1.40' '0.51' '0.53' '1.69' '6.00' '2.39' '2.60'
 '10.00' '0.86' '1.54' '1.88' '2.93' '1.28' '0.65' '2.89' '1.44' '2.75'
 '1.81' '276.00' '0.87' '2.10' '3.33' '2.54' '2.20' '1.93' '1.34' '1.13'
 '2.19' '0.83' '2.61' '0.28' '1.50' '0.97' '0.24' '1.18' '6.22' '1.22'
 '1.23' '2.57' '1.07' '2.11' '0.48' '9.00' '3.11' '1.08' '5

In [183]:
# Data-quality overview of the users table: shape, per-column null counts,
# fully duplicated rows, then one line per demographic field.
user_null_counts = users.isnull().sum()  # computed once, reused below

print("\n========== USER_TAKEHOME.csv ==========")
print(" (rows, columns):", users.shape)
print("Missing values:\n", user_null_counts, "\n")
print("Duplicate rows:", users.duplicated().sum())

user_fields = ["BIRTH_DATE", "STATE", "LANGUAGE", "GENDER"]
# Only the optional profile fields are itemised; ID and CREATED_DATE are not
# in the list above.
for col, missing in user_null_counts[user_fields].items():
    print(f"Column '{col}' missing entries: {missing}")


 (rows, columns): (100000, 6)
Missing values:
 ID                  0
CREATED_DATE        0
BIRTH_DATE       3675
STATE            4812
LANGUAGE        30508
GENDER           5892
dtype: int64 

Duplicate rows: 0
Column 'BIRTH_DATE' missing entries: 3675
Column 'STATE' missing entries: 4812
Column 'LANGUAGE' missing entries: 30508
Column 'GENDER' missing entries: 5892


In [184]:
# Value counts to understand categorical distributions.
# Reads the frames loaded at the top of the notebook (`users`, `transactions`);
# the previous `df_users` / `df_transactions` names were not defined by any
# earlier cell, so this cell failed on a fresh kernel.
# dropna=False keeps NaN as its own bucket so missing values show up in the
# distribution instead of being silently dropped.
print("User Genders:\n", users['GENDER'].value_counts(dropna=False), "\n")
print("User Languages:\n", users['LANGUAGE'].value_counts(dropna=False), "\n")
# Only the five most frequent stores are shown to keep the output short.
print("Transaction Store Names:\n", transactions['STORE_NAME'].value_counts(dropna=False).head(), "\n")


User Genders:
 female                    64240
male                      25829
NaN                        5892
transgender                1772
prefer_not_to_say          1350
non_binary                  473
unknown                     196
not_listed                  180
Non-Binary                   34
not_specified                28
My gender isn't listed        5
Prefer not to say             1
Name: GENDER, dtype: int64 

User Languages:
 en        63403
NaN       30508
es-419     6089
Name: LANGUAGE, dtype: int64 

Transaction Store Names:
 WALMART                 21326
DOLLAR GENERAL STORE     2748
ALDI                     2640
KROGER                   1494
TARGET                   1484
Name: STORE_NAME, dtype: int64 



In [185]:
# List the column names of each loaded table as plain Python lists.
for table_label, table in (
    ("Product", products),
    ("Transaction", transactions),
    ("User", users),
):
    print(f"{table_label} Columns:", table.columns.tolist())


Product Columns: ['CATEGORY_1', 'CATEGORY_2', 'CATEGORY_3', 'CATEGORY_4', 'MANUFACTURER', 'BRAND', 'BARCODE']
Transaction Columns: ['RECEIPT_ID', 'PURCHASE_DATE', 'SCAN_DATE', 'STORE_NAME', 'USER_ID', 'BARCODE', 'FINAL_QUANTITY', 'FINAL_SALE']
User Columns: ['ID', 'CREATED_DATE', 'BIRTH_DATE', 'STATE', 'LANGUAGE', 'GENDER']


In [186]:
# Convert potential date columns.
# `df_users` / `df_transactions` are built here from the raw frames loaded at
# the top of the notebook; no earlier cell defines them. `.assign` returns a
# new frame, so the raw `users` / `transactions` stay untouched and this cell
# can be re-run safely.
# errors='coerce' turns unparseable values into NaT instead of raising, so
# check the NaT counts after running this.
df_users = users.assign(
    CREATED_DATE=pd.to_datetime(users['CREATED_DATE'], errors='coerce'),
    BIRTH_DATE=pd.to_datetime(users['BIRTH_DATE'], errors='coerce'),
)
# NOTE: in the recorded run PURCHASE_DATE parsed as timezone-naive while
# SCAN_DATE parsed as UTC-aware; align them before comparing or subtracting.
df_transactions = transactions.assign(
    PURCHASE_DATE=pd.to_datetime(transactions['PURCHASE_DATE'], errors='coerce'),
    SCAN_DATE=pd.to_datetime(transactions['SCAN_DATE'], errors='coerce'),
)


In [187]:
# Confirm that each converted column now has a datetime dtype.
print("Data types after conversion:")
converted_columns = (
    (df_users, ('CREATED_DATE', 'BIRTH_DATE')),
    (df_transactions, ('PURCHASE_DATE', 'SCAN_DATE')),
)
for frame, date_columns in converted_columns:
    for date_column in date_columns:
        print(f"{date_column}:", frame[date_column].dtype)



Data types after conversion:
CREATED_DATE: datetime64[ns, UTC]
BIRTH_DATE: datetime64[ns, UTC]
PURCHASE_DATE: datetime64[ns]
SCAN_DATE: datetime64[ns, UTC]


In [188]:
# Spot-check a few converted rows from each table.
# A fixed seed makes the sampled rows identical on every Restart & Run All;
# without it the displayed rows change on each execution.
SAMPLE_SEED = 42

print("\n Users - Sample Date Fields:")
print(df_users[['CREATED_DATE', 'BIRTH_DATE']].sample(5, random_state=SAMPLE_SEED))

print("\n Transactions - Sample Date Fields:")
print(df_transactions[['PURCHASE_DATE', 'SCAN_DATE']].sample(5, random_state=SAMPLE_SEED))



 Users - Sample Date Fields:
                   CREATED_DATE                BIRTH_DATE
31936 2020-03-06 23:11:09+00:00 1993-08-21 00:00:00+00:00
63805 2024-03-28 20:16:51+00:00 1985-02-02 07:00:00+00:00
15855 2019-02-16 23:27:16+00:00 1987-02-12 06:00:00+00:00
71507 2021-04-13 21:56:45+00:00 1967-12-30 16:56:57+00:00
53450 2022-01-23 02:45:41+00:00 1984-06-15 05:00:00+00:00

 Transactions - Sample Date Fields:
      PURCHASE_DATE                        SCAN_DATE
3611     2024-07-12 2024-07-17 12:14:06.171000+00:00
33688    2024-07-17 2024-07-28 19:13:43.325000+00:00
11707    2024-06-18 2024-06-21 12:17:35.694000+00:00
40349    2024-07-18 2024-07-31 07:31:20.629000+00:00
46360    2024-08-18 2024-08-18 12:16:35.920000+00:00


In [189]:
# Count NaT values per converted date column.
# The count covers every null in the column, so it includes values that were
# already missing before conversion, not only ones that failed to parse.
print("\n Null (NaT) values after conversion:")
nat_checks = (
    ("CREATED_DATE", df_users),
    ("BIRTH_DATE", df_users),
    ("PURCHASE_DATE", df_transactions),
    ("SCAN_DATE", df_transactions),
)
for date_column, frame in nat_checks:
    print(f"{date_column} invalid entries:", frame[date_column].isnull().sum())



 Null (NaT) values after conversion:
CREATED_DATE invalid entries: 0
BIRTH_DATE invalid entries: 3675
PURCHASE_DATE invalid entries: 0
SCAN_DATE invalid entries: 0


In [190]:
# Show the earliest and latest value of every converted date column, as a
# sanity check on the plausible range of each field.
print("\n Date Ranges:")
date_ranges = (
    ("Users CREATED_DATE", df_users['CREATED_DATE']),
    ("Users BIRTH_DATE", df_users['BIRTH_DATE']),
    ("Transactions PURCHASE_DATE", df_transactions['PURCHASE_DATE']),
    ("Transactions SCAN_DATE", df_transactions['SCAN_DATE']),
)
for range_label, dates in date_ranges:
    print(f"{range_label}:", dates.min(), "to", dates.max())



 Date Ranges:
Users CREATED_DATE: 2014-04-18 23:14:55+00:00 to 2024-09-11 17:59:15+00:00
Users BIRTH_DATE: 1900-01-01 00:00:00+00:00 to 2022-04-03 07:00:00+00:00
Transactions PURCHASE_DATE: 2024-06-12 00:00:00 to 2024-09-08 00:00:00
Transactions SCAN_DATE: 2024-06-12 06:36:34.910000+00:00 to 2024-09-08 23:07:19.836000+00:00
