In [1]:
import pandas as pd

In [2]:
# Load dataset
df = pd.read_csv(r"C:\Users\kche8\OneDrive\Desktop\Dataset\Walmart.csv")

In [3]:
df.head()

Unnamed: 0,invoice_id,Branch,City,category,unit_price,quantity,date,time,payment_method,rating,profit_margin
0,1,WALM003,San Antonio,Health and beauty,$74.69,7.0,05/01/19,13:08:00,Ewallet,9.1,0.48
1,2,WALM048,Harlingen,Electronic accessories,$15.28,5.0,08/03/19,10:29:00,Cash,9.6,0.48
2,3,WALM067,Haltom City,Home and lifestyle,$46.33,7.0,03/03/19,13:23:00,Credit card,7.4,0.33
3,4,WALM064,Bedford,Health and beauty,$58.22,8.0,27/01/19,20:33:00,Ewallet,8.4,0.33
4,5,WALM013,Irving,Sports and travel,$86.31,7.0,08/02/19,10:37:00,Ewallet,5.3,0.48


In [4]:
df.describe()

Unnamed: 0,invoice_id,quantity,rating,profit_margin
count,10051.0,10020.0,10051.0,10051.0
mean,5025.74122,2.353493,5.825659,0.393791
std,2901.174372,1.602658,1.763991,0.090669
min,1.0,1.0,3.0,0.18
25%,2513.5,1.0,4.0,0.33
50%,5026.0,2.0,6.0,0.33
75%,7538.5,3.0,7.0,0.48
max,10000.0,10.0,10.0,0.57


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10051 entries, 0 to 10050
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   invoice_id      10051 non-null  int64  
 1   Branch          10051 non-null  object 
 2   City            10051 non-null  object 
 3   category        10051 non-null  object 
 4   unit_price      10020 non-null  object 
 5   quantity        10020 non-null  float64
 6   date            10051 non-null  object 
 7   time            10051 non-null  object 
 8   payment_method  10051 non-null  object 
 9   rating          10051 non-null  float64
 10  profit_margin   10051 non-null  float64
dtypes: float64(3), int64(1), object(7)
memory usage: 863.9+ KB


In [6]:
df.duplicated().sum()

51

In [7]:
df.isnull().sum()

invoice_id         0
Branch             0
City               0
category           0
unit_price        31
quantity          31
date               0
time               0
payment_method     0
rating             0
profit_margin      0
dtype: int64

In [8]:
# Standardize column names: lowercase, underscores, strip spaces
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')
print(df.columns)

Index(['invoice_id', 'branch', 'city', 'category', 'unit_price', 'quantity',
       'date', 'time', 'payment_method', 'rating', 'profit_margin'],
      dtype='object')


In [9]:
# Standardize text fields
text_cols = ['city', 'branch', 'category', 'payment_method']
for col in text_cols:
    if col in df.columns:
        df[col] = df[col].str.lower().str.strip()
    else:
        print(f"Column '{col}' not found.")

In [10]:
# Convert date column
df['date'] = pd.to_datetime(
    df['date'].astype(str).str.strip(),
    format='%d/%m/%y',
    errors='coerce'
)

In [11]:
print(df.head(10))

   invoice_id   branch         city                category unit_price  \
0           1  walm003  san antonio       health and beauty     $74.69   
1           2  walm048    harlingen  electronic accessories     $15.28   
2           3  walm067  haltom city      home and lifestyle     $46.33   
3           4  walm064      bedford       health and beauty     $58.22   
4           5  walm013       irving       sports and travel     $86.31   
5           6  walm026       denton  electronic accessories     $85.39   
6           7  walm088     cleburne  electronic accessories     $68.84   
7           8  walm100       canyon      home and lifestyle     $73.56   
8           9  walm066    grapevine       health and beauty     $36.26   
9          10  walm065   texas city      food and beverages     $54.84   

   quantity       date      time payment_method  rating  profit_margin  
0       7.0 2019-01-05  13:08:00        ewallet     9.1           0.48  
1       5.0 2019-03-08  10:29:00       

In [12]:
# Remove duplicates
before = df.shape[0]
df = df.drop_duplicates()
after = df.shape[0]
print(before - after)

51


In [13]:
# Check for any duplicate rows
print(df.duplicated().sum())
duplicates = df[df.duplicated()]
print(duplicates)


0
Empty DataFrame
Columns: [invoice_id, branch, city, category, unit_price, quantity, date, time, payment_method, rating, profit_margin]
Index: []


In [14]:
# Remove dollar signs, commas, and any spaces
df['unit_price'] = df['unit_price'].astype(str).str.replace(r'[\$,]', '', regex=True).str.strip()

In [15]:
# Convert numeric fields
num_cols = ['quantity', 'unit_price', 'total']
for col in num_cols:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    else:
        print(col)

total


In [16]:
# Compute total
if {'quantity', 'unit_price'}.issubset(df.columns):
    df['total'] = df['quantity'] * df['unit_price']
    
df[['quantity', 'unit_price', 'total']].head(10)

Unnamed: 0,quantity,unit_price,total
0,7.0,74.69,522.83
1,5.0,15.28,76.4
2,7.0,46.33,324.31
3,8.0,58.22,465.76
4,7.0,86.31,604.17
5,7.0,85.39,597.73
6,6.0,68.84,413.04
7,10.0,73.56,735.6
8,2.0,36.26,72.52
9,3.0,54.84,164.52


In [17]:
# Final validation
print(df.info())
print(df.isnull().sum())
print(df.head())

<class 'pandas.core.frame.DataFrame'>
Index: 10000 entries, 0 to 9999
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   invoice_id      10000 non-null  int64         
 1   branch          10000 non-null  object        
 2   city            10000 non-null  object        
 3   category        10000 non-null  object        
 4   unit_price      10000 non-null  float64       
 5   quantity        10000 non-null  float64       
 6   date            10000 non-null  datetime64[ns]
 7   time            10000 non-null  object        
 8   payment_method  10000 non-null  object        
 9   rating          10000 non-null  float64       
 10  profit_margin   10000 non-null  float64       
 11  total           10000 non-null  float64       
dtypes: datetime64[ns](1), float64(5), int64(1), object(5)
memory usage: 1015.6+ KB
None
invoice_id        0
branch            0
city              0
category          0


In [18]:
df['branch'].unique()

array(['walm003', 'walm048', 'walm067', 'walm064', 'walm013', 'walm026',
       'walm088', 'walm100', 'walm066', 'walm065', 'walm035', 'walm027',
       'walm031', 'walm008', 'walm029', 'walm061', 'walm053', 'walm044',
       'walm083', 'walm023', 'walm001', 'walm072', 'walm046', 'walm012',
       'walm075', 'walm076', 'walm098', 'walm021', 'walm096', 'walm022',
       'walm095', 'walm041', 'walm070', 'walm030', 'walm033', 'walm079',
       'walm042', 'walm036', 'walm056', 'walm058', 'walm015', 'walm063',
       'walm089', 'walm093', 'walm045', 'walm005', 'walm078', 'walm037',
       'walm069', 'walm087', 'walm020', 'walm007', 'walm017', 'walm094',
       'walm071', 'walm090', 'walm097', 'walm043', 'walm011', 'walm049',
       'walm040', 'walm034', 'walm014', 'walm004', 'walm057', 'walm055',
       'walm038', 'walm025', 'walm024', 'walm085', 'walm016', 'walm018',
       'walm068', 'walm002', 'walm062', 'walm052', 'walm047', 'walm054',
       'walm099', 'walm074', 'walm028', 'walm019', 

In [19]:
df['category'].unique()

array(['health and beauty', 'electronic accessories',
       'home and lifestyle', 'sports and travel', 'food and beverages',
       'fashion accessories'], dtype=object)

In [20]:
df.to_csv("walmart_cleaned.csv", index=False)
df.to_excel("walmart_cleaned.xlsx", index=False)