# Clean the data 

### Import libraries

In [1]:
import pandas as pd
import numpy as np
# Visible marker that the import cell ran to completion.
status_message = "Import Completed"
print(status_message)

Import Completed


### Import the data 

In [2]:
# Load the raw dataset from the CSV file, resolved relative to the
# current working directory.
raw_csv_path = r"./data_ecommerce.csv"
df = pd.read_csv(raw_csv_path)

### Read the data

**1. Review first 5 rows**

In [3]:
# Preview the first five rows (five is also DataFrame.head's default).
first_rows = df.head(n=5)
print(first_rows)

   Seller ID Seller's Main Category Sign-up Time Activation Time 1st Listing  \
0          1             Electronic    9/20/2017        2/2/2018    2/7/2018   
1          2                   BBFF    8/19/2017       5/24/2018   8/13/2018   
2          3                   BBFF    12/4/2017        5/4/2018   10/6/2018   
3          4                   BBFF   10/16/2017       4/23/2018    7/4/2018   
4          5                   BBFF   12/13/2017       7/25/2018   8/30/2018   

  1st Salable 1st Transaction  
0   10/6/2018      10/10/2018  
1   8/14/2018        9/2/2018  
2  10/10/2018      10/12/2018  
3   7/10/2018       7/10/2018  
4   8/31/2018       9/14/2018  


**2. View basic info**

In [4]:
# Dimensions of the frame as (rows, columns).
print(df.shape)
# DataFrame.info() writes its summary to stdout itself and returns None,
# so it is called directly; wrapping it in print() emitted a stray "None"
# line after the summary.
df.info()

(2145, 7)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2145 entries, 0 to 2144
Data columns (total 7 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   Seller ID               2145 non-null   int64 
 1   Seller's Main Category  2145 non-null   object
 2   Sign-up Time            2145 non-null   object
 3   Activation Time         2145 non-null   object
 4   1st Listing             2145 non-null   object
 5   1st Salable             2145 non-null   object
 6   1st Transaction         2145 non-null   object
dtypes: int64(1), object(6)
memory usage: 117.4+ KB
None


**3. Check for missing values**

In [5]:
# Number of missing entries in each column (isna is the same check as isnull).
missing_per_column = df.isna().sum()
print(missing_per_column)

Seller ID                 0
Seller's Main Category    0
Sign-up Time              0
Activation Time           0
1st Listing               0
1st Salable               0
1st Transaction           0
dtype: int64


**4. Check for duplicate rows**

In [6]:
# Boolean mask flagging every row that exactly repeats an earlier row
# (the first occurrence itself is not flagged).
dup = df.duplicated(keep="first")
# Keep only the flagged rows; an empty frame means no duplicates were found.
duprow = df.loc[dup]
print(duprow)

Empty DataFrame
Columns: [Seller ID, Seller's Main Category, Sign-up Time, Activation Time, 1st Listing, 1st Salable, 1st Transaction]
Index: []


**5. View detailed information**

- Check for any strange values in "Seller's Main Category" column

In [7]:
# List the distinct category labels so typos or unexpected values stand out.
category_labels = df["Seller's Main Category"].unique()
print(category_labels)

['Electronic' 'BBFF' 'LifeStyle' 'Digital Service' 'Book']


- Count the number of rows for each unique value in the "Seller's Main Category" column

In [8]:
# Number of rows carrying each category label, most frequent first.
category_counts = df["Seller's Main Category"].value_counts()
print(category_counts)

BBFF               1130
LifeStyle           501
Electronic          262
Book                130
Digital Service     122
Name: Seller's Main Category, dtype: int64


- Re-format the date strings in the 5 date columns ("Sign-up Time", "Activation Time", "1st Listing", "1st Salable", "1st Transaction") by replacing the "/" separator with "-"

In [9]:
# Replace "/" with "-" in every date column (e.g. "9/20/2017" -> "9-20-2017").
# The five date columns are addressed by position (2..6), as before, but the
# work is done column-wise on whatever rows the frame has, so the row count
# is no longer hard-coded.
date_columns = df.columns[2:7]
df[date_columns] = df[date_columns].apply(
    # regex=False: "/" is treated as a literal character, not a pattern.
    lambda column: column.str.replace("/", "-", regex=False)
)

- Check whether there are any invalid dates (`len(pos) == 0` &rarr; no invalid dates)

In [10]:
# Collect the [row, column] positions of cells that are not real calendar
# dates. Values are expected to be "month-day-year" strings with "-" as the
# separator (the form produced by the re-format step above).
#
# Parsing with an explicit format lets the calendar rules do the checking:
# month outside 1-12, day 0, day 31 in a 30-day month, and Feb 29 in a
# non-leap year are all rejected, while Feb 29 in a leap year is accepted.
# NOTE(review): errors="coerce" also rejects anything that does not match the
# format (e.g. a 2-digit year) and years outside pandas' Timestamp range;
# the sample rows shown use 4-digit years -- confirm that holds for all rows.
date_columns = df.columns[2:7]
parsed_dates = df[date_columns].apply(
    lambda column: pd.to_datetime(column, format="%m-%d-%Y", errors="coerce")
)

# np.argwhere walks the mask row by row, so `pos` keeps the same ordering as
# a nested row/column loop. The +2 maps the offset inside the date block back
# to the column position in `df`.
pos = [
    [int(row), int(col) + 2]
    for row, col in np.argwhere(parsed_dates.isna().to_numpy())
]
print(len(pos))

0


### Export the data

In [11]:
# Write the cleaned frame to a new CSV; index=False leaves out the row index.
clean_csv_path = "./dataEcommerceClean.csv"
df.to_csv(clean_csv_path, index=False)