In [86]:
import pandas as pd
import numpy as np
import glob
import os

## 1. Concatenate all input sales files into a single CSV file and export

In [87]:
# Collect every input CSV. Sort the matches so the concatenated row order is
# reproducible: glob.glob returns paths in arbitrary, OS-dependent order.
files = sorted(glob.glob('./01_sales/input/*.csv'))
if not files:
    # Fail early with a clear message instead of an opaque error further down.
    raise FileNotFoundError("No CSV files found matching './01_sales/input/*.csv'")

# Read each file, then stack the frames with pd.concat. Unlike concatenating the
# raw .values arrays, pd.concat aligns columns by name (not by position) and
# keeps a column's dtype where the files agree. ignore_index=True replaces the
# per-file indices with a fresh 0..n-1 index.
dfs = [pd.read_csv(path) for path in files]
all_data = pd.concat(dfs, ignore_index=True)

## 2. Data cleanup (remove NA rows, duplicated headers)

In [88]:
# Inspect the 'Quantity Ordered' column before any cleanup
quantity_col = all_data['Quantity Ordered']

# Distinct values present in the column
display(quantity_col.unique())
# Number of rows carrying each value
display(quantity_col.value_counts())
# Number of missing (NaN) entries
display(quantity_col.isna().sum())

array(['2', nan, '1', '3', '5', 'Quantity Ordered', '4', '7', '6', '8',
       '9'], dtype=object)

1                   168552
2                    13324
3                     2920
4                      806
Quantity Ordered       355
5                      236
6                       80
7                       24
8                        5
9                        3
Name: Quantity Ordered, dtype: int64

545

In [89]:
# Drop rows in which every column is NaN (rows with only some NaN cells are kept)
all_data = all_data.dropna(axis=0, how='all')
# Remaining NaN entries in 'Quantity Ordered'
display(all_data['Quantity Ordered'].isna().sum())

0

In [90]:
# Remove duplicated header rows, i.e. rows whose 'Quantity Ordered' cell
# contains the text 'Quantity' instead of a quantity value.
headers = all_data.columns  # not used in this cell; kept in case other cells reference it
# na=False treats NaN cells as "not a header row". Without it the mask would
# contain NaN for rows that are only partially empty (dropna(how='all') keeps
# those), and negating / indexing with such a mask raises an error.
is_header_row = all_data['Quantity Ordered'].str.contains('Quantity', na=False)
all_data = all_data[~is_header_row]
display(all_data['Quantity Ordered'].unique())

array(['2', '1', '3', '5', '4', '7', '6', '8', '9'], dtype=object)

## 3. Export clean data to csv file

In [61]:
# Create the output folder if it is missing so the export does not fail
# when the notebook is run in a location where it has not been created yet.
os.makedirs('./01_sales/output', exist_ok=True)
# Write the cleaned data; index=False leaves the row index out of the file.
all_data.to_csv('./01_sales/output/all_data.csv', index=False)

## 4. Convert column data types

In [127]:
# Reload the cleaned export written in the previous section
all_data = pd.read_csv('./01_sales/output/all_data.csv')

# Columns grouped by the type they are converted to
numeric_cols = ['Order ID', 'Quantity Ordered', 'Price Each']
text_cols = ['Product', 'Purchase Address']

# Numeric columns: pd.to_numeric is applied to each column in turn
all_data[numeric_cols] = all_data[numeric_cols].apply(pd.to_numeric)

# Text columns: cast to str (pandas reports these as dtype 'object')
all_data[text_cols] = all_data[text_cols].astype(str)

# Date column: parse into datetime values
all_data['Order Date'] = pd.to_datetime(all_data['Order Date'])

# Show the resulting dtype of every column
display(all_data.dtypes)

Order ID                     int64
Product                     object
Quantity Ordered             int64
Price Each                 float64
Order Date          datetime64[ns]
Purchase Address            object
dtype: object