In [1]:
from datetime import datetime as dt, date
from dateutil.relativedelta import relativedelta


class Person:
    
    def __init__(self, name: str, dob: str) -> None:
        
        self.name = name
        self._dob = dt.strptime(dob, "%Y-%m-%d")
        
        
    def show_my_age(self) -> int:
        
        print(relativedelta(
            date.today(), self._dob
        ).years)
        
p = Person('Duc', '1996-04-02')

p.show_my_age()

26


## Datetime
- Get current date
- Modify current date
- Format date strings
- Get epoch and format epoch from date strings

In [2]:
import time
import pytz
from datetime import datetime, timedelta


# Timezone
TIMEZONE = pytz.timezone('Asia/Ho_Chi_Minh')

# Get now UTC
current_utc = datetime.utcnow()


# Get now in Ho Chi Minh
current_local = datetime.now(TIMEZONE)
print(f"current_local: {current_local}\n")

# Modify now by minus one hour
datetime_now_minus_one_hour = current_local - timedelta(hours=1)
print(f"datetime_now_minus_one_hour: {datetime_now_minus_one_hour}\n")

# Delete decimal from datetime
current_local_reformat = current_local.strftime("%Y-%m-%d %H:%M:%S\n")
print(f"current_local_reformat: {current_local_reformat}")

# Get epoch from datetime
epoch = current_local.timestamp()
print(f"epoch: {epoch}\n")

# Turn epoch to datetime
datetime_from_epoch = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(epoch))
print(f"datetime_from_epoch: {datetime_from_epoch}\n")

current_local: 2022-07-09 14:10:04.224600+07:00

datetime_now_minus_one_hour: 2022-07-09 13:10:04.224600+07:00

current_local_reformat: 2022-07-09 14:10:04

epoch: 1657350604.2246

datetime_from_epoch: 2022-07-09 14:10:04



## JSON
- Load JSON from a file
- Create JSON object from a string
- Dump JSON object to a string
- Loop through the JSON object
- Getting a specific data by value from JSON
- Update a part of JSON
- Write JSON to a file
- JSON validation

In [40]:
import os
import json
from os import path as PATH

# Third-party modules
from pydantic import BaseModel, validator

from typing import Optional
CWD = os.getcwd()

class MyModel(BaseModel):
    
    InvoiceNo: str
    StockCode: int
    Description: str
    Quantity: int
    InvoiceDate: Optional[str] = None
    UnitPrice: float
    CustomerID: int
    Country: str
    
    @validator("InvoiceDate")
    def validate_date_time(cls, v):
        
        try:
            _x = datetime.strptime(v, "%d/%m/%Y %H:%M")
        except:
            print(v)
        
            
            
        return _x
        
with open(PATH.join(CWD, "data/my_json.json")) as f:
    
    e_commerce_data = json.loads(f.read())
    print(e_commerce_data)
objs = map(lambda x: MyModel(**x), e_commerce_data)

[{'InvoiceNo': 536370, 'StockCode': 22492, 'Description': 'MINI PAINT SET VINTAGE', 'Quantity': 36, 'InvoiceDate': '12/1/2010 8:45', 'UnitPrice': 0.65, 'CustomerID': 12583, 'Country': 'France'}, {'InvoiceNo': 536372, 'StockCode': 22632, 'Description': 'HAND WARMER RED POLKA DOT', 'Quantity': 6, 'InvoiceDate': '12/1/2010 9:01', 'UnitPrice': 1.85, 'CustomerID': 17850, 'Country': 'United Kingdom'}]


In [42]:
import os
import pandas as pd

CWD = os.getcwd()

data_file_path = os.path.join(CWD, "data/data.csv")

df = pd.read_csv(data_file_path, encoding='unicode_escape', nrows=1000)

## Basic operators  

In [5]:
# Show columns
print(df.columns)

# show head 10 rows in dv
print(df.head(10))

# Show data type of each column
print(df.dtypes)


# Cast pandas object dtype via dictionary

tmp_dtyp_change_df = df.astype({
    "Quantity": "float"
})
print(tmp_dtyp_change_df.dtypes)

Index(['InvoiceNo', 'StockCode', 'Description', 'Quantity', 'InvoiceDate',
       'UnitPrice', 'CustomerID', 'Country'],
      dtype='object')
  InvoiceNo StockCode                          Description  Quantity  \
0    536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1    536365     71053                  WHITE METAL LANTERN         6   
2    536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3    536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4    536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
5    536365     22752         SET 7 BABUSHKA NESTING BOXES         2   
6    536365     21730    GLASS STAR FROSTED T-LIGHT HOLDER         6   
7    536366     22633               HAND WARMER UNION JACK         6   
8    536366     22632            HAND WARMER RED POLKA DOT         6   
9    536367     84879        ASSORTED COLOUR BIRD ORNAMENT        32   

      InvoiceDate  UnitPrice  CustomerID         Country  
0  12

## Concatenating and merging dataframe

In [6]:
additional_df = pd.DataFrame(e_commerce_data)

# Concatenating
print(f"Original data with {len(df)} rows + additional data with {len(additional_df)} rows = \
      {len(df) + len(additional_df)} rows")
      
concat_df = pd.concat([df, pd.DataFrame(e_commerce_data)], ignore_index=True, verify_integrity=True)
concat_df


# Merging
merged_df = df.merge(additional_df, on=['Country'])
merged_df


# Transform whole values of a column
df["InvoiceDate"] = pd.to_datetime(df["InvoiceDate"])
df.head(100)

# Drop columns
drop_column_df = df.drop(["InvoiceNo"], axis="columns")
drop_column_df = df.drop([1], axis="rows")

drop_column_df

Original data with 1000 rows + additional data with 2 rows =       1002 rows


Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
995,536520,22469,HEART OF WICKER SMALL,1,2010-12-01 12:43:00,1.65,14729.0,United Kingdom
996,536520,22100,SKULLS SQUARE TISSUE BOX,1,2010-12-01 12:43:00,1.25,14729.0,United Kingdom
997,536520,22096,PINK PAISLEY SQUARE TISSUE BOX,1,2010-12-01 12:43:00,1.25,14729.0,United Kingdom
998,536520,22583,PACK OF 6 HANDBAG GIFT BOXES,1,2010-12-01 12:43:00,2.55,14729.0,United Kingdom


## Normalizing, Lambdas, Pivot

In [7]:
df["UnitPrice"] = df["UnitPrice"].apply(lambda x: x * 100)


print(df)
cols_to_normalize = ["Quantity", "UnitPrice"]

def absolute_maximum_scale(series):
    return series / series.abs().max()


for col in cols_to_normalize:
    df[col] = absolute_maximum_scale(df[col])

df

    InvoiceNo StockCode                          Description  Quantity  \
0      536365    85123A   WHITE HANGING HEART T-LIGHT HOLDER         6   
1      536365     71053                  WHITE METAL LANTERN         6   
2      536365    84406B       CREAM CUPID HEARTS COAT HANGER         8   
3      536365    84029G  KNITTED UNION FLAG HOT WATER BOTTLE         6   
4      536365    84029E       RED WOOLLY HOTTIE WHITE HEART.         6   
..        ...       ...                                  ...       ...   
995    536520     22469                HEART OF WICKER SMALL         1   
996    536520     22100             SKULLS SQUARE TISSUE BOX         1   
997    536520     22096      PINK PAISLEY SQUARE TISSUE BOX          1   
998    536520     22583         PACK OF 6 HANDBAG GIFT BOXES         1   
999    536520     21358           TOAST ITS - HAPPY BIRTHDAY         2   

            InvoiceDate  UnitPrice  CustomerID         Country  
0   2010-12-01 08:26:00      255.0     17850.0

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,0.010000,2010-12-01 08:26:00,0.015455,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,0.010000,2010-12-01 08:26:00,0.020545,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,0.013333,2010-12-01 08:26:00,0.016667,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,0.010000,2010-12-01 08:26:00,0.020545,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,0.010000,2010-12-01 08:26:00,0.020545,17850.0,United Kingdom
...,...,...,...,...,...,...,...,...
995,536520,22469,HEART OF WICKER SMALL,0.001667,2010-12-01 12:43:00,0.010000,14729.0,United Kingdom
996,536520,22100,SKULLS SQUARE TISSUE BOX,0.001667,2010-12-01 12:43:00,0.007576,14729.0,United Kingdom
997,536520,22096,PINK PAISLEY SQUARE TISSUE BOX,0.001667,2010-12-01 12:43:00,0.007576,14729.0,United Kingdom
998,536520,22583,PACK OF 6 HANDBAG GIFT BOXES,0.001667,2010-12-01 12:43:00,0.015455,14729.0,United Kingdom


In [43]:
import math
df["unique_id"] = df["InvoiceNo"].astype("str") + df["StockCode"] + df["CustomerID"].astype("str")
df.drop_duplicates(subset=["unique_id"], inplace=True)
pivot_df = df\
            .filter(
                items=["unique_id", "UnitPrice", "Country"]
            ).pivot(
                index="unique_id",
                columns=["Country"],
                values="UnitPrice"
            ).reset_index()


pivot_df.to_parquet("data/ecommerce_pivot.parquet.gzip", compression="gzip")


In [9]:
parquet_df = pd.read_parquet("ecommerce_pivot.parquet.gzip")

parquet_df

Country,unique_id,Australia,France,Netherlands,United Kingdom
0,5363652173017850.0,,,,0.025758
1,5363652275217850.0,,,,0.046364
2,5363657105317850.0,,,,0.020545
3,53636584029E17850.0,,,,0.020545
4,53636584029G17850.0,,,,0.020545
...,...,...,...,...,...
940,C5363912198417548.0,,,,0.001758
941,C5363912255317548.0,,,,0.010000
942,C5363912255617548.0,,,,0.010000
943,C5363912255717548.0,,,,0.010000


## JSON Normalize and Melting

In [15]:
df.melt(
    id_vars=["unique_id"],
    value_vars=["Country"],
    var_name="my_var",
    value_name="my_values"
)

Unnamed: 0,unique_id,my_var,my_values
0,53636585123A17850.0,Country,United Kingdom
1,5363657105317850.0,Country,United Kingdom
2,53636584406B17850.0,Country,United Kingdom
3,53636584029G17850.0,Country,United Kingdom
4,53636584029E17850.0,Country,United Kingdom
...,...,...,...
940,5365202209214729.0,Country,United Kingdom
941,5365202246914729.0,Country,United Kingdom
942,5365202210014729.0,Country,United Kingdom
943,5365202209614729.0,Country,United Kingdom


In [35]:
nested_json =  [{
 		"id": "193x12",
 		"quantity": 10,
 		"item": {
 			"sku": "112x",
 			"in_stock": True,
            "attr": {
                "color": "red"
            }
 		},
        "vouchers": [
            {
                "name": "a"
            },
            {
                "name": "b"
            }
        ]
 	},
 	{
 		"id": "193x13",
 		"quantity": 10,
 		"item": {
 			"sku": "112x",
 			"in_stock": True
 		}
 	}
 ]

json_normalize_df = pd.json_normalize(nested_json)
json_normalize_df.explode(column="vouchers")

Unnamed: 0,id,quantity,vouchers,item.sku,item.in_stock,item.attr.color
0,193x12,10,{'name': 'a'},112x,True,red
0,193x12,10,{'name': 'b'},112x,True,red
1,193x13,10,,112x,True,


## Numpy

In [38]:
df["Quantity"].to_numpy().mean()

0.022246913580246913