In [2]:
import numpy as np
import pandas as pd

In [3]:
# read employees_astyle.csv data (converted data type)
emp_df = pd.read_csv('data/employees_astyle.csv')
emp_df.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17 00:00:00,AD_PRES,24000,0.0,0,90
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21 00:00:00,AD_VP,17000,0.0,100,90
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13 00:00:00,AD_VP,17000,0.0,100,90
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03 00:00:00,IT_PROG,9000,0.0,102,60
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21 00:00:00,IT_PROG,6000,0.0,103,60


In [4]:
emp_df.dtypes

EMPLOYEE_ID         int64
FIRST_NAME         object
LAST_NAME          object
EMAIL              object
PHONE_NUMBER       object
HIRE_DATE          object
JOB_ID             object
SALARY              int64
COMMISSION_PCT    float64
MANAGER_ID          int64
DEPARTMENT_ID       int64
dtype: object

In [5]:
# Convert time-sensitive data to DateTime data
emp_df['HIRE_DATE'] = pd.to_datetime(emp_df['HIRE_DATE'])
emp_df.head()

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID
0,100,Steven,King,SKING,515.123.4567,2003-06-17,AD_PRES,24000,0.0,0,90
1,101,Neena,Kochhar,NKOCHHAR,515.123.4568,2005-09-21,AD_VP,17000,0.0,100,90
2,102,Lex,De Haan,LDEHAAN,515.123.4569,2001-01-13,AD_VP,17000,0.0,100,90
3,103,Alexander,Hunold,AHUNOLD,590.423.4567,2006-01-03,IT_PROG,9000,0.0,102,60
4,104,Bruce,Ernst,BERNST,590.423.4568,2007-05-21,IT_PROG,6000,0.0,103,60


In [6]:
emp_df.dtypes

EMPLOYEE_ID                int64
FIRST_NAME                object
LAST_NAME                 object
EMAIL                     object
PHONE_NUMBER              object
HIRE_DATE         datetime64[ns]
JOB_ID                    object
SALARY                     int64
COMMISSION_PCT           float64
MANAGER_ID                 int64
DEPARTMENT_ID              int64
dtype: object

In [7]:
# Extract components of time data
emp_df['HIRE_YEAR'] = emp_df['HIRE_DATE'].dt.year
emp_df['HIRE_MONTH'] = emp_df['HIRE_DATE'].dt.month
emp_df['HIRE_DAY'] = emp_df['HIRE_DATE'].dt.day
emp_df[['HIRE_DATE', 'HIRE_YEAR', 'HIRE_MONTH', 'HIRE_DAY']].head()

Unnamed: 0,HIRE_DATE,HIRE_YEAR,HIRE_MONTH,HIRE_DAY
0,2003-06-17,2003,6,17
1,2005-09-21,2005,9,21
2,2001-01-13,2001,1,13
3,2006-01-03,2006,1,3
4,2007-05-21,2007,5,21


In [8]:
emp_df['HIRE_DATE'].dt.isocalendar().week

0      25
1      38
2       2
3       1
4      21
       ..
102    33
103    23
104    23
105    23
106    23
Name: week, Length: 107, dtype: UInt32

In [9]:
emp_df['HIRE_DATE'].dt.dayofweek

0      1
1      2
2      5
3      1
4      0
      ..
102    2
103    4
104    4
105    4
106    4
Name: HIRE_DATE, Length: 107, dtype: int64

In [10]:
emp_df['HIRE_DATE'].dt.day_name()

0        Tuesday
1      Wednesday
2       Saturday
3        Tuesday
4         Monday
         ...    
102    Wednesday
103       Friday
104       Friday
105       Friday
106       Friday
Name: HIRE_DATE, Length: 107, dtype: object

In [11]:
emp_df['HIRE_DATE'].dt.is_leap_year

0      False
1      False
2      False
3      False
4      False
       ...  
102    False
103    False
104    False
105    False
106    False
Name: HIRE_DATE, Length: 107, dtype: bool

In [12]:
emp_df[emp_df['HIRE_DATE'].dt.is_leap_year]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,HIRE_YEAR,HIRE_MONTH,HIRE_DAY
20,120,Matthew,Weiss,MWEISS,650.123.1234,2004-07-18,ST_MAN,8000,0.0,100,50,2004,7,18
28,128,Steven,Markle,SMARKLE,650.124.1434,2008-03-08,ST_CLERK,2200,0.0,120,50,2008,3,8
33,133,Jason,Mallin,JMALLIN,650.127.1934,2004-06-14,ST_CLERK,3300,0.0,122,50,2004,6,14
36,136,Hazel,Philtanker,HPHILTAN,650.127.1634,2008-02-06,ST_CLERK,2200,0.0,122,50,2008,2,6
45,145,John,Russell,JRUSSEL,011.44.1344.429268,2004-10-01,SA_MAN,14000,0.4,100,80,2004,10,1
49,149,Eleni,Zlotkey,EZLOTKEY,011.44.1344.429018,2008-01-29,SA_MAN,10500,0.2,100,80,2008,1,29
56,156,Janette,King,JKING,011.44.1345.429268,2004-01-30,SA_REP,10000,0.35,146,80,2004,1,30
57,157,Patrick,Sully,PSULLY,011.44.1345.929268,2004-03-04,SA_REP,9500,0.35,146,80,2004,3,4
58,158,Allan,McEwen,AMCEWEN,011.44.1345.829268,2004-08-01,SA_REP,9000,0.35,146,80,2004,8,1
64,164,Mattea,Marvins,MMARVINS,011.44.1346.329268,2008-01-24,SA_REP,7200,0.1,147,80,2008,1,24


In [13]:
ser = pd.Series(['2016-16-10 20:30:0', 
                '2016-27-1 19:45:30', 
                '2013-10-12 4:5:1'])
ser = pd.to_datetime(ser, format="%Y-%d-%m %H:%M:%S")
ser

0   2016-10-16 20:30:00
1   2016-01-27 19:45:30
2   2013-12-10 04:05:01
dtype: datetime64[ns]

In [14]:
ser.dt.date

0    2016-10-16
1    2016-01-27
2    2013-12-10
dtype: object

In [15]:
# Format
ser = ser.dt.strftime("%d/%m/%Y")
ser

0    16/10/2016
1    27/01/2016
2    10/12/2013
dtype: object

In [None]:
# Distance between timestamps: timedelta()

In [16]:
thu_vien = pd.DataFrame({'ngay_muon': ['2023-01-03', '2023-02-06', '2023-02-28']})
thu_vien

Unnamed: 0,ngay_muon
0,2023-01-03
1,2023-02-06
2,2023-02-28


In [17]:
thu_vien.ngay_muon = pd.to_datetime(thu_vien.ngay_muon)
thu_vien

Unnamed: 0,ngay_muon
0,2023-01-03
1,2023-02-06
2,2023-02-28


In [18]:
from datetime import timedelta

In [19]:
thu_vien['ngay_het_han'] = thu_vien.ngay_muon + timedelta(days = 10)
thu_vien

Unnamed: 0,ngay_muon,ngay_het_han
0,2023-01-03,2023-01-13
1,2023-02-06,2023-02-16
2,2023-02-28,2023-03-10


In [20]:
thu_vien['ngay_tra'] = pd.to_datetime(pd.Series(['2023-01-10', '2023-02-17', '2023-02-28']))
thu_vien

Unnamed: 0,ngay_muon,ngay_het_han,ngay_tra
0,2023-01-03,2023-01-13,2023-01-10
1,2023-02-06,2023-02-16,2023-02-17
2,2023-02-28,2023-03-10,2023-02-28


In [21]:
thu_vien['tra_cham'] = thu_vien.ngay_tra > thu_vien.ngay_het_han
thu_vien

Unnamed: 0,ngay_muon,ngay_het_han,ngay_tra,tra_cham
0,2023-01-03,2023-01-13,2023-01-10,False
1,2023-02-06,2023-02-16,2023-02-17,True
2,2023-02-28,2023-03-10,2023-02-28,False


In [None]:
# Convert Timestamp to datetime

In [22]:
ratings = pd.read_excel('data/movies.xlsx', 'ratings')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [23]:
ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [24]:
# Create column 'parsed_time' is the data of column 'timestamp' converted to datetime
ratings['parsed_time'] = pd.to_datetime(ratings.timestamp, unit='s')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp,parsed_time
0,1,31,2.5,1260759144,2009-12-14 02:52:24
1,1,1029,3.0,1260759179,2009-12-14 02:52:59
2,1,1061,3.0,1260759182,2009-12-14 02:53:02
3,1,1129,2.0,1260759185,2009-12-14 02:53:05
4,1,1172,4.0,1260759205,2009-12-14 02:53:25


In [25]:
ratings.dtypes

userId                  int64
movieId                 int64
rating                float64
timestamp               int64
parsed_time    datetime64[ns]
dtype: object

In [26]:
# Employees hired in June 2006
emp_df[(emp_df['HIRE_DATE'] >= '2006-06-01') & (emp_df['HIRE_DATE'] <= '2006-06-30')]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,HIRE_YEAR,HIRE_MONTH,HIRE_DAY
86,186,Julia,Dellinger,JDELLING,650.509.3876,2006-06-24,SH_CLERK,3400,0.0,121,50,2006,6,24


In [27]:
# Employees hired in the last 6 months of 2006
emp_df[(emp_df['HIRE_DATE'] >= '2006-10') & (emp_df['HIRE_DATE'] < '2007')]

Unnamed: 0,EMPLOYEE_ID,FIRST_NAME,LAST_NAME,EMAIL,PHONE_NUMBER,HIRE_DATE,JOB_ID,SALARY,COMMISSION_PCT,MANAGER_ID,DEPARTMENT_ID,HIRE_YEAR,HIRE_MONTH,HIRE_DAY
18,118,Guy,Himuro,GHIMURO,515.127.4565,2006-11-15,PU_CLERK,2600,0.0,114,30,2006,11,15
54,154,Nanette,Cambrault,NCAMBRAU,011.44.1344.987668,2006-12-09,SA_REP,7500,0.2,145,80,2006,12,9
61,161,Sarath,Sewall,SSEWALL,011.44.1345.529268,2006-11-03,SA_REP,7000,0.25,146,80,2006,11,3
