In [None]:
import pandas as pd
import numpy as np

# Load the raw delivery data set and preview the first rows.
raw_data_path = "../01_data/01_raw/amazon_delivery_raw.csv"
amazon_raw = pd.read_csv(raw_data_path)
amazon_raw.head()

In [14]:
# Drop the columns that are not needed for the rest of the preprocessing.
unused_columns = ["Order_ID", "Order_Date"]
df = amazon_raw.drop(columns=unused_columns)
df.head()

Unnamed: 0,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category
0,37,4.9,22.745049,75.892471,22.765049,75.912471,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing
1,34,4.5,12.913041,77.683237,13.043041,77.813237,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics
2,23,4.4,12.914264,77.6784,12.924264,77.6884,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports
3,38,4.7,11.003669,76.976494,11.053669,77.026494,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics
4,32,4.6,12.972793,80.249982,13.012793,80.289982,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys


In [15]:
# Haversine great-circle distance between two points given as latitude/longitude.
def haversine(lat1, lon1, lat2, lon2, radius=6371):
    """Return the great-circle distance between two latitude/longitude points.

    Args:
        lat1, lon1: Latitude and longitude of the first point, in degrees.
        lat2, lon2: Latitude and longitude of the second point, in degrees.
        radius: Sphere radius; the result is expressed in the same unit.
            Defaults to 6371 (the Earth radius used by the original code).

    Returns:
        ``radius * theta`` where ``theta`` is the central angle in radians.
        Only NumPy ufuncs are used, so scalars and array-likes both work and
        the result follows NumPy broadcasting.
    """
    # sin/cos expect radians, so convert every coordinate from degrees first.
    lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])

    # Differences in latitude and longitude between the two points.
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Intermediate haversine term: a = sin^2(theta / 2).
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2

    # Rounding can push `a` a few ulps above 1 for (near-)antipodal points, which
    # would make arcsin return NaN. Cap it at 1; np.minimum still propagates NaN
    # coming from missing inputs.
    a = np.minimum(a, 1.0)

    # a = sin^2(theta / 2)  =>  theta = 2 * arcsin(sqrt(a))
    theta = 2 * np.arcsin(np.sqrt(a))

    # distance = radius * central angle
    return radius * theta
    
# Compute the store-to-drop-off distance of every row with the haversine formula.
# haversine() is built only from NumPy ufuncs, so the four coordinate columns can be
# passed as whole Series in a single call instead of going row by row through
# df.apply(..., axis=1); the result is a Series aligned on df's index.
df["Distance"] = haversine(
    df["Store_Latitude"],
    df["Store_Longitude"],
    df["Drop_Latitude"],
    df["Drop_Longitude"],
)
df.head()

Unnamed: 0,Agent_Age,Agent_Rating,Store_Latitude,Store_Longitude,Drop_Latitude,Drop_Longitude,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Distance
0,37,4.9,22.745049,75.892471,22.765049,75.912471,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing,3.025149
1,34,4.5,12.913041,77.683237,13.043041,77.813237,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics,20.18353
2,23,4.4,12.914264,77.6784,12.924264,77.6884,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports,1.552758
3,38,4.7,11.003669,76.976494,11.053669,77.026494,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,7.790401
4,32,4.6,12.972793,80.249982,13.012793,80.289982,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys,6.210138


In [16]:
# The distance has been computed, so the raw latitude/longitude columns can go.
coordinate_columns = [
    "Store_Latitude",
    "Store_Longitude",
    "Drop_Latitude",
    "Drop_Longitude",
]
df = df.drop(columns=coordinate_columns)
df.head()

Unnamed: 0,Agent_Age,Agent_Rating,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Distance
0,37,4.9,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing,3.025149
1,34,4.5,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics,20.18353
2,23,4.4,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports,1.552758
3,38,4.7,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,7.790401
4,32,4.6,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys,6.210138


In [17]:
# Parse the order / pickup time strings (HH:MM:SS) into datetimes.
# Values that cannot be parsed become NaT because of errors="coerce".
for event in ("Order", "Pickup"):
    df[f"{event}_Time_dt"] = pd.to_datetime(
        df[f"{event}_Time"], format="%H:%M:%S", errors="coerce"
    )

# Express each time of day as minutes since midnight (hour * 60 + minute, 0..1439).
for event in ("Order", "Pickup"):
    parsed = df[f"{event}_Time_dt"]
    df[f"{event}_Minute"] = parsed.dt.hour * 60 + parsed.dt.minute

# Waiting time until pickup = pickup minute - order minute.
wait_minutes = df["Pickup_Minute"] - df["Order_Minute"]

# A negative difference is treated as a pickup after midnight on the next day,
# so one full day (1440 minutes) is added to it; NaN stays NaN either way.
df["Pickup_Wait_Time"] = wait_minutes.where(wait_minutes >= 0, wait_minutes + 1440)
df.head()

Unnamed: 0,Agent_Age,Agent_Rating,Order_Time,Pickup_Time,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Distance,Order_Time_dt,Pickup_Time_dt,Order_Minute,Pickup_Minute,Pickup_Wait_Time
0,37,4.9,11:30:00,11:45:00,Sunny,High,motorcycle,Urban,120,Clothing,3.025149,1900-01-01 11:30:00,1900-01-01 11:45:00,690.0,705,15.0
1,34,4.5,19:45:00,19:50:00,Stormy,Jam,scooter,Metropolitian,165,Electronics,20.18353,1900-01-01 19:45:00,1900-01-01 19:50:00,1185.0,1190,5.0
2,23,4.4,08:30:00,08:45:00,Sandstorms,Low,motorcycle,Urban,130,Sports,1.552758,1900-01-01 08:30:00,1900-01-01 08:45:00,510.0,525,15.0
3,38,4.7,18:00:00,18:10:00,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,7.790401,1900-01-01 18:00:00,1900-01-01 18:10:00,1080.0,1090,10.0
4,32,4.6,13:30:00,13:45:00,Cloudy,High,scooter,Metropolitian,150,Toys,6.210138,1900-01-01 13:30:00,1900-01-01 13:45:00,810.0,825,15.0


In [18]:
# Pickup_Wait_Time has been derived, so drop the raw time strings together with
# the intermediate datetime and minute-of-day columns used to build it.
time_helper_columns = [
    "Order_Time",
    "Pickup_Time",
    "Order_Time_dt",
    "Pickup_Time_dt",
    "Order_Minute",
    "Pickup_Minute",
]
df = df.drop(columns=time_helper_columns)
df.head()

Unnamed: 0,Agent_Age,Agent_Rating,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Distance,Pickup_Wait_Time
0,37,4.9,Sunny,High,motorcycle,Urban,120,Clothing,3.025149,15.0
1,34,4.5,Stormy,Jam,scooter,Metropolitian,165,Electronics,20.18353,5.0
2,23,4.4,Sandstorms,Low,motorcycle,Urban,130,Sports,1.552758,15.0
3,38,4.7,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,7.790401,10.0
4,32,4.6,Cloudy,High,scooter,Metropolitian,150,Toys,6.210138,15.0


In [19]:
# Tidy the categorical columns by trimming surrounding whitespace, so the same
# category is not treated as a different value because of stray spaces.
# astype(str) on its own would turn a missing value into the literal string "nan",
# which dropna() cannot detect, so missing entries are restored to NaN afterwards.
categorical_columns = ["Traffic", "Weather", "Vehicle", "Area", "Category"]
for column in categorical_columns:
    is_missing = df[column].isna()
    df[column] = df[column].astype(str).str.strip().mask(is_missing)

# Keep only rows with a rating of at most 5, an agent age of at least 18 and a
# distance below 50. A NaN in any of these columns compares as False, so such rows
# are removed as well (same result as applying the three filters one after another).
valid_rows = (
    (df["Agent_Rating"] <= 5)
    & (df["Agent_Age"] >= 18)
    & (df["Distance"] < 50)
)
df = df[valid_rows]

df["Distance"].describe()

count    43443.000000
mean         9.732966
std          5.604745
min          1.465067
25%          4.663412
50%          9.220148
75%         13.681416
max         20.969489
Name: Distance, dtype: float64

In [20]:
def _one_hot_without_baseline(values, prefix, baseline):
    """One-hot encode *values* and drop the dummy column of the baseline category.

    Keeping every category would make the dummy columns of each row always sum to 1,
    so one column would be fully determined by the others (perfect multicollinearity).
    """
    encoded = pd.get_dummies(values, prefix=prefix)
    return encoded.drop(columns=[f"{prefix}_{baseline}"])


# Vehicle type, baseline category: motorcycle.
vehicle_dummies = _one_hot_without_baseline(df["Vehicle"], "Vehicle", "motorcycle")

# Area type, baseline category: Urban.
area_dummies = _one_hot_without_baseline(df["Area"], "Area", "Urban")

# Item category, baseline category: Clothing.
category_dummies = _one_hot_without_baseline(df["Category"], "Category", "Clothing")

# Append the dummy columns to the existing frame (axis=1 concatenates column-wise).
encoded_parts = [df, vehicle_dummies, area_dummies, category_dummies]
df = pd.concat(encoded_parts, axis=1)
df.head()

Unnamed: 0,Agent_Age,Agent_Rating,Weather,Traffic,Vehicle,Area,Delivery_Time,Category,Distance,Pickup_Wait_Time,...,Category_Home,Category_Jewelry,Category_Kitchen,Category_Outdoors,Category_Pet Supplies,Category_Shoes,Category_Skincare,Category_Snacks,Category_Sports,Category_Toys
0,37,4.9,Sunny,High,motorcycle,Urban,120,Clothing,3.025149,15.0,...,False,False,False,False,False,False,False,False,False,False
1,34,4.5,Stormy,Jam,scooter,Metropolitian,165,Electronics,20.18353,5.0,...,False,False,False,False,False,False,False,False,False,False
2,23,4.4,Sandstorms,Low,motorcycle,Urban,130,Sports,1.552758,15.0,...,False,False,False,False,False,False,False,False,True,False
3,38,4.7,Sunny,Medium,motorcycle,Metropolitian,105,Cosmetics,7.790401,10.0,...,False,False,False,False,False,False,False,False,False,False
4,32,4.6,Cloudy,High,scooter,Metropolitian,150,Toys,6.210138,15.0,...,False,False,False,False,False,False,False,False,False,True


In [21]:
# Final cleanup in one chain:
#   1. drop the original categorical columns now replaced by their dummy columns,
#   2. rename "Area_Semi-Urban" so the column name uses underscores only,
#   3. drop rows that still contain missing values and renumber the index from 0.
encoded_source_columns = ["Vehicle", "Area", "Category"]
df = (
    df.drop(columns=encoded_source_columns)
    .rename(columns={"Area_Semi-Urban": "Area_Semi_Urban"})
    .dropna()
    .reset_index(drop=True)
)
df.head(50000)

Unnamed: 0,Agent_Age,Agent_Rating,Weather,Traffic,Delivery_Time,Distance,Pickup_Wait_Time,Vehicle_scooter,Vehicle_van,Area_Metropolitian,...,Category_Home,Category_Jewelry,Category_Kitchen,Category_Outdoors,Category_Pet Supplies,Category_Shoes,Category_Skincare,Category_Snacks,Category_Sports,Category_Toys
0,37,4.9,Sunny,High,120,3.025149,15.0,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,34,4.5,Stormy,Jam,165,20.183530,5.0,True,False,True,...,False,False,False,False,False,False,False,False,False,False
2,23,4.4,Sandstorms,Low,130,1.552758,15.0,False,False,False,...,False,False,False,False,False,False,False,False,True,False
3,38,4.7,Sunny,Medium,105,7.790401,10.0,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,32,4.6,Cloudy,High,150,6.210138,15.0,True,False,True,...,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43438,30,4.8,Windy,High,160,1.489846,10.0,False,False,True,...,True,False,False,False,False,False,False,False,False,False
43439,21,4.6,Windy,Jam,180,11.007735,15.0,False,False,True,...,False,True,False,False,False,False,False,False,False,False
43440,30,4.9,Cloudy,Low,80,4.657195,15.0,True,False,True,...,True,False,False,False,False,False,False,False,False,False
43441,20,4.7,Cloudy,High,130,6.232393,5.0,False,False,True,...,False,False,True,False,False,False,False,False,False,False


In [22]:
# Separate the target (Delivery_Time) from the input features:
# y holds the target column, X holds every remaining column.
target_column = "Delivery_Time"
y = df[target_column]
X = df.drop(columns=[target_column])

In [23]:
from sklearn.model_selection import train_test_split

# Split the data into a training set and a test set.
# test_size=0.2 holds out 20% of the rows for testing; random_state=0 fixes the
# shuffle so the split is the same on every run.
split_options = {"test_size": 0.2, "random_state": 0}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [24]:
from pathlib import Path

# Save the training and test splits as CSV files (without the index column).
splits_dir = Path("../01_data/02_processed/sandbox/splits")

# to_csv does not create missing parent folders, so make sure the target
# directory exists before writing.
splits_dir.mkdir(parents=True, exist_ok=True)

# Written in the same order as before: training data first, then test data.
splits = {
    "X_train": X_train,
    "y_train": y_train,
    "X_test": X_test,
    "y_test": y_test,
}
for split_name, split_data in splits.items():
    split_data.to_csv(splits_dir / f"{split_name}.csv", index=False, encoding="utf-8-sig")