# 장애인콜택시 대기시간 예측
## 단계1.데이터 전처리
## 단계2.데이터 분석
## 단계3.모델링 및 예측

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import joblib

# 더 필요한 라이브러리를 아래에 추가합니다.



### (3) 데이터 불러오기
* 주어진 데이터셋
    * 장애인 콜택시 운행 정보 : open_data.csv
    * 날씨 데이터 : weather.csv

#### 1) 데이터로딩

In [6]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
taxi=pd.read_csv('open_data.csv')
weather=pd.read_csv('weather.csv')

#### 2) 기본 정보 조회

In [7]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
# 전체 데이터의 행,열 개수 확인
print(taxi.shape)
print(weather.shape)

(2922, 7)
(4018, 7)


In [8]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
# 전체 데이터의 모든 변수 확인
taxi.head()

Unnamed: 0,기준일,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리
0,2015-01-01,213,1023,924,23.2,2427,10764
1,2015-01-02,420,3158,2839,17.2,2216,8611
2,2015-01-03,209,1648,1514,26.2,2377,10198
3,2015-01-04,196,1646,1526,24.5,2431,10955
4,2015-01-05,421,4250,3730,26.2,2214,8663


In [9]:
weather.head()

Unnamed: 0,Date,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2)
0,2012-01-01,0.4,-6.6,0.0,77.0,45.0,4.9
1,2012-01-02,-1.2,-8.3,0.0,80.0,48.0,6.16
2,2012-01-03,-0.4,-6.6,0.4,86.0,45.0,4.46
3,2012-01-04,-4.6,-9.5,0.0,66.0,38.0,8.05
4,2012-01-05,-1.4,-9.6,0.0,71.0,28.0,9.14


In [10]:
print(taxi.info())
print(weather.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2922 entries, 0 to 2921
Data columns (total 7 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   기준일     2922 non-null   object 
 1   차량운행    2922 non-null   int64  
 2   접수건     2922 non-null   int64  
 3   탑승건     2922 non-null   int64  
 4   평균대기시간  2922 non-null   float64
 5   평균요금    2922 non-null   int64  
 6   평균승차거리  2922 non-null   int64  
dtypes: float64(1), int64(5), object(1)
memory usage: 159.9+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4018 entries, 0 to 4017
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             4018 non-null   object 
 1   temp_max         4018 non-null   float64
 2   temp_min         4018 non-null   float64
 3   rain(mm)         4018 non-null   float64
 4   humidity_max(%)  4018 non-null   float64
 5   humidity_min(%)  4018 non-null   float64
 6   sunshine(MJ/m

#### 3) 칼럼이름을 영어로 변경
* 꼭 필요한 작업은 아니지만, 데이터를 편리하게 다루고 차트에서 불필요한 경고메시지를 띄우지 않게 하기 위해 영문으로 변경하기를 권장합니다.


In [11]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.



## 2.데이터 기본 탐색

* **세부요구사항**
    * 날짜 요소에 따라 각 정보의 패턴을 조회 합니다.
        * 일별, 요일별, 주차별, 월별, 연도별
        * 접수건, 탑승건, 거리, 요금, 대기시간 등
    * 제시된 범위 외에 가능하다면 추가 탐색을 시도합니다.

### (1) 주기별 분석을 위해서 날짜 변수 추가하기
* data를 복사합니다.
* 복사한 df에 요일, 주차, 월, 연도 등을 추가합니다.

In [12]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
taxi['기준일']=pd.to_datetime(taxi['기준일'])
weather['Date']=pd.to_datetime(weather['Date'])

In [13]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi=taxi.copy()
df_weather=weather.copy()

In [14]:
df_taxi['year']=df_taxi['기준일'].dt.year
df_taxi['month']=df_taxi['기준일'].dt.month
df_taxi['day']=df_taxi['기준일'].dt.day
df_taxi['weekday']=df_taxi['기준일'].dt.day_name()

In [15]:
df_taxi.tail()

Unnamed: 0,기준일,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,year,month,day,weekday
2917,2022-12-27,669,5635,4654,44.4,2198,8178,2022,12,27,Tuesday
2918,2022-12-28,607,5654,4648,44.8,2161,7882,2022,12,28,Wednesday
2919,2022-12-29,581,5250,4247,52.5,2229,8433,2022,12,29,Thursday
2920,2022-12-30,600,5293,4200,38.3,2183,8155,2022,12,30,Friday
2921,2022-12-31,263,2167,1806,33.7,2318,9435,2022,12,31,Saturday


### (2) 일별

* 차량 운행수

In [16]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('day',as_index=True)[['차량운행']].describe()

Unnamed: 0_level_0,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,96.0,388.375,144.066596,178.0,245.75,414.5,505.75,700.0
2,96.0,419.520833,132.769288,161.0,284.75,444.5,511.0,690.0
3,96.0,408.614583,129.551402,164.0,275.75,444.5,509.25,631.0
4,96.0,418.666667,134.711361,186.0,273.5,451.0,527.0,650.0
5,96.0,408.96875,134.806407,171.0,268.5,441.5,521.0,655.0
6,96.0,409.03125,130.300822,190.0,276.75,444.5,521.25,679.0
7,96.0,427.583333,133.118204,167.0,290.75,458.5,522.0,692.0
8,96.0,428.739583,134.107203,181.0,301.5,461.5,531.75,697.0
9,96.0,409.052083,132.819495,167.0,270.75,442.5,513.5,649.0
10,96.0,416.083333,134.381129,171.0,274.5,454.0,516.0,628.0


* 접수건, 탑승건

In [17]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('day',as_index=True)[['접수건','탑승건']].describe()

Unnamed: 0_level_0,접수건,접수건,접수건,접수건,접수건,접수건,접수건,접수건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1,96.0,3462.65625,1648.545948,527.0,1836.5,3926.5,5048.25,6051.0,96.0,2875.177083,1383.488603,462.0,1602.25,3146.0,4141.75,5129.0
2,96.0,3925.177083,1498.422026,1114.0,2156.75,4713.0,5100.0,5990.0,96.0,3277.520833,1241.608608,1048.0,1807.75,3918.0,4222.5,5025.0
3,96.0,3725.28125,1502.094718,591.0,2060.5,4519.0,4975.0,5715.0,96.0,3129.1875,1247.693538,551.0,1734.0,3806.0,4111.75,4742.0
4,96.0,3886.364583,1472.694953,967.0,2147.5,4543.0,5081.75,5948.0,96.0,3269.989583,1216.042074,908.0,1772.5,3863.5,4189.0,5038.0
5,96.0,3755.0,1591.98542,623.0,1978.5,4464.0,5084.25,5916.0,96.0,3153.09375,1302.277051,591.0,1670.25,3916.5,4129.5,4962.0
6,96.0,3740.760417,1590.899532,655.0,2003.75,4444.0,5124.25,6092.0,96.0,3148.364583,1320.859083,618.0,1670.25,3878.0,4228.5,5083.0
7,96.0,4030.0625,1498.98541,977.0,2165.0,4811.5,5131.5,5985.0,96.0,3377.666667,1239.775554,887.0,1774.25,3973.5,4332.25,4990.0
8,96.0,4058.052083,1467.594382,596.0,2358.0,4876.5,5122.5,5929.0,96.0,3399.833333,1215.058497,550.0,1859.5,4010.5,4249.75,5000.0
9,96.0,3770.041667,1516.003638,1076.0,2109.0,4663.5,5112.0,5892.0,96.0,3180.666667,1281.58653,957.0,1719.25,3966.5,4246.75,4939.0
10,96.0,3910.520833,1488.604108,624.0,2190.75,4748.0,5076.75,5735.0,96.0,3277.78125,1231.381588,586.0,1780.75,3976.0,4247.75,4817.0


* 대기시간

In [18]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('day',as_index=True)[['평균대기시간']].describe()

Unnamed: 0_level_0,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,96.0,37.095833,13.795323,17.7,26.1,34.2,45.0,88.2
2,96.0,37.564583,13.423436,17.2,27.75,35.2,45.125,73.9
3,96.0,36.615625,12.768311,18.2,26.725,33.1,43.475,82.0
4,96.0,38.382292,13.88867,18.4,27.825,35.5,44.9,82.4
5,96.0,38.445833,14.610976,18.2,27.775,34.3,47.375,80.8
6,96.0,36.551042,12.771976,19.1,27.525,33.85,44.775,72.7
7,96.0,40.040625,14.211504,18.3,31.325,38.0,46.325,96.1
8,96.0,41.313542,13.955023,18.9,31.55,39.8,48.25,90.2
9,96.0,38.325,13.06394,18.4,27.875,36.4,47.475,76.5
10,96.0,40.586458,14.250316,17.8,29.375,37.4,49.725,84.8


* 운임

In [19]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('day',as_index=True)[['평균요금']].describe()

Unnamed: 0_level_0,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,96.0,2325.78125,107.37972,2165.0,2241.5,2277.5,2419.0,2611.0
2,96.0,2290.927083,102.115131,2159.0,2218.75,2243.0,2373.25,2525.0
3,96.0,2312.458333,106.769986,2173.0,2232.25,2256.5,2422.5,2567.0
4,96.0,2302.8125,106.51971,2166.0,2227.5,2255.5,2395.25,2588.0
5,96.0,2309.729167,114.675146,2160.0,2224.0,2257.0,2411.75,2708.0
6,96.0,2307.6875,104.567008,2168.0,2228.0,2258.5,2411.0,2528.0
7,96.0,2294.854167,101.95014,2174.0,2227.75,2248.0,2396.25,2531.0
8,96.0,2299.104167,111.892663,2151.0,2219.0,2253.5,2362.25,2587.0
9,96.0,2306.854167,108.293848,2141.0,2222.75,2259.0,2399.25,2536.0
10,96.0,2304.322917,108.803589,2177.0,2224.0,2253.5,2387.0,2578.0


* 이동거리

In [20]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('day',as_index=True)[['평균승차거리']].describe()

Unnamed: 0_level_0,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,96.0,9481.916667,1015.129711,7912.0,8681.25,9096.0,10438.5,12064.0
2,96.0,9128.427083,934.621756,7863.0,8451.0,8769.0,9959.25,11092.0
3,96.0,9331.25,1016.939091,8074.0,8572.25,8857.5,10330.75,11747.0
4,96.0,9242.875,1011.352104,7942.0,8502.5,8799.0,10000.5,11837.0
5,96.0,9310.510417,1089.267054,7808.0,8506.5,8825.5,10244.0,12913.0
6,96.0,9294.28125,1001.189899,7979.0,8537.25,8849.0,10283.0,11479.0
7,96.0,9169.78125,994.621443,7972.0,8476.5,8748.5,10092.5,11526.0
8,96.0,9199.520833,1091.878295,7824.0,8462.0,8759.0,9714.75,13056.0
9,96.0,9295.677083,1054.210041,7695.0,8504.0,8822.5,10307.5,12265.0
10,96.0,9253.125,1042.698687,8049.0,8490.25,8766.0,10199.0,11856.0


### (3) 요일별

* 차량 운행수

In [21]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('weekday',as_index=True)[['차량운행']].describe()

Unnamed: 0_level_0,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Friday,418.0,485.107656,82.011846,178.0,444.0,488.5,539.0,690.0
Monday,417.0,485.014388,87.86684,197.0,443.0,485.0,543.0,678.0
Saturday,418.0,262.177033,35.57715,161.0,233.25,260.0,291.75,345.0
Sunday,417.0,224.767386,34.109708,132.0,201.0,218.0,245.0,333.0
Thursday,418.0,494.078947,89.615625,182.0,446.0,493.5,539.75,1413.0
Tuesday,417.0,495.076739,80.615743,182.0,449.0,500.0,541.0,814.0
Wednesday,417.0,484.726619,84.216908,197.0,440.0,486.0,540.0,692.0


* 접수건, 탑승건

In [22]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('weekday',as_index=True)[['접수건','탑승건']].describe()

Unnamed: 0_level_0,접수건,접수건,접수건,접수건,접수건,접수건,접수건,접수건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Friday,418.0,4692.066986,894.851008,805.0,4533.5,4940.5,5171.0,5990.0,418.0,3877.385167,727.064752,729.0,3778.5,4041.5,4244.75,5025.0
Monday,417.0,4715.040767,931.581903,1127.0,4597.0,4986.0,5208.0,6100.0,417.0,3961.242206,787.612615,1004.0,3908.0,4161.0,4372.0,5151.0
Saturday,418.0,2069.08134,326.397821,856.0,1908.0,2157.5,2298.0,2794.0,418.0,1693.839713,227.514641,770.0,1583.0,1723.5,1848.75,2199.0
Sunday,417.0,1702.781775,376.06611,527.0,1500.0,1869.0,1952.0,2154.0,417.0,1493.556355,303.130807,462.0,1367.0,1610.0,1695.0,1889.0
Thursday,418.0,4760.677033,841.902424,1023.0,4614.75,4964.0,5178.0,6133.0,418.0,3978.200957,694.12988,924.0,3868.0,4096.0,4332.75,5129.0
Tuesday,417.0,4800.884892,870.787606,1287.0,4706.0,5026.0,5233.0,6134.0,417.0,4026.791367,730.969389,1046.0,3939.0,4180.0,4412.0,5130.0
Wednesday,417.0,4739.333333,923.886493,1327.0,4613.0,4996.0,5226.0,6182.0,417.0,3956.976019,765.748662,1156.0,3872.0,4119.0,4362.0,5189.0


* 대기시간

In [23]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('weekday',as_index=True)[['평균대기시간']].describe()

Unnamed: 0_level_0,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Friday,418.0,40.956459,13.380374,17.2,32.175,39.3,48.95,96.1
Monday,417.0,39.243165,12.598604,18.4,29.5,37.6,46.6,82.0
Saturday,418.0,43.480144,16.476962,17.6,29.525,41.5,53.75,94.7
Sunday,417.0,34.902878,12.619141,17.8,24.2,32.9,41.7,80.8
Thursday,418.0,41.04378,13.444326,17.8,31.3,39.6,48.525,92.4
Tuesday,417.0,40.826139,13.776295,18.2,30.6,38.6,48.9,89.0
Wednesday,417.0,41.676259,14.542795,18.8,31.0,38.8,50.9,89.8


* 운임

In [24]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('weekday',as_index=True)[['평균요금']].describe()

Unnamed: 0_level_0,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Friday,418.0,2246.023923,55.505433,2131.0,2216.0,2240.0,2261.0,2649.0
Monday,417.0,2240.059952,55.938271,2146.0,2212.0,2233.0,2254.0,2628.0
Saturday,418.0,2429.641148,65.626115,2255.0,2383.5,2431.0,2473.75,2690.0
Sunday,417.0,2464.541966,46.598415,2284.0,2435.0,2463.0,2491.0,2685.0
Thursday,418.0,2256.366029,60.688792,2169.0,2228.0,2251.0,2268.75,2733.0
Tuesday,417.0,2242.805755,59.983116,2141.0,2213.0,2238.0,2258.0,2708.0
Wednesday,417.0,2250.422062,51.987372,2150.0,2223.0,2244.0,2264.0,2588.0


* 이동거리

In [25]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('weekday',as_index=True)[['평균승차거리']].describe()

Unnamed: 0_level_0,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
weekday,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Friday,418.0,8733.665072,558.52281,7672.0,8441.5,8661.5,8901.25,12516.0
Monday,417.0,8656.577938,589.888149,7748.0,8335.0,8583.0,8820.0,13056.0
Saturday,418.0,10431.4689,638.927763,8714.0,10013.75,10442.0,10892.0,12747.0
Sunday,417.0,10741.275779,447.586297,8987.0,10467.0,10722.0,11059.0,13904.0
Thursday,418.0,8786.901914,601.413752,7916.0,8495.5,8727.5,8953.75,14136.0
Tuesday,417.0,8672.11271,584.911046,7695.0,8373.0,8618.0,8815.0,12913.0
Wednesday,417.0,8752.038369,517.201469,7832.0,8460.0,8683.0,8933.0,11992.0


### (4) 월별

* 차량 운행수

In [26]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('month',as_index=True)[['차량운행']].describe()

Unnamed: 0_level_0,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,248.0,379.28629,125.021855,132.0,245.0,411.0,444.25,814.0
2,226.0,373.486726,125.424549,167.0,235.5,409.5,446.0,595.0
3,248.0,398.758065,123.967958,161.0,263.25,432.5,474.0,615.0
4,240.0,416.1875,123.552685,188.0,271.75,461.0,497.0,601.0
5,248.0,413.556452,128.678289,171.0,273.75,453.0,516.75,608.0
6,240.0,430.925,139.803009,196.0,285.25,469.5,519.0,1413.0
7,248.0,436.935484,125.948723,182.0,292.25,477.5,526.25,619.0
8,248.0,434.5,130.694842,189.0,282.0,484.0,527.5,685.0
9,240.0,446.4125,142.029671,201.0,295.0,498.5,542.0,700.0
10,248.0,432.939516,135.852595,173.0,289.0,491.0,531.0,661.0


* 접수건, 탑승건

In [27]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('month',as_index=True)[['접수건','탑승건']].describe()

Unnamed: 0_level_0,접수건,접수건,접수건,접수건,접수건,접수건,접수건,접수건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
1,248.0,3681.826613,1491.384086,591.0,1918.25,4407.5,4949.5,5493.0,248.0,3092.262097,1215.867502,551.0,1640.0,3753.5,4013.5,4504.0
2,226.0,3582.097345,1485.482395,856.0,1898.5,4411.0,4850.5,5240.0,226.0,3032.075221,1244.758189,770.0,1629.25,3744.5,4030.25,4496.0
3,248.0,3691.330645,1508.082608,527.0,2077.25,4611.0,4881.0,5407.0,248.0,3168.822581,1285.173427,462.0,1742.0,3978.0,4201.5,4665.0
4,240.0,3964.479167,1485.197345,623.0,2209.5,4867.5,5087.25,6075.0,240.0,3358.558333,1238.475408,591.0,1827.75,4085.5,4348.25,4882.0
5,248.0,3910.572581,1509.734828,1106.0,2171.0,4839.5,5117.5,5861.0,248.0,3278.435484,1271.347637,1012.0,1783.25,4005.0,4352.75,4850.0
6,240.0,4001.304167,1475.04074,1099.0,2309.5,4850.5,5169.0,6100.0,240.0,3372.8375,1229.861295,1013.0,1855.0,3939.0,4394.75,4983.0
7,248.0,4099.637097,1443.957079,859.0,2322.0,4724.0,5115.25,6092.0,248.0,3437.862903,1184.486245,809.0,1904.75,3981.0,4248.5,5083.0
8,248.0,4014.96371,1494.198219,619.0,2273.75,4778.5,5172.0,6134.0,248.0,3350.512097,1227.49092,580.0,1841.0,4002.5,4239.75,5120.0
9,240.0,3962.508333,1598.21154,655.0,2111.0,4864.0,5250.25,6182.0,240.0,3287.95,1302.137389,618.0,1783.25,4004.5,4271.5,5189.0
10,248.0,4007.709677,1589.318578,967.0,2153.25,4933.5,5240.75,6164.0,248.0,3316.850806,1307.951072,908.0,1776.5,4009.5,4345.0,5106.0


* 대기시간

In [28]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('month',as_index=True)[['평균대기시간']].describe()

Unnamed: 0_level_0,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간,평균대기시간
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,248.0,33.393952,10.88217,17.2,25.975,31.6,39.025,76.7
2,226.0,34.419469,13.158108,18.3,24.725,29.45,40.275,78.9
3,248.0,35.182258,14.182723,18.2,24.075,30.4,44.65,74.1
4,240.0,39.62875,13.241641,20.1,29.0,36.6,47.9,79.9
5,248.0,41.181452,12.241648,19.7,32.3,39.45,48.725,80.8
6,240.0,40.381667,12.817682,17.8,31.6,38.75,48.675,77.0
7,248.0,40.033468,11.055332,18.4,32.225,38.75,46.4,69.7
8,248.0,38.568952,11.605539,18.2,30.675,37.75,45.125,84.5
9,240.0,42.364167,15.125236,18.2,30.35,42.2,52.675,89.2
10,248.0,44.480242,13.215172,18.5,36.8,42.6,52.4,83.7


* 운임

In [29]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('month',as_index=True)[['평균요금']].describe()

Unnamed: 0_level_0,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금,평균요금
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,248.0,2279.616935,106.003793,2146.0,2204.75,2230.0,2372.0,2690.0
2,226.0,2296.623894,109.799717,2158.0,2217.0,2246.0,2382.0,2708.0
3,248.0,2291.600806,97.659648,2172.0,2224.0,2244.0,2360.75,2526.0
4,240.0,2310.7125,99.238,2157.0,2238.75,2261.0,2399.75,2544.0
5,248.0,2326.209677,102.864029,2164.0,2252.0,2277.5,2433.0,2537.0
6,240.0,2308.991667,103.091339,2173.0,2239.0,2261.0,2416.0,2565.0
7,248.0,2290.407258,97.705247,2152.0,2226.0,2251.0,2380.0,2551.0
8,248.0,2286.189516,98.496143,2141.0,2217.75,2246.0,2366.5,2573.0
9,240.0,2331.583333,123.131259,2160.0,2242.0,2270.0,2440.75,2733.0
10,248.0,2334.774194,113.621777,2180.0,2250.5,2275.0,2446.25,2640.0


* 이동거리

In [30]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('month',as_index=True)[['평균승차거리']].describe()

Unnamed: 0_level_0,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리,평균승차거리
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
month,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
1,248.0,9048.16129,1016.645061,7748.0,8311.25,8634.0,10038.25,12747.0
2,226.0,9221.039823,1113.085047,7863.0,8413.0,8744.5,10009.0,14136.0
3,248.0,9129.58871,930.761627,7967.0,8453.0,8742.5,9710.25,11490.0
4,240.0,9313.458333,957.384844,7806.0,8580.0,8970.0,10099.0,11701.0
5,248.0,9462.447581,1000.402602,7903.0,8728.5,9042.5,10338.0,12230.0
6,240.0,9292.65,961.064143,7993.0,8622.5,8875.5,10265.75,11421.0
7,248.0,9151.556452,934.211901,7872.0,8513.75,8759.0,9993.5,11428.0
8,248.0,9127.072581,954.195646,7695.0,8456.0,8756.5,9813.25,11642.0
9,240.0,9493.016667,1174.640641,7808.0,8630.0,8993.5,10463.75,13904.0
10,248.0,9485.576613,1048.753535,8027.0,8695.75,8960.5,10503.75,12228.0


### (5) 연도별

* 차량 운행수

In [31]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('year',as_index=True)[['차량운행']].describe()

Unnamed: 0_level_0,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행,차량운행
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
2015,365.0,381.468493,107.000499,163.0,252.0,440.0,460.0,507.0
2016,366.0,390.314208,128.274949,164.0,252.5,440.0,482.0,1413.0
2017,365.0,405.39726,125.298341,132.0,260.0,464.0,509.0,548.0
2018,365.0,397.758904,123.445819,164.0,270.0,434.0,507.0,570.0
2019,365.0,399.068493,125.626335,161.0,269.0,449.0,504.0,814.0
2020,366.0,416.521858,124.615104,185.0,296.0,435.0,516.5,654.0
2021,365.0,468.271233,133.933703,190.0,306.0,543.0,567.0,615.0
2022,365.0,490.89863,156.483781,209.0,304.0,558.0,604.0,700.0


* 접수건, 탑승건

In [32]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
df_taxi.groupby('year',as_index=True)[['접수건','탑승건']].describe()

Unnamed: 0_level_0,접수건,접수건,접수건,접수건,접수건,접수건,접수건,접수건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건,탑승건
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
2015,365.0,3750.531507,1286.48966,1023.0,2149.0,4471.0,4757.0,5379.0,365.0,3268.673973,1123.983935,924.0,1803.0,3938.0,4122.0,4450.0
2016,366.0,4005.087432,1399.214667,1319.0,2229.25,4837.5,5054.25,5769.0,366.0,3418.18306,1193.308077,1149.0,1832.25,4122.0,4326.0,4700.0
2017,365.0,4184.745205,1489.24804,1478.0,2259.0,5123.0,5297.0,5955.0,365.0,3471.709589,1246.805403,1203.0,1849.0,4240.0,4447.0,4882.0
2018,365.0,4077.473973,1433.818602,1419.0,2181.0,4985.0,5120.0,5605.0,365.0,3209.380822,1129.004514,1158.0,1700.0,3907.0,4079.0,4325.0
2019,365.0,4104.358904,1427.305709,1406.0,2246.0,4965.0,5141.0,5614.0,365.0,3230.279452,1124.524067,1046.0,1728.0,3890.0,4081.0,4363.0
2020,366.0,3137.480874,1387.445751,527.0,1729.0,3593.5,4318.75,5333.0,366.0,2656.314208,1122.990558,462.0,1493.25,3149.0,3610.0,4166.0
2021,365.0,3796.821918,1584.925666,591.0,1978.0,4666.0,5077.0,5572.0,365.0,3344.109589,1361.665167,551.0,1720.0,4123.0,4408.0,4816.0
2022,365.0,4350.30411,1714.018398,1057.0,2362.0,5117.0,5764.0,6182.0,365.0,3673.863014,1415.591273,974.0,1933.0,4397.0,4797.0,5189.0


## 3.데이터 구조 만들기

* **세부요구사항**
    * 조건 : 
        * 목표 : 전날 저녁, 다음날 평균 대기시간을 예측하고자 합니다.
        * 날씨 데이터는 실제 측정값이지만, 다음 날에 대한 예보 데이터로 간주합니다. 
            * 예를 들어, 
                * 2020-12-23 의 날씨 데이터는 전날(12월22일) 날씨예보 데이터로 간주하여 분석을 수행합니다.
                * 2020-12-22일의 장애인 이동 데이터로 23일의 대기시간을 예측해야 하며, 이때 고려할 날씨데이터는 23일 데이터 입니다.
    * 장애인 이동 데이터를 기준으로 날씨 데이터를 붙입니다.

In [33]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
taxi.rename(columns={'기준일':'Date'},inplace=True)
taxi.head()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리
0,2015-01-01,213,1023,924,23.2,2427,10764
1,2015-01-02,420,3158,2839,17.2,2216,8611
2,2015-01-03,209,1648,1514,26.2,2377,10198
3,2015-01-04,196,1646,1526,24.5,2431,10955
4,2015-01-05,421,4250,3730,26.2,2214,8663


### (1) target 만들기
* 예측하는 날짜, 대기시간(target)으로 기준을 잡습니다.

In [34]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
# 익일의 대기시간(waiting time)을 오늘의 데이터를 활용하여 예측 해야하는 대상(target)으로 설정
taxi['target']=taxi['평균대기시간'].shift(-1)
taxi.tail()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target
2917,2022-12-27,669,5635,4654,44.4,2198,8178,44.8
2918,2022-12-28,607,5654,4648,44.8,2161,7882,52.5
2919,2022-12-29,581,5250,4247,52.5,2229,8433,38.3
2920,2022-12-30,600,5293,4200,38.3,2183,8155,33.7
2921,2022-12-31,263,2167,1806,33.7,2318,9435,


### (2) 날씨 데이터 붙이기
* merge를 활용합니다. 기준은 운행정보 입니다.

In [35]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
# 익일의 실제 날씨 데이터를 전일에 발표된 예보데이터로 판단
weather['Date']+=pd.Timedelta(days=-1)
data=pd.merge(taxi,weather)

In [36]:
weather.tail()

Unnamed: 0,Date,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2)
4013,2022-12-26,3.0,-7.3,0.0,86.0,51.0,10.25
4014,2022-12-27,-0.3,-5.4,0.1,92.0,40.0,10.86
4015,2022-12-28,1.7,-7.8,0.0,71.0,34.0,10.88
4016,2022-12-29,2.1,-4.0,0.0,87.0,38.0,10.84
4017,2022-12-30,-4.4,-4.4,0.0,66.0,66.0,0.0


In [37]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
data.head()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2)
0,2015-01-01,213,1023,924,23.2,2427,10764,17.2,-2.0,-8.9,0.0,63.0,28.0,9.07
1,2015-01-02,420,3158,2839,17.2,2216,8611,26.2,2.4,-9.2,0.0,73.0,37.0,8.66
2,2015-01-03,209,1648,1514,26.2,2377,10198,24.5,8.2,0.2,0.0,89.0,58.0,5.32
3,2015-01-04,196,1646,1526,24.5,2431,10955,26.2,7.9,-0.9,0.0,95.0,52.0,6.48
4,2015-01-05,421,4250,3730,26.2,2214,8663,23.6,4.1,-7.4,3.4,98.0,29.0,10.47


### (3) 새로운 feature를 생성해 봅시다.
* 날짜와 관련된 변수 추가하기 : 요일, 월, 계절, 연도
* 그외 새로운 feature 도출 : 최소 2개 이상
    * 예 : 공휴일, 최근 7주일간의 평균 대기시간, 탑승률 등

#### 1) 날짜와 관련된 변수 추가하기 : 요일, 월, 계절, 연도
* 요일 이름, 계절이름, 월 이름으로 만드는 경우에는, 변수를 pd.Categorical로 범주형을 만들면서 순서를 지정하는 것이 이후 그래프를 그릴 때 순서대로 표현할 수 있습니다.


In [38]:
data['weekday'] = data['Date'].dt.day_name()
data['weekday'] = pd.Categorical(data['weekday'], 
                                  categories=['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
data['month'] = data['Date'].dt.month

data['season'] = np.where(data['month'].isin([3,4,5]), 'Spring',
                           np.where(data['month'].isin([6,7,8]), 'Summer',
                                    np.where(data['month'].isin([9,10,11]), 'Fall', 'Winter')))
data['season'] = pd.Categorical(data['season'], categories=['Spring','Summer','Fall','Winter'])
                                     
data['year'] = data['Date'].dt.year
data

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2),weekday,month,season,year
0,2015-01-01,213,1023,924,23.2,2427,10764,17.2,-2.0,-8.9,0.0,63.0,28.0,9.07,Thursday,1,Winter,2015
1,2015-01-02,420,3158,2839,17.2,2216,8611,26.2,2.4,-9.2,0.0,73.0,37.0,8.66,Friday,1,Winter,2015
2,2015-01-03,209,1648,1514,26.2,2377,10198,24.5,8.2,0.2,0.0,89.0,58.0,5.32,Saturday,1,Winter,2015
3,2015-01-04,196,1646,1526,24.5,2431,10955,26.2,7.9,-0.9,0.0,95.0,52.0,6.48,Sunday,1,Winter,2015
4,2015-01-05,421,4250,3730,26.2,2214,8663,23.6,4.1,-7.4,3.4,98.0,29.0,10.47,Monday,1,Winter,2015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2916,2022-12-26,603,5555,4605,39.2,2163,7889,44.4,3.0,-7.3,0.0,86.0,51.0,10.25,Monday,12,Winter,2022
2917,2022-12-27,669,5635,4654,44.4,2198,8178,44.8,-0.3,-5.4,0.1,92.0,40.0,10.86,Tuesday,12,Winter,2022
2918,2022-12-28,607,5654,4648,44.8,2161,7882,52.5,1.7,-7.8,0.0,71.0,34.0,10.88,Wednesday,12,Winter,2022
2919,2022-12-29,581,5250,4247,52.5,2229,8433,38.3,2.1,-4.0,0.0,87.0,38.0,10.84,Thursday,12,Winter,2022


#### 2) 공휴일 정보
* workalendar 패키지를 설치하고, 대한민국 공휴일 정보를 끌어와 봅시다.

* 휴무일 데이터 패키지 설치

In [39]:
!pip install workalendar



* 간단 사용법

In [40]:
from workalendar.asia import SouthKorea
cal = SouthKorea()
pd.DataFrame(cal.holidays(2023))

Unnamed: 0,0,1
0,2023-01-01,New year
1,2023-01-21,Korean New Year's Day
2,2023-01-22,Korean New Year's Day
3,2023-01-23,Korean New Year's Day
4,2023-03-01,Independence Day
5,2023-05-05,Children's Day
6,2023-05-26,Buddha's Birthday
7,2023-06-06,Memorial Day
8,2023-08-15,Liberation Day
9,2023-09-28,Midautumn Festival


* 휴무일 데이터셋 만들기 2015 ~ 2022
* 실제로 휴무일에 해당하지만 workalendar 라이브러리에 없는 날짜는 직접 추가해봅시다.
    * 휴무일 장애인 콜택시의 접수건 변화에 대한 특징을 찾아 이를 바탕으로 데이터를 조회하여 찾아볼 수 있음

In [41]:
from workalendar.asia import SouthKorea

cal = SouthKorea()
holiday = pd.DataFrame()
for y in range(2015, 2023) :
    holiday = pd.concat([holiday, pd.DataFrame(cal.holidays(y))], axis = 0)

holiday.columns = ['Date', 'holiday']
holiday['Date'] = pd.to_datetime(holiday['Date'])
holiday['holiday'] = 1
holiday

Unnamed: 0,Date,holiday
0,2015-01-01,1
1,2015-02-18,1
2,2015-02-19,1
3,2015-02-20,1
4,2015-03-01,1
...,...,...
10,2022-09-10,1
11,2022-09-11,1
12,2022-10-03,1
13,2022-10-09,1


* 기존 데이터에 휴무일 정보 결합하기.
* 휴무일이 아닌 날짜는 0으로 저장하시오.

In [42]:
data = pd.merge(data, holiday, how = 'left')
data = data.fillna({'holiday':0})

In [43]:
data.tail()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target,temp_max,temp_min,rain(mm),humidity_max(%),humidity_min(%),sunshine(MJ/m2),weekday,month,season,year,holiday
2917,2022-12-26,603,5555,4605,39.2,2163,7889,44.4,3.0,-7.3,0.0,86.0,51.0,10.25,Monday,12,Winter,2022,0.0
2918,2022-12-27,669,5635,4654,44.4,2198,8178,44.8,-0.3,-5.4,0.1,92.0,40.0,10.86,Tuesday,12,Winter,2022,0.0
2919,2022-12-28,607,5654,4648,44.8,2161,7882,52.5,1.7,-7.8,0.0,71.0,34.0,10.88,Wednesday,12,Winter,2022,0.0
2920,2022-12-29,581,5250,4247,52.5,2229,8433,38.3,2.1,-4.0,0.0,87.0,38.0,10.84,Thursday,12,Winter,2022,0.0
2921,2022-12-30,600,5293,4200,38.3,2183,8155,33.7,-4.4,-4.4,0.0,66.0,66.0,0.0,Friday,12,Winter,2022,0.0


#### 3) 7일 이동평균 대기시간
* rolling().mean() 사용

In [44]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
data['7일간 평균대기시간']=data[['평균대기시간']].rolling(window=7).mean()

In [53]:
data.head()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target,temp_max,temp_min,...,sunshine(MJ/m2),weekday,month,season,year,holiday,7일간 평균대기시간,탑승률,차량당 접수건수,차량당 탑승건수
0,2015-01-01,213,1023,924,23.2,2427,10764,17.2,-2.0,-8.9,...,9.07,Thursday,1,Winter,2015,1.0,,0.903226,4.802817,4.338028
1,2015-01-02,420,3158,2839,17.2,2216,8611,26.2,2.4,-9.2,...,8.66,Friday,1,Winter,2015,0.0,,0.898987,7.519048,6.759524
2,2015-01-03,209,1648,1514,26.2,2377,10198,24.5,8.2,0.2,...,5.32,Saturday,1,Winter,2015,0.0,,0.918689,7.885167,7.244019
3,2015-01-04,196,1646,1526,24.5,2431,10955,26.2,7.9,-0.9,...,6.48,Sunday,1,Winter,2015,0.0,,0.927096,8.397959,7.785714
4,2015-01-05,421,4250,3730,26.2,2214,8663,23.6,4.1,-7.4,...,10.47,Monday,1,Winter,2015,0.0,,0.877647,10.095012,8.859857


#### 4) 탑승률

In [46]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
data['탑승률']=data['탑승건']/data['접수건']

In [47]:
data.head()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target,temp_max,temp_min,...,humidity_max(%),humidity_min(%),sunshine(MJ/m2),weekday,month,season,year,holiday,7일간 평균대기시간,탑승률
0,2015-01-01,213,1023,924,23.2,2427,10764,17.2,-2.0,-8.9,...,63.0,28.0,9.07,Thursday,1,Winter,2015,1.0,,0.903226
1,2015-01-02,420,3158,2839,17.2,2216,8611,26.2,2.4,-9.2,...,73.0,37.0,8.66,Friday,1,Winter,2015,0.0,,0.898987
2,2015-01-03,209,1648,1514,26.2,2377,10198,24.5,8.2,0.2,...,89.0,58.0,5.32,Saturday,1,Winter,2015,0.0,,0.918689
3,2015-01-04,196,1646,1526,24.5,2431,10955,26.2,7.9,-0.9,...,95.0,52.0,6.48,Sunday,1,Winter,2015,0.0,,0.927096
4,2015-01-05,421,4250,3730,26.2,2214,8663,23.6,4.1,-7.4,...,98.0,29.0,10.47,Monday,1,Winter,2015,0.0,,0.877647


#### 5) 차량운행당 평균대기시간

In [48]:
data['차량당 접수건수']=data['접수건']/data['차량운행']
data['차량당 탑승건수']=data['탑승건']/data['차량운행']

In [49]:
data.head()

Unnamed: 0,Date,차량운행,접수건,탑승건,평균대기시간,평균요금,평균승차거리,target,temp_max,temp_min,...,sunshine(MJ/m2),weekday,month,season,year,holiday,7일간 평균대기시간,탑승률,차량당 접수건수,차량당 탑승건수
0,2015-01-01,213,1023,924,23.2,2427,10764,17.2,-2.0,-8.9,...,9.07,Thursday,1,Winter,2015,1.0,,0.903226,4.802817,4.338028
1,2015-01-02,420,3158,2839,17.2,2216,8611,26.2,2.4,-9.2,...,8.66,Friday,1,Winter,2015,0.0,,0.898987,7.519048,6.759524
2,2015-01-03,209,1648,1514,26.2,2377,10198,24.5,8.2,0.2,...,5.32,Saturday,1,Winter,2015,0.0,,0.918689,7.885167,7.244019
3,2015-01-04,196,1646,1526,24.5,2431,10955,26.2,7.9,-0.9,...,6.48,Sunday,1,Winter,2015,0.0,,0.927096,8.397959,7.785714
4,2015-01-05,421,4250,3730,26.2,2214,8663,23.6,4.1,-7.4,...,10.47,Monday,1,Winter,2015,0.0,,0.877647,10.095012,8.859857


## 4.데이터 저장
* **세부요구사항**
    * joblib 을 사용하여 작업 경로에 정리한 데이터프레임을 저장합니다.
        * 저장파일이름 : data1.pkl

In [50]:
data=data.drop(data.index[-1])

In [51]:
# 아래에 실습코드를 작성하고 결과를 확인합니다.
joblib.dump(data,'data1.pkl')

['data1.pkl']

In [52]:
data.to_csv('data1.csv')

## 2. 데이터 분석

## 3. 모델링 및 가설 검증