# 部署模型

## 基于2021年3月26日的数据，我们需要预测2021年3月27日前一个半小时的叫车需求。

## 1.导入包

In [22]:
import pandas as pd
import numpy as np
from joblib import load, dump
from sklearn.cluster import MiniBatchKMeans, KMeans
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt, ceil, floor
from datetime import datetime, timedelta


## 2.导入模型和数据

In [23]:
# Read the booking data; the file is gzip-compressed even though it is named
# .csv, so the compression has to be stated explicitly.
df= pd.read_csv("../data/cleaned_test_booking_data.csv", compression = 'gzip', low_memory=False)
# Load the persisted pickup-location clustering model and the demand model.
# NOTE(review): joblib.load unpickles its input - only load files from a trusted source.
cluster_model = load("../Model/pickup_cluster.joblib")
predict_with_lag = load("../Model/prediction_model.joblib")

## 3.特征工程

In [24]:
# 整理时间的相关函数

def round_timestamp_30interval(x):
    """Floor a timestamp to the start of its 30-minute interval.

    Args:
        x: Either a string in ``'%Y-%m-%d %H:%M:%S'`` format (any ``str``
            subclass is accepted) or an object exposing ``minute``,
            ``second`` and ``microsecond`` that supports subtracting a
            ``timedelta``, such as ``datetime``.

    Returns:
        The timestamp with its minutes reduced to 0 or 30 and its seconds
        and microseconds set to zero.

    Raises:
        ValueError: If a string argument does not match the expected format.
    """
    # isinstance (rather than an exact type comparison) also parses str
    # subclasses instead of falling through to the attribute access below.
    if isinstance(x, str):
        x = datetime.strptime(x, '%Y-%m-%d %H:%M:%S')
    return x - timedelta(minutes=x.minute % 30, seconds=x.second, microseconds=x.microsecond)

def time_features(data):
    """Add calendar feature columns derived from the ``ts`` column.

    The columns ``mins``, ``hour``, ``day``, ``month``, ``dayofweek`` and
    ``quarter`` are written onto ``data`` itself, in that order.

    Args:
        data: DataFrame with a datetime-typed ``ts`` column.

    Returns:
        The same DataFrame object that was passed in, with the new columns.
    """
    stamp = data['ts'].dt
    # (new column name, datetime accessor attribute), in insertion order
    derived_columns = (
        ('mins', 'minute'),
        ('hour', 'hour'),
        ('day', 'day'),
        ('month', 'month'),
        ('dayofweek', 'dayofweek'),
        ('quarter', 'quarter'),
    )
    for column, attribute in derived_columns:
        data[column] = getattr(stamp, attribute)
    return data

def prediction_with_lag(df):
    """Run the module-level ``predict_with_lag`` model on the rows of ``df``.

    Args:
        df: DataFrame containing the cluster, time and lag feature columns
            listed below.

    Returns:
        Whatever ``predict_with_lag.predict`` returns for the selected
        feature columns.
    """
    # Column order is passed to the model as-is.
    # NOTE(review): presumably this matches the order used in training - confirm.
    feature_columns = [
        'pickup_cluster', 'mins', 'hour', 'month', 'quarter',
        'dayofweek', 'lag_1', 'lag_2', 'lag_3', 'rolling_mean',
    ]
    features = df[feature_columns]
    return predict_with_lag.predict(features)

def shift_with_lag_and_rollingmean(df):
    """Build per-cluster lag and rolling-mean features from ``request_count``.

    Rows are sorted by cluster and timestamp and de-duplicated on
    ``(ts, pickup_cluster)``. For every cluster, ``lag_1``..``lag_3`` hold
    the request counts of the one to three preceding rows, and
    ``rolling_mean`` holds the mean of those three preceding counts.

    Args:
        df: DataFrame with the columns ``ts``, ``pickup_cluster``, ``mins``,
            ``hour``, ``month``, ``quarter``, ``dayofweek`` and
            ``request_count``. Existing lag columns are overwritten.

    Returns:
        A new DataFrame restricted to the feature columns plus
        ``request_count``. Rows containing NaN are dropped, which removes
        the first three rows of every cluster.
    """
    df = df.sort_values(by=['pickup_cluster', 'ts']).drop_duplicates(subset=['ts', 'pickup_cluster'])
    df = df.set_index(['ts', 'pickup_cluster', 'mins', 'hour', 'month', 'quarter', 'dayofweek'])

    counts = df.groupby(level='pickup_cluster')['request_count']
    for lag in (1, 2, 3):
        df[f'lag_{lag}'] = counts.shift(lag)
    # Shift inside each group so a cluster never receives the previous
    # cluster's trailing value; transform keeps the original index, so the
    # assignment aligns regardless of how groupby.apply labels its output.
    df['rolling_mean'] = counts.transform(lambda s: s.rolling(window=3).mean().shift(1))

    df = df.reset_index(drop=False).dropna()
    df = df[['ts', 'pickup_cluster', 'mins', 'hour', 'month', 'quarter',
             'dayofweek', 'lag_1', 'lag_2', 'lag_3', 'rolling_mean', 'request_count']]
    return df

## 3.1构建pickup_cluster

In [25]:
# Label every booking with a pickup cluster by passing its pickup latitude
# and longitude to the loaded clustering model.
df['pickup_cluster'] = cluster_model.predict(df[['pick_lat', 'pick_lng']])
# Show the first ten rows to check the new column.
df.head(10)




Unnamed: 0,ts,number,pick_lat,pick_lng,drop_lat,drop_lng,pickup_cluster
0,2021-03-26 06:49:38,-1,12.903468,77.63708,12.916259,77.675476,45
1,2021-03-26 15:14:23,0,12.903838,77.591774,12.890039,77.59372,1
2,2021-03-26 15:57:32,6,12.963516,77.67474,12.912828,77.62731,38
3,2021-03-26 23:34:53,7,12.944017,77.56427,12.967625,77.60806,35
4,2021-03-26 23:45:56,9,12.98327,77.75207,12.963221,77.7484,16
5,2021-03-26 18:54:05,11,12.919469,77.6711,12.933288,77.60731,34
6,2021-03-26 18:42:49,15,12.947335,77.68431,12.974627,77.606064,31
7,2021-03-26 23:14:56,15,12.979332,77.64059,12.947475,77.68423,17
8,2021-03-26 10:59:13,17,12.923716,77.60741,12.922842,77.59324,28
9,2021-03-26 16:44:09,53,12.888448,77.57724,12.937987,77.568726,26


## 3.2聚合时间需求

In [26]:
# Floor every booking timestamp to the start of its 30-minute slot in one
# vectorised step (also works on an empty frame, unlike np.vectorize).
df['ts'] = pd.to_datetime(df['ts'], format='%Y-%m-%d %H:%M:%S').dt.floor('30min')

# Count bookings per (slot, cluster); 'number' is only the column being counted.
df = df[['ts', 'number', 'pickup_cluster']]
df = df.groupby(by=['ts', 'pickup_cluster']).count().reset_index()
df.columns = ['ts', 'pickup_cluster', 'request_count']

# 51 half-hour slots: 2021-03-26 00:00 through 2021-03-27 01:00 inclusive.
slot_starts = [datetime(2021, 3, 26) + timedelta(minutes=30 * i) for i in range(51)]

# Placeholder rows under cluster -1 make every slot appear in the index, so
# the pivot below yields a row for each slot even when no booking fell in it.
placeholder = pd.DataFrame({'ts': slot_starts, 'pickup_cluster': -1, 'request_count': 0})
# DataFrame.append was removed in pandas 2.0; concat is the supported equivalent.
df = pd.concat([df, placeholder], ignore_index=True)

# Pivot to a slot-by-cluster grid, fill the gaps with zero demand, enforce a
# regular 30-minute index, then return to long format ordered by cluster.
data = (
    df.set_index(['ts', 'pickup_cluster'])
    .unstack()
    .fillna(value=0)
    .asfreq(freq='30min')
    .stack()
    .sort_index(level=1)
    .reset_index()
)

# Drop the placeholder cluster; copy so that adding the feature columns does
# not write into a slice of the pivoted frame.
data = data[data.pickup_cluster >= 0].copy()

df = time_features(data)

In [27]:
# Display the aggregated per-slot, per-cluster demand table with its time features.
df

Unnamed: 0,ts,pickup_cluster,request_count,mins,hour,day,month,dayofweek,quarter
51,2021-03-26 00:00:00,0,0.0,0,0,26,3,4,1
52,2021-03-26 00:30:00,0,0.0,30,0,26,3,4,1
53,2021-03-26 01:00:00,0,0.0,0,1,26,3,4,1
54,2021-03-26 01:30:00,0,0.0,30,1,26,3,4,1
55,2021-03-26 02:00:00,0,0.0,0,2,26,3,4,1
...,...,...,...,...,...,...,...,...,...
2596,2021-03-26 23:00:00,49,9.0,0,23,26,3,4,1
2597,2021-03-26 23:30:00,49,6.0,30,23,26,3,4,1
2598,2021-03-27 00:00:00,49,0.0,0,0,27,3,5,1
2599,2021-03-27 00:30:00,49,0.0,30,0,27,3,5,1


## 4.预测需求量

In [32]:
# First slot to forecast; the loop covers three consecutive 30-minute slots.
start_date = datetime(2021, 3, 27)
for i in range(3):
    # Rebuild the lag features on every pass so the prediction written in the
    # previous pass feeds the lags of the slot predicted now.
    df = shift_with_lag_and_rollingmean(df)
    # Use the loop counter here: the original referenced an unrelated name
    # ('x') instead of 'i', so the slot offset never followed the loop.
    target_slot = start_date + timedelta(minutes=30 * i)
    slot_rows = df['ts'] == target_slot
    df.loc[slot_rows, 'request_count'] = prediction_with_lag(df[slot_rows])

In [33]:
# Keep only the forecast slots (from start_date onwards), renumber the rows,
# and order them by cluster and then by slot.
is_forecast_slot = df['ts'] >= start_date
data_pred = (
    df[is_forecast_slot]
    .reset_index(drop=True)
    .sort_values(by=['pickup_cluster', 'ts'])
)
data_pred

Unnamed: 0,ts,pickup_cluster,mins,hour,month,quarter,dayofweek,lag_1,lag_2,lag_3,rolling_mean,request_count
0,2021-03-27 00:00:00,0,0,0,3,1,5,0.000000,0.000000,5.0,1.666667,0.600623
1,2021-03-27 00:30:00,0,30,0,3,1,5,0.600623,0.000000,0.0,0.200208,0.310167
2,2021-03-27 01:00:00,0,0,1,3,1,5,0.310167,0.600623,0.0,0.303597,0.193206
3,2021-03-27 00:00:00,1,0,0,3,1,5,8.000000,23.000000,33.0,21.333333,6.667937
4,2021-03-27 00:30:00,1,30,0,3,1,5,6.667937,8.000000,23.0,12.555979,5.121644
...,...,...,...,...,...,...,...,...,...,...,...,...
145,2021-03-27 00:30:00,48,30,0,3,1,5,1.154080,1.000000,4.0,2.051360,0.666718
146,2021-03-27 01:00:00,48,0,1,3,1,5,0.666718,1.154080,1.0,0.940266,0.398013
147,2021-03-27 00:00:00,49,0,0,3,1,5,6.000000,9.000000,8.0,7.666667,2.842054
148,2021-03-27 00:30:00,49,30,0,3,1,5,2.842054,6.000000,9.0,5.947351,1.957306


In [None]:
# Write the forecast table to CSV without the row index.
data_pred.to_csv("../data/prediction_result.csv",index=False)