## NYC Taxi Fare Prediction with PyTorch

### *read data (csv)* -> *data preprocess* -> *train model* -> *predict using the trained model*
### pandas -> pandas.abs() -> Torch (SGD) -> ??


### To run it on mutlti-nodes, need to run below shell:

In [None]:
! horovodrun -np 4 -H hdfs1:2,hdfs2:2 python /home/sparkuser/jupyter/Bin/NYC_Taxi_Fare/pytorch_horovod.py

[1,2]<stdout>:Data Load Consuming Time:
[1,2]<stdout>:0:01:20.078336
[1,2]<stdout>:finished
[1,2]<stdout>:key                    0
[1,2]<stdout>:fare_amount            0
[1,2]<stdout>:pickup_datetime        0
[1,2]<stdout>:pickup_longitude       0
[1,2]<stdout>:pickup_latitude        0
[1,2]<stdout>:dropoff_longitude    376
[1,2]<stdout>:dropoff_latitude     376
[1,2]<stdout>:passenger_count        0
[1,2]<stdout>:dtype: int64
[1,2]<stdout>:Old size 55423856
[1,3]<stdout>:Data Load Consuming Time:
[1,3]<stdout>:0:01:25.886857
[1,3]<stdout>:finished
[1,3]<stdout>:key                    0
[1,3]<stdout>:fare_amount            0
[1,3]<stdout>:pickup_datetime        0
[1,3]<stdout>:pickup_longitude       0
[1,3]<stdout>:pickup_latitude        0
[1,3]<stdout>:dropoff_longitude    376
[1,3]<stdout>:dropoff_latitude     376
[1,3]<stdout>:passenger_count        0
[1,3]<stdout>:dtype: int64
[1,3]<stdout>:Old size 55423856
[1,2]<stdout>:New size 55423480
[1,3]<stdout>:New size 55423480
[1,2]<stdo

In [1]:
import numpy as np
import pandas as pd
import os
# Add horovod with torch import
import horovod.torch as hvd
from datetime import datetime
%matplotlib inline

### Initialize Horovod

In [2]:
hvd.init()

### load data by pandas

In [4]:
time1=datetime.now()
PATH = './input'
os.listdir(PATH)
train_df = pd.read_csv(f'{PATH}/train.csv')
#data size: 5.4GB 
time2=datetime.now()
data_load_time=time2-time1
print("Data Load Consuming Time:")
print(data_load_time)
print("finished")
train_df.dtypes

Data Load Consuming Time:
0:01:17.617604
finished


key                   object
fare_amount          float64
pickup_datetime       object
pickup_longitude     float64
pickup_latitude      float64
dropoff_longitude    float64
dropoff_latitude     float64
passenger_count        int64
dtype: object

### data processing

In [5]:
time3=datetime.now()
# Check NaNs in the dataset
print(train_df.isnull().sum())


print('Old size %d'% len(train_df))
train_df = train_df.dropna(how='any',axis='rows')
print('New size %d' % len(train_df))
# travel vectors between start and end points for the taxi ride, in both longitude and latitude coordinates 
# Given a dataframe, add two new features 'abs_diff_longitude' and
# 'abs_diff_latitude' reprensenting the "Manhattan vector" from
# the pickup location to the dropoff location.

# remove the bizzare travelling distance
def add_travel_vector_features(df):
    df['abs_diff_longitude'] = (df.dropoff_longitude - df.pickup_longitude).abs()
    df['abs_diff_latitude'] = (df.dropoff_latitude - df.pickup_latitude).abs()
add_travel_vector_features(train_df)

#plot a subset of travel vector to see its distribution 
#plot = train_df.iloc[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')

#there are some further data processing skiped below

#We expect most of these values to be very small (likely between 0 and 1) since it should all 
#be differences between GPS coordinates within one city. For reference, one degree of latitude is about 69 miles. 
#However, we can see the dataset has extreme values which do not make sense. 
#Let's remove those values from our training set. Based on the scatterplot, 
#it looks like we can safely exclude values above 5 (though remember the scatterplot is only showing the first 2000 rows...)

#print('Old size: %d' % len(train_df))
#train_df = train_df[(train_df.abs_diff_longitude < 5.0) & (train_df.abs_diff_latitude < 5.0)]
#print('New size: %d' % len(train_df))


train_df = train_df[(train_df.abs_diff_longitude<5) & (train_df.abs_diff_latitude<5)]
print(len(train_df))

time4=datetime.now()
data_processing_time=time4-time3
print("Data Processing Consuming Time:")
print(data_processing_time)
data_prepare_time=data_load_time+data_processing_time
print("Data prepare Consuming Time:")
print(data_prepare_time)

key                    0
fare_amount            0
pickup_datetime        0
pickup_longitude       0
pickup_latitude        0
dropoff_longitude    376
dropoff_latitude     376
passenger_count        0
dtype: int64
Old size 55423856
New size 55423480
55308916
Data Processing Consuming Time:
0:00:24.247500
Data prepare Consuming Time:
0:01:41.865104


### model training using Horovod with Pytorch

In [None]:
import torch
import torch.nn as nn
from torch.autograd import Variable
from sklearn.model_selection import train_test_split

time5=datetime.now()

#A sequential container. Modules will be added to it in the order they are passed in the constructor. 
#Alternatively, an ordered dict of modules can also be passed in.

#一个有序的容器，神经网络模块将按照在传入构造器的顺序依次被添加到计算图中执行，
#同时以神经网络模块为元素的有序字典也可以作为传入参数。

#a three layer NN model 

model = nn.Sequential(nn.Linear(2, 10),
                     nn.Linear(10, 5),
                      nn.Linear(5, 1))

criterion = torch.nn.MSELoss()

# bigger learning rate 
optimizer1 = torch.optim.SGD(model.parameters(), lr=0.01)

#Add Horovod Distributed Optimizer
optimizer = hvd.DistributedOptimizer(optimizer1, named_parameters=model.named_parameters())
# Broadcast parameters from rank 0 to all other processes.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)

X = np.stack((train_df.abs_diff_latitude.values,train_df.abs_diff_longitude.values)).T
X = torch.from_numpy(X)
X = X.type(torch.FloatTensor)

y = torch.from_numpy(train_df.fare_amount.values.T)
y = y.type(torch.FloatTensor)
y.unsqueeze_(-1)
X_train, X_evalutation, y_train, y_evalutation = train_test_split(X, y, test_size = 0.3, random_state = 0)
print("Train Size:")
print(len(X_train))

for epoch in range(100):
    # Forward Propagation
    y_pred = model(X_train)
    # Compute and print loss
    loss = criterion(y_pred, y_train)
    print('epoch: ', epoch,' loss: ', loss.item())
    # Zero the gradients
    optimizer.zero_grad()
    
    # perform a backward pass (backpropagation)
    loss.backward()
    
    # Update the parameters
    optimizer.step()

    # smaller LR 
optimizer2 = torch.optim.SGD(model.parameters(), lr=0.001)

# Met some issue when using horovord, so we switched to single nodes 
# Next: Figure out the reason 
#optimizer = hvd.DistributedOptimizer(optimizer2, named_parameters=model.named_parameters())
# Broadcast parameters from rank 0 to all other processes.
#hvd.broadcast_parameters(model.state_dict(), root_rank=0)

for epoch in range(1500):
    # Forward Propagation
    y_pred = model(X_train)
    # Compute and print loss
    loss = criterion(y_pred, y_train)
    print('epoch: ', epoch,' loss: ', loss.item())
    # Zero the gradients
    
    
    optimizer.zero_grad()
    
    # perform a backward pass (backpropagation)
    loss.backward()
    
    # Update the parameters
    optimizer.step()

time6=datetime.now()
model_train_time=time6-time5
print("Model Train Consuming Time:")
print(model_train_time)

Train Size:
38716241
epoch:  0  loss:  309.7013244628906
epoch:  1  loss:  296.62841796875
epoch:  2  loss:  281.2621765136719
epoch:  3  loss:  260.4698181152344
epoch:  4  loss:  233.14085388183594
epoch:  5  loss:  205.31649780273438
epoch:  6  loss:  192.37327575683594
epoch:  7  loss:  191.79986572265625
epoch:  8  loss:  191.70750427246094
epoch:  9  loss:  191.6248321533203
epoch:  10  loss:  191.5419464111328
epoch:  11  loss:  191.4582061767578
epoch:  12  loss:  191.37356567382812
epoch:  13  loss:  191.28790283203125
epoch:  14  loss:  191.2012176513672
epoch:  15  loss:  191.11337280273438
epoch:  16  loss:  191.02439880371094
epoch:  17  loss:  190.93421936035156
epoch:  18  loss:  190.84268188476562
epoch:  19  loss:  190.74977111816406
epoch:  20  loss:  190.65550231933594
epoch:  21  loss:  190.5596923828125
epoch:  22  loss:  190.46214294433594
epoch:  23  loss:  190.36280822753906
epoch:  24  loss:  190.26174926757812
epoch:  25  loss:  190.1588592529297
epoch:  26  l

## Evalutation 

In [6]:
time7=datetime.now()

def RMSE(x,y):
    criterion = nn.MSELoss()
    loss = torch.sqrt(criterion(x, y))
    return loss
print(X_evalutation)
y_evalutation_result=model(X_evalutation)
print (y_evalutation_result)

rmse=RMSE(y_evalutation_result,y_evalutation)

print("RMSE Value:")
print(rmse)

time8=datetime.now()
evalutation_time=time8-time7
print("Evalutation Consuming Time:")
print(evalutation_time)

print("Data Load Consuming Time:")
print(data_load_time)
print("Data prepare Consuming Time:")
print(data_prepare_time)
print("Model Train Consuming Time:")
print(model_train_time)
print("Evalutation Consuming Time:")
print(evalutation_time)
total_time=data_load_time+data_prepare_time+model_train_time+evalutation_time
print("Total Consuming Time:")
print(total_time)

tensor([[0.0238, 0.0207],
        [0.0485, 0.0407],
        [0.0046, 0.0031],
        ...,
        [0.0869, 0.0625],
        [0.0261, 0.1157],
        [0.0167, 0.0270]])
tensor([[11.2868],
        [16.1786],
        [ 7.1883],
        ...,
        [22.4268],
        [25.2755],
        [11.6330]], grad_fn=<AddmmBackward>)
RMSE Value:
tensor(6.7698, grad_fn=<SqrtBackward>)
Evalutation Consuming Time:
0:00:00.066143
Data Load Consuming Time:
0:00:13.623180
Data prepare Consuming Time:
0:00:17.561898
Model Train Consuming Time:
0:23:56.400328
Evalutation Consuming Time:
0:00:00.066143
Total Consuming Time:
0:24:27.651549


### Predict

In [7]:
test_df = pd.read_csv(f'{PATH}/test.csv')
add_travel_vector_features(test_df)
X_test = np.stack((test_df.abs_diff_latitude.values,test_df.abs_diff_longitude.values)).T
X_test = torch.from_numpy(X_test)
X_test = X_test.type(torch.FloatTensor)
y_test = model(X_test)

y_test = y_test.detach().numpy()
y_test = y_test.reshape(-1)

submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': y_test},
    columns = ['key', 'fare_amount'])
submission.to_csv('submission.csv', index = False)

print(y_test)

print(os.listdir('.'))

[ 9.144782  9.701579  7.190537 ... 42.10261  18.43134   8.558159]
['input', 'submission.csv', 'PyTorch_NY_Taxi_Fare_Predict_backup.ipynb', '.ipynb_checkpoints', 'PyTorch_NY_Taxi_Fare_Predict.ipynb']
