In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns # for statistical visualization
import matplotlib.pyplot as plt
plt.style.use('ggplot') # Set style for plotting

import dask.dataframe as dd
import os
from tqdm import tqdm

# Paths of the training and test CSV files read later in the notebook.
TRAIN_PATH = '../input/train.csv'
TEST_PATH = '../input/test.csv'
# Input data files are available in the "../input/" directory.
# Running this cell (click run or press Shift+Enter) lists the files in that directory.

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

In [None]:
def read_data(PATH, traintypes, chunksize=5_000_000):
    """
    Read a CSV file in chunks and return the chunks as a list of DataFrames.

    Parameters
    ----------
    PATH : path of the CSV file to read.
    traintypes : dict mapping column name -> dtype. Only these columns are
        read, and each is loaded with the given dtype (chosen by the caller to
        keep memory usage low). It must contain 'pickup_datetime', read as str.
    chunksize : number of rows per chunk (default 5 million; try 10 million
        if memory allows).

    Returns
    -------
    list of pandas.DataFrame, one per chunk, with 'pickup_datetime' converted
    to a UTC datetime column.
    """
    # NOTE: a `%%time` cell magic used to sit here; cell magics are only valid
    # as the first line of a cell, so inside the function it was a SyntaxError.
    cols = list(traintypes.keys())

    df_list = []  # one DataFrame per chunk
    for df_chunk in tqdm(pd.read_csv(PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)):
        # Neat trick from https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
        # Keep only 'YYYY-MM-DD HH:MM' and parse with an explicit format;
        # using parse_dates would be much slower.
        df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0, 16)
        df_chunk['pickup_datetime'] = pd.to_datetime(df_chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
        # Each chunk could also be processed here (clean, feature-engineer, fit)
        # instead of being collected and merged by the caller.
        df_list.append(df_chunk)
    return df_list

In [None]:
# Column -> dtype map for the training CSV; narrow dtypes keep memory usage down.
traintypes = {'fare_amount': 'float32',
              'pickup_datetime': 'str',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}

# Read the file chunk by chunk and merge only the first four chunks into one dataframe
df_list = read_data(TRAIN_PATH, traintypes)
train_df = pd.concat(df_list[0:4])

# Delete the dataframe list to release memory
del df_list

# See what we have loaded. DataFrame.info() prints its summary itself and
# returns None, so it is called directly rather than wrapped in display(),
# which would render a stray "None".
train_df.info()

In [None]:
# Show the first rows, the last rows and the summary statistics of the training frame.
for preview in (train_df.head(), train_df.tail(), train_df.describe()):
    display(preview)

In [None]:
# Save the loaded frame to a Feather file in the working directory.
train_df.to_feather('nyc_taxi_data_raw.feather')


In [None]:
# Load the frame back from the Feather file (replaces the in-memory train_df).
train_df = pd.read_feather('nyc_taxi_data_raw.feather')


In [None]:
# Histograms of train_df's columns, 50 bins each, on a 20x15 figure.
plot = train_df.hist(bins=50, figsize=(20,15))

In [None]:
# Keep only rides with a strictly positive fare, reporting the row count before and after.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
has_positive_fare = train_df['fare_amount'] > 0
train_df = train_df[has_positive_fare]
print('New size: %d' % len(train_df))

In [None]:
# Number of missing values in each column.
print(train_df.isnull().sum())

In [None]:
# Drop every row that has at least one missing value, reporting the row count before and after.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
train_df = train_df.dropna(axis=0, how='any')
print('New size: %d' % len(train_df))

In [None]:
# Keep rides with at most five passengers, reporting the row count before and after.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
at_most_five = train_df['passenger_count'] <= 5
train_df = train_df[at_most_five]
print('New size: %d' % len(train_df))

In [None]:
# Smallest and largest longitude over both pickup and drop-off points (train set).
lon_low = min(train_df['pickup_longitude'].min(), train_df['dropoff_longitude'].min())
lon_high = max(train_df['pickup_longitude'].max(), train_df['dropoff_longitude'].max())
lon_low, lon_high

In [None]:
# Smallest and largest latitude over both pickup and drop-off points (train set).
lat_low = min(train_df['pickup_latitude'].min(), train_df['dropoff_latitude'].min())
lat_high = max(train_df['pickup_latitude'].max(), train_df['dropoff_latitude'].max())
lat_low, lat_high

In [None]:
def select_within_nyc(df, BB):
    """
    Return a boolean mask of the rows whose pickup AND drop-off points both
    lie inside the bounding box.

    BB is indexed as (min_longitude, max_longitude, min_latitude, max_latitude);
    all four bounds are inclusive.
    """
    lon_lo, lon_hi, lat_lo, lat_hi = BB[0], BB[1], BB[2], BB[3]

    pickup_inside = (
        (df.pickup_longitude >= lon_lo) & (df.pickup_longitude <= lon_hi)
        & (df.pickup_latitude >= lat_lo) & (df.pickup_latitude <= lat_hi)
    )
    dropoff_inside = (
        (df.dropoff_longitude >= lon_lo) & (df.dropoff_longitude <= lon_hi)
        & (df.dropoff_latitude >= lat_lo) & (df.dropoff_latitude <= lat_hi)
    )
    return pickup_inside & dropoff_inside
            

In [None]:
# Bounding box handed to select_within_nyc, which reads it as
# (min_longitude, max_longitude, min_latitude, max_latitude).
BB = (-74.5, -72.8, 40.5, 41.8)

In [None]:
# Keep only rides that start and end inside BB, reporting the row count before and after.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
inside_bb = select_within_nyc(train_df, BB)
train_df = train_df[inside_bb]
print('New size: %d' % len(train_df))

In [None]:
# Histograms of train_df's columns, 100 bins each, on a 20x15 figure.
plot = train_df.hist(bins=100, figsize=(20,15))

In [None]:
# Drop rides whose fare exceeds 100, reporting the row count before and after.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
fare_within_cap = train_df['fare_amount'] <= 100
train_df = train_df[fare_within_cap]
print('New size: %d' % len(train_df))

In [None]:
# Histograms of train_df's columns, 100 bins each, on a 20x15 figure.
plot = train_df.hist(bins=100, figsize=(20,15))

In [None]:
def haversine_np(lon1, lat1, lon2, lat2):
    """
    Great-circle (haversine) distance in kilometres between two points given
    in decimal degrees.

    Argument order is longitude first, then latitude, for each point.
    Array-like arguments must all have the same length.
    """
    earth_radius_km = 6367

    # Work in radians from here on.
    lon1 = np.radians(lon1)
    lat1 = np.radians(lat1)
    lon2 = np.radians(lon2)
    lat2 = np.radians(lat2)

    half_dlat = (lat2 - lat1) / 2.0
    half_dlon = (lon2 - lon1) / 2.0

    # Haversine term, then the central angle it corresponds to.
    hav = np.sin(half_dlat)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(half_dlon)**2
    central_angle = 2 * np.arcsin(np.sqrt(hav))

    return earth_radius_km * central_angle

In [None]:
# Haversine pickup-to-dropoff distance (km, as returned by haversine_np) for every ride.
train_df["distance"] = haversine_np(train_df.pickup_longitude, train_df.pickup_latitude, train_df.dropoff_longitude, train_df.dropoff_latitude)

In [None]:
# Drop rides whose distance exceeds 40, reporting the row count before and after.
rows_before = len(train_df)
print('Old size: %d' % rows_before)
within_distance_cap = train_df['distance'] <= 40
train_df = train_df[within_distance_cap]
print('New size: %d' % len(train_df))

In [None]:
# Histogram of the distance column with 50 bins.
train_df.distance.hist(bins = 50)

In [None]:
# Calendar features derived from the pickup timestamp.
# Only month, weekday and hour are materialised; date, day, week of year,
# minute and day of year were tried and left disabled.
# The 'pickup_weekaday' spelling is what the later model cells select, so it is kept.
pickup_time = train_df.pickup_datetime.dt
for feature_name, dt_attr in (('pickup_month', 'month'),
                              ('pickup_weekaday', 'weekday'),
                              ('pickup_hour', 'hour')):
    train_df[feature_name] = getattr(pickup_time, dt_attr)

In [None]:
# Pickup locations inside a tighter bounding box, coloured by fare (fares below 50 only).
# select_within_nyc is the bounding-box helper defined in this notebook; the
# previously used name `select_within_boundingbox` does not exist (NameError).
idx = select_within_nyc(train_df, (-74.1, -73.7, 40.6, 40.9))
# `&` binds tighter than `<`, so the fare comparison must be parenthesised;
# otherwise the expression is evaluated as `(idx & fare_amount) < 50`.
plot = train_df[idx & (train_df.fare_amount < 50)].plot(kind="scatter", x="pickup_longitude", y="pickup_latitude",
                     alpha=0.5, figsize=(40, 30), s=0.01,
                    c='fare_amount', cmap=plt.get_cmap('jet'), colorbar=True)

In [None]:
def plot_hires(df, BB, figsize=(12, 12), ax=None, c=('r', 'b')):
    """
    Scatter-plot the pickup and drop-off points of the rides inside a bounding box.

    Parameters
    ----------
    df : DataFrame with pickup_/dropoff_ longitude and latitude columns.
    BB : bounding box, passed unchanged to select_within_nyc.
    figsize : size of the figure created when `ax` is not supplied.
    ax : matplotlib Axes to draw on; a new figure and axes are created if None.
    c : pair of colours, c[0] for pickups and c[1] for drop-offs.
    """
    # Identity check: `ax == None` would invoke the object's own __eq__.
    if ax is None:
        fig, ax = plt.subplots(1, 1, figsize=figsize)

    # select_within_nyc is the bounding-box helper defined in this notebook
    # (the old name `select_within_boundingbox` is not defined anywhere).
    idx = select_within_nyc(df, BB)
    inside = df[idx]  # filter once instead of once per coordinate column
    ax.scatter(inside.pickup_longitude, inside.pickup_latitude, c=c[0], s=0.01, alpha=0.5)
    ax.scatter(inside.dropoff_longitude, inside.dropoff_latitude, c=c[1], s=0.01, alpha=0.5)

In [None]:
# Two views of the pickup/drop-off points: a wider bounding box, then a
# narrower one that lies inside it.
plot_hires(train_df, (-74.1, -73.7, 40.6, 40.9))
plot_hires(train_df, (-74, -73.95, 40.7, 40.8))

In [None]:
from bokeh.plotting import figure, output_notebook, show # bokeh plotting library
# We'll show the plots in the cells of this notebook
output_notebook()

# NYC holds the (x_range, y_range) pair; the same statement also unpacks it
# into x_range and y_range, which base_plot reads as globals.
NYC = x_range, y_range = ((-74.05, -73.7), (40.6, 40.9))
# Plot size in pixels; the height is the width floor-divided by 1.2.
plot_width = int(750)
plot_height = int(plot_width//1.2)

import datashader as ds
from datashader import transfer_functions as tr_fns
from datashader.colors import Greys9
# First two entries of the reversed Greys9 palette.
Greys9_r = list(reversed(Greys9))[:2]
from datashader.bokeh_ext import InteractiveImage
from functools import partial
from datashader.utils import export_image
from datashader.colors import colormap_select, Greys9, Hot, viridis, inferno
from IPython.core.display import HTML, display

def base_plot(tools='pan, wheel_zoom, reset', plot_width=plot_width, plot_height=plot_height, **plot_args):
    """
    Create a borderless bokeh figure over the global x_range / y_range with
    no outline and no grid lines.

    Extra keyword arguments are forwarded to bokeh's `figure`.
    """
    no_borders = dict(min_border=0, min_border_left=0, min_border_right=0,
                      min_border_top=0, min_border_bottom=0)
    fig = figure(tools=tools, plot_width=plot_width, plot_height=plot_height,
                 x_range=x_range, y_range=y_range, outline_line_color=None,
                 **no_borders, **plot_args)

    # Hide both grids.
    for grid in (fig.xgrid, fig.ygrid):
        grid.grid_line_color = None
    return fig
# Background colour shared by the exported image and the figure.
background = "black"
# export: export_image pre-bound to the "export" path and the chosen background.
export = partial(export_image, export_path="export", background=background)
# cm: colormap_select with reverse=True when the background is black.
cm = partial(colormap_select, reverse=(background=="black"))

def create_image(x_range, y_range, w=plot_width, h=plot_height):
    """
    Render train_df's drop-off points over the given ranges as a datashader image.

    Points are aggregated on a w x h canvas by counting 'passenger_count'
    entries, shaded with the Hot colormap using 'eq_hist', then spread with
    dynspread (threshold=0.5, max_px=4).
    """
    canvas = ds.Canvas(plot_width=w, plot_height=h, x_range=x_range, y_range=y_range)
    dropoff_counts = canvas.points(
        train_df, 'dropoff_longitude', 'dropoff_latitude', ds.count('passenger_count')
    )
    shaded = tr_fns.shade(dropoff_counts, cmap=Hot, how='eq_hist')
    return tr_fns.dynspread(shaded, threshold=0.5, max_px=4)

# Build the figure with the chosen background colour.
p = base_plot(background_fill_color=background)
# Export the image for the NYC ranges under the name "NYCT_hot".
export(create_image(*NYC), "NYCT_hot")
# Attach create_image to the figure as an interactive image.
InteractiveImage(p, create_image)

In [None]:
# Pairwise correlations between the numeric columns only. train_df also holds
# the tz-aware 'pickup_datetime' column, which DataFrame.corr() cannot convert
# to float on recent pandas versions, so non-numeric columns are excluded explicitly.
corr_matrix = train_df.select_dtypes(include='number').corr()

In [None]:
# Correlation of every column with fare_amount, largest first.
corr_matrix.fare_amount.sort_values(ascending=False)

In [None]:
# Correlation of every column with distance, largest first.
corr_matrix.distance.sort_values(ascending=False)

In [None]:
# Fare against trip distance.
train_df.plot.scatter(x = "distance", y = "fare_amount", s = 0.4, figsize=(30,20))

In [None]:
# Fare against pickup hour.
train_df.plot.scatter(x = "pickup_hour", y = "fare_amount", s = 0.7, figsize=(20,10))

In [None]:
# Trip distance against pickup hour.
train_df.plot.scatter(x = "pickup_hour", y = "distance", s = 0.7, figsize=(20,10))

In [None]:
def add_airport_dist(dataset):
    """
    Add one landmark-distance column per landmark and return the dataset.

    For every landmark the stored value is the SUM of
    (pickup point -> landmark) and (landmark -> drop-off point), each computed
    with haversine_np. The input frame is modified in place and also returned.

    Columns added (in this order):
        jfk_dist - JFK: John F. Kennedy International Airport
        ewr_dist - EWR: Newark Liberty International Airport
        lga_dist - LGA: LaGuardia Airport
        sol_dist - SOL: Statue of Liberty
        nyc_dist - NYC: New York city centre

    The dataset's coordinate columns must still be in decimal degrees when
    this is called.
    """
    # (latitude, longitude) of each landmark, in decimal degrees.
    landmarks = (
        ('jfk', (40.639722, -73.778889)),
        ('ewr', (40.6925, -74.168611)),
        ('lga', (40.77725, -73.872611)),
        ('sol', (40.6892, -74.0445)),
        ('nyc', (40.7141667, -74.0063889)),
    )

    pickup_lat = dataset['pickup_latitude']
    dropoff_lat = dataset['dropoff_latitude']
    pickup_lon = dataset['pickup_longitude']
    dropoff_lon = dataset['dropoff_longitude']

    for name, (landmark_lat, landmark_lon) in landmarks:
        # haversine_np expects (lon1, lat1, lon2, lat2). The previous code
        # passed latitude first, which swapped the roles of the coordinates
        # and produced wrong distances.
        pickup_leg = haversine_np(pickup_lon, pickup_lat, landmark_lon, landmark_lat)
        dropoff_leg = haversine_np(landmark_lon, landmark_lat, dropoff_lon, dropoff_lat)
        dataset[name + '_dist'] = pickup_leg + dropoff_leg

    return dataset

In [None]:
# Add the jfk/ewr/lga/sol/nyc distance columns to the training frame.
train_df = add_airport_dist(train_df)

In [None]:
def sphere_dist_bear(pickup_lat, pickup_lon, dropoff_lat, dropoff_lon):
    """
    Return the initial great-circle bearing from pickup to drop-off, in radians.

    Inputs are in decimal degrees (scalars or equal-length arrays/Series).
    The result lies in [-pi, pi]: 0 is due north, +pi/2 due east,
    -pi/2 due west.

    Formula:
        bearing = atan2(sin(dlon) * cos(lat2),
                        cos(lat1) * sin(lat2) - sin(lat1) * cos(lat2) * cos(dlon))
    with dlon = lon2 - lon1.

    Two defects of the earlier version are fixed here: the first atan2
    argument was written as sin(dlon * cos(lat2)), and dlon was taken as
    pickup - dropoff, which mirrored the bearing east/west.
    """
    # Convert degrees to radians
    pickup_lat, pickup_lon, dropoff_lat, dropoff_lon = map(np.radians,
                                                             [pickup_lat, pickup_lon,
                                                              dropoff_lat, dropoff_lon])
    # Longitude difference, destination minus origin
    dlon = dropoff_lon - pickup_lon

    east_component = np.sin(dlon) * np.cos(dropoff_lat)
    north_component = (np.cos(pickup_lat) * np.sin(dropoff_lat)
                       - np.sin(pickup_lat) * np.cos(dropoff_lat) * np.cos(dlon))
    return np.arctan2(east_component, north_component)

In [None]:
def radian_conv(degree):
    """
    Convert an angle (scalar or array-like) from degrees to radians.
    """
    in_radians = np.radians(degree)
    return in_radians


In [None]:
# Bearing is computed first, while the coordinates are still in degrees ...
train_df['bearing'] = sphere_dist_bear(
    train_df['pickup_latitude'], train_df['pickup_longitude'],
    train_df['dropoff_latitude'], train_df['dropoff_longitude'],
)
# ... then the four coordinate columns are overwritten with their radian values.
for coord_col in ('pickup_latitude', 'pickup_longitude',
                  'dropoff_latitude', 'dropoff_longitude'):
    train_df[coord_col] = radian_conv(train_df[coord_col])


In [None]:
import gc
# Run a garbage-collection pass before fitting the model.
gc.collect()
# Show the columns now present in the training frame.
train_df.columns

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

# Columns fed to the linear model, in the order the coefficients are later read.
feature_columns = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'distance',
    'pickup_month', 'pickup_weekaday', 'pickup_hour', 'jfk_dist',
    'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'bearing',
]

# Standardise the features, then fit an ordinary linear regression on the fare.
scale_step = ("standard_scaler", StandardScaler())
regress_step = ("lin_reg", LinearRegression())
model_lin = Pipeline((scale_step, regress_step))

model_lin.fit(train_df[feature_columns].values, train_df['fare_amount'].values)

In [None]:
# Pull the fitted LinearRegression step out of the pipeline.
linear_regressor = model_lin.named_steps['lin_reg']

In [None]:
# One-row frame holding the absolute value of each regression coefficient,
# labelled with the feature it belongs to (same order as used for fitting).
coefficient_names = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'distance',
    'pickup_month', 'pickup_weekaday', 'pickup_hour', 'jfk_dist',
    'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'bearing',
]
absolute_coefficients = np.abs(linear_regressor.coef_)
meanplot = pd.DataFrame([absolute_coefficients], columns=coefficient_names)

In [None]:
# Bar chart of the absolute coefficient of each feature.
meanplot.plot.bar(figsize = (20,10))

In [None]:
import lightgbm as lgbm
import gc


In [None]:
# Column -> dtype map for the test CSV (same as for training, without fare_amount).
testtypes = { 'pickup_datetime': 'str',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}

# Merge all dataframes into one dataframe
df_list = read_data(TEST_PATH, testtypes)
test_df = pd.concat(df_list)

# Delete the dataframe list to release memory
del df_list

# See what we have loaded. DataFrame.info() prints its summary itself and
# returns None, so it is called directly rather than wrapped in display(),
# which would render a stray "None".
test_df.info()

In [None]:
# Build on the test set the same engineered columns the model was trained on.
test_df["distance"] = haversine_np(
    test_df.pickup_longitude, test_df.pickup_latitude,
    test_df.dropoff_longitude, test_df.dropoff_latitude,
)

# Calendar features from the pickup timestamp ('pickup_weekaday' spelling matches
# the column name the model cells select).
pickup_time = test_df.pickup_datetime.dt
for feature_name, dt_attr in (('pickup_month', 'month'),
                              ('pickup_weekaday', 'weekday'),
                              ('pickup_hour', 'hour')):
    test_df[feature_name] = getattr(pickup_time, dt_attr)

test_df = add_airport_dist(test_df)

# Bearing is computed while the coordinates are still in degrees ...
test_df['bearing'] = sphere_dist_bear(
    test_df['pickup_latitude'], test_df['pickup_longitude'],
    test_df['dropoff_latitude'], test_df['dropoff_longitude'],
)
# ... then the four coordinate columns are overwritten with their radian values.
for coord_col in ('pickup_latitude', 'pickup_longitude',
                  'dropoff_latitude', 'dropoff_longitude'):
    test_df[coord_col] = radian_conv(test_df[coord_col])
                                                                    


In [None]:
# Load the sample submission file; its 'key' column is reused for the final submission.
sample_submission = pd.read_csv('../input/sample_submission.csv')

In [None]:
# Predict fares for the test set with the fitted linear pipeline, using the
# same feature columns (and order) as in training.
prediction_columns = [
    'pickup_longitude', 'pickup_latitude',
    'dropoff_longitude', 'dropoff_latitude', 'passenger_count', 'distance',
    'pickup_month', 'pickup_weekaday', 'pickup_hour', 'jfk_dist',
    'ewr_dist', 'lga_dist', 'sol_dist', 'nyc_dist', 'bearing',
]
test_features = test_df[prediction_columns].values
y_pred_final = model_lin.predict(test_features)

# Pair each prediction with the key from the sample submission and write the CSV.
submission_data = {'key': sample_submission.key, 'fare_amount': y_pred_final}
submission = pd.DataFrame(submission_data, columns=['key', 'fare_amount'])
submission.to_csv('submission.csv', index=False)

In [None]:
# List the columns of the test frame. `keys` is a method: without the call
# parentheses the cell only shows the bound-method repr, not the column index.
test_df.keys()