In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Load the raw trips table from a CSV file located in the working directory
taxiDB = pd.read_csv('taxi_dataset.csv')

In [4]:
# Preview the first five rows of the loaded table
taxiDB.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N


<dl>
<dt>Описание колонок:</dt>
<dd>id - ID поездки </dd>
<dd>vendor_id - ID компании, осуществляющей перевозку </dd>
<dd>pickup_datetime - Таймкод начала поездки</dd>
<dd>dropoff_datetime - Таймкод конца поездки </dd>
<dd>passenger_count - Количество пассажиров </dd>
<dd>pickup_longitude - Долгота точки, в которой началась поездка </dd>
<dd>pickup_latitude - Широта точки, в которой началась поездка </dd>
<dd>dropoff_longitude - Долгота точки, в которой закончилась поездка </dd>
<dd>dropoff_latitude - Широта точки, в которой закончилась поездка </dd>
<dd>store_and_fwd_flag - Yes/No: Была ли информация сохранена в памяти транспортного средства из-за потери соединения с сервером </dd>
</dl>

In [5]:
# Create the "trip duration" column: drop-off time minus pick-up time, in seconds
taxiDB['trip_duration'] = (
    pd.to_datetime(taxiDB['dropoff_datetime'])
    .sub(pd.to_datetime(taxiDB['pickup_datetime']))
    .dt.total_seconds()
)

taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,dropoff_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,2,2016-03-14 17:24:55,2016-03-14 17:32:30,1,-73.982155,40.767937,-73.96463,40.765602,N,455.0
1,id2377394,1,2016-06-12 00:43:35,2016-06-12 00:54:38,1,-73.980415,40.738564,-73.999481,40.731152,N,663.0
2,id3858529,2,2016-01-19 11:35:24,2016-01-19 12:10:48,1,-73.979027,40.763939,-74.005333,40.710087,N,2124.0
3,id3504673,2,2016-04-06 19:32:31,2016-04-06 19:39:40,1,-74.01004,40.719971,-74.012268,40.706718,N,429.0
4,id2181028,2,2016-03-26 13:30:55,2016-03-26 13:38:10,1,-73.973053,40.793209,-73.972923,40.78252,N,435.0


In [6]:
# Remove dropoff_datetime to avoid data leakage
# (trip_duration was computed from it, so keeping it would reveal the target)
taxiDB = taxiDB.drop(columns=['dropoff_datetime'])

In [7]:
# Check the table dimensions as (rows, columns)
taxiDB.shape

(1458644, 10)

In [8]:
# Save a snapshot of the table at this stage to initial_data.csv,
# written with 'id' as the index column (taxiDB itself is not modified)
taxiDB.set_index('id').to_csv('initial_data.csv')

In [9]:
# Bring vendor_id to 0/1 form by subtracting one
# (assumes the raw ids are 1 and 2, as in the preview — verify on the full data)
taxiDB['vendor_id'] = taxiDB['vendor_id'].sub(1)
taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,1,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,N,455.0
1,id2377394,0,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,N,663.0
2,id3858529,1,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,N,2124.0
3,id3504673,1,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,N,429.0
4,id2181028,1,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,N,435.0


In [10]:
# Bring the flag to 0/1 form: 'N' -> 0, any other value -> 1.
# A vectorized comparison replaces the per-row Python lambda; the resulting
# values are the same, it just avoids calling Python code once per row.
taxiDB['store_and_fwd_flag'] = taxiDB['store_and_fwd_flag'].ne('N').astype('int64')
taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,1,2016-03-14 17:24:55,1,-73.982155,40.767937,-73.96463,40.765602,0,455.0
1,id2377394,0,2016-06-12 00:43:35,1,-73.980415,40.738564,-73.999481,40.731152,0,663.0
2,id3858529,1,2016-01-19 11:35:24,1,-73.979027,40.763939,-74.005333,40.710087,0,2124.0
3,id3504673,1,2016-04-06 19:32:31,1,-74.01004,40.719971,-74.012268,40.706718,0,429.0
4,id2181028,1,2016-03-26 13:30:55,1,-73.973053,40.793209,-73.972923,40.78252,0,435.0


In [11]:
# Pool pick-up and drop-off latitudes into a single Python list
allLat = taxiDB['pickup_latitude'].tolist() + taxiDB['dropoff_latitude'].tolist()

In [12]:
# Median latitude: the element at position len // 2 of the sorted values
# (for an even number of values this is the upper of the two middle elements)
medianLat = sorted(allLat)[len(allLat) // 2]

In [13]:
# Convert latitude to kilometres, measured from the median latitude
latMultiplier = 111.32  # scale factor applied per degree of latitude

taxiDB['pickup_latitude'] = taxiDB['pickup_latitude'].sub(medianLat).mul(latMultiplier)
taxiDB['dropoff_latitude'] = taxiDB['dropoff_latitude'].sub(medianLat).mul(latMultiplier)

In [14]:
# Roughly how many kilometres north or south (depending on the sign) each point lies relative to the median latitude
taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,1,2016-03-14 17:24:55,1,-73.982155,1.516008,-73.96463,1.256121,0,455.0
1,id2377394,0,2016-06-12 00:43:35,1,-73.980415,-1.753813,-73.999481,-2.578912,0,663.0
2,id3858529,1,2016-01-19 11:35:24,1,-73.979027,1.070973,-74.005333,-4.923841,0,2124.0
3,id3504673,1,2016-04-06 19:32:31,1,-74.01004,-3.823568,-74.012268,-5.298809,0,429.0
4,id2181028,1,2016-03-26 13:30:55,1,-73.973053,4.329328,-73.972923,3.139453,0,435.0


In [15]:
# Same procedure for longitude
allLong = taxiDB['pickup_longitude'].tolist() + taxiDB['dropoff_longitude'].tolist()

# element at position len // 2 of the sorted longitudes
medianLong = sorted(allLong)[len(allLong) // 2]

# the latitude factor scaled by the cosine of the median latitude (degrees -> radians)
longMultiplier = 111.32 * np.cos(medianLat * (np.pi / 180.0))

In [16]:
# Convert longitude to kilometres, measured from the median longitude
taxiDB['pickup_longitude'] = taxiDB['pickup_longitude'].sub(medianLong).mul(longMultiplier)
taxiDB['dropoff_longitude'] = taxiDB['dropoff_longitude'].sub(medianLong).mul(longMultiplier)

taxiDB.head(5)

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration
0,id2875421,1,2016-03-14 17:24:55,1,-0.110015,1.516008,1.367786,1.256121,0,455.0
1,id2377394,0,2016-06-12 00:43:35,1,0.036672,-1.753813,-1.571088,-2.578912,0,663.0
2,id3858529,1,2016-01-19 11:35:24,1,0.153763,1.070973,-2.064547,-4.923841,0,2124.0
3,id3504673,1,2016-04-06 19:32:31,1,-2.4615,-3.823568,-2.649362,-5.298809,0,429.0
4,id2181028,1,2016-03-26 13:30:55,1,0.657515,4.329328,0.668452,3.139453,0,435.0


In [17]:
# Compute the geographic distance "distance_km":
# Euclidean norm of the (longitude, latitude) offsets, which are already in km.
# The offsets are computed inline, so no temporary columns are added and dropped.
taxiDB['distance_km'] = (
    taxiDB['dropoff_longitude'].sub(taxiDB['pickup_longitude']).pow(2)
    .add(taxiDB['dropoff_latitude'].sub(taxiDB['pickup_latitude']).pow(2))
    .pow(1/2)
)

In [18]:
# Preview the table with the new distance_km column
taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,distance_km
0,id2875421,1,2016-03-14 17:24:55,1,-0.110015,1.516008,1.367786,1.256121,0,455.0,1.500479
1,id2377394,0,2016-06-12 00:43:35,1,0.036672,-1.753813,-1.571088,-2.578912,0,663.0,1.807119
2,id3858529,1,2016-01-19 11:35:24,1,0.153763,1.070973,-2.064547,-4.923841,0,2124.0,6.39208
3,id3504673,1,2016-04-06 19:32:31,1,-2.4615,-3.823568,-2.649362,-5.298809,0,429.0,1.487155
4,id2181028,1,2016-03-26 13:30:55,1,0.657515,4.329328,0.668452,3.139453,0,435.0,1.189925


In [19]:
# Remove the original coordinate features
taxiDB = taxiDB.drop(columns=[
    'pickup_longitude',
    'dropoff_longitude',
    'pickup_latitude',
    'dropoff_latitude',
])

In [20]:
# Preview the table after the coordinate columns were removed
taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,store_and_fwd_flag,trip_duration,distance_km
0,id2875421,1,2016-03-14 17:24:55,1,0,455.0,1.500479
1,id2377394,0,2016-06-12 00:43:35,1,0,663.0,1.807119
2,id3858529,1,2016-01-19 11:35:24,1,0,2124.0,6.39208
3,id3504673,1,2016-04-06 19:32:31,1,0,429.0,1.487155
4,id2181028,1,2016-03-26 13:30:55,1,0,435.0,1.189925


In [21]:
# Number of trips for each distinct passenger_count value
taxiDB['passenger_count'].value_counts()

passenger_count
1    1033540
2     210318
5      78088
3      59896
6      48333
4      28404
0         60
7          3
9          1
8          1
Name: count, dtype: int64

In [22]:
# Mean target encoding of the passenger count:
# every row gets the average trip_duration of its passenger_count group
taxiDB['passenger_count'] = (
    taxiDB.groupby('passenger_count')['trip_duration'].transform('mean')
)

In [23]:
# Preview the table after encoding passenger_count
taxiDB.head()

Unnamed: 0,id,vendor_id,pickup_datetime,passenger_count,store_and_fwd_flag,trip_duration,distance_km
0,id2875421,1,2016-03-14 17:24:55,930.399753,0,455.0,1.500479
1,id2377394,0,2016-06-12 00:43:35,930.399753,0,663.0,1.807119
2,id3858529,1,2016-01-19 11:35:24,930.399753,0,2124.0,6.39208
3,id3504673,1,2016-04-06 19:32:31,930.399753,0,429.0,1.487155
4,id2181028,1,2016-03-26 13:30:55,930.399753,0,435.0,1.189925


In [24]:
# Use the trip id as the row index, so it is no longer a regular column
taxiDB = taxiDB.set_index('id')

In [25]:
# Preview the table, now indexed by id
taxiDB.head()

Unnamed: 0_level_0,vendor_id,pickup_datetime,passenger_count,store_and_fwd_flag,trip_duration,distance_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id2875421,1,2016-03-14 17:24:55,930.399753,0,455.0,1.500479
id2377394,0,2016-06-12 00:43:35,930.399753,0,663.0,1.807119
id3858529,1,2016-01-19 11:35:24,930.399753,0,2124.0,6.39208
id3504673,1,2016-04-06 19:32:31,930.399753,0,429.0,1.487155
id2181028,1,2016-03-26 13:30:55,930.399753,0,435.0,1.189925


In [26]:
# Off-the-shelf linear regression
from sklearn.linear_model import LinearRegression

model = LinearRegression()
# features: every column except the target and the raw pick-up timestamp
model.fit(
    X=taxiDB.drop(columns=['trip_duration', 'pickup_datetime']),
    y=taxiDB['trip_duration'],
)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


In [27]:
# Fitted coefficients and the intercept (bias term)
model.coef_.tolist(), model.intercept_

([198.46313674443144, 0.296312947952014, 56.46912165456772, 115.273537630476],
 171.65658145584132)

In [28]:
# Matrix inversion by Gauss-Jordan elimination (with partial pivoting)
def manual_inverse(A):
    """Invert a square matrix with Gauss-Jordan elimination.

    The matrix is augmented with the identity, ``[A | I]``, and reduced to
    ``[I | A^-1]``. Partial pivoting is used: before normalising column ``i``
    the row with the largest absolute entry in that column (at or below the
    diagonal) is swapped into place. Without this step a zero on the diagonal
    causes a division by zero even for perfectly invertible matrices.

    Parameters
    ----------
    A : array_like, shape (n, n)
        Matrix to invert. It is copied and converted to float.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        The inverse of ``A``.

    Raises
    ------
    ValueError
        If ``A`` is not a square 2-D matrix.
    numpy.linalg.LinAlgError
        If no non-zero pivot can be found, i.e. the matrix is singular.
    """
    A = np.asarray(A).astype(float)
    if A.ndim != 2 or A.shape[0] != A.shape[1]:
        raise ValueError("manual_inverse expects a square 2-D matrix")

    n = A.shape[0]
    AI = np.hstack([A, np.eye(n)])  # augmented matrix [A | I]

    for i in range(n):
        # Partial pivoting: choose the largest-magnitude candidate in column i
        pivot_row = i + int(np.argmax(np.abs(AI[i:, i])))
        if AI[pivot_row, i] == 0.0:
            raise np.linalg.LinAlgError("Singular matrix")
        if pivot_row != i:
            AI[[i, pivot_row]] = AI[[pivot_row, i]]

        # Make the diagonal element 1
        AI[i] = AI[i] / AI[i, i]

        # Eliminate column i from every other row in one vectorized step
        factors = AI[:, i].copy()
        factors[i] = 0.0  # the pivot row itself stays untouched
        AI -= np.outer(factors, AI[i])

    return AI[:, n:]

In [29]:
def Manual_inverse_regression(X, Y, fit_intercept=True):
    """Solve ordinary least squares through the normal equations.

    Computes ``(X^T X)^-1 X^T Y`` where the inverse is obtained with
    ``manual_inverse``.

    Parameters
    ----------
    X : array_like, shape (n_samples, n_features)
        Feature matrix.
    Y : array_like, shape (n_samples,) or (n_samples, n_targets)
        Target values.
    fit_intercept : bool, default True
        When True a column of ones is appended as the LAST column of the
        design matrix, so the intercept is the last element of the result.
        When False no column is added and only feature weights are returned.

    Returns
    -------
    numpy.ndarray
        Feature weights, followed by the intercept if ``fit_intercept``.
    """
    X = np.asarray(X)
    if fit_intercept:
        X = np.c_[X, np.ones(X.shape[0])]
    else:
        # np.c_ keeps a 2-D input as is and turns a 1-D input into a column
        X = np.c_[X]

    # Form X^T Y first: this avoids building the (n_features x n_samples)
    # intermediate matrix that (X^T X)^-1 X^T would require.
    return np.dot(manual_inverse(np.dot(X.T, X)), np.dot(X.T, Y))

In [33]:
# Linear regression via Cholesky decomposition of the normal equations
def linear_regression_cholesky(X, y):
    """Fit least-squares weights by factorising ``X_b^T X_b`` with Cholesky.

    A column of ones is prepended to ``X``, so the first element of the
    returned vector is the bias term and the rest are the feature weights.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    y : numpy.ndarray, shape (n_samples,) or (n_samples, n_targets)

    Returns
    -------
    numpy.ndarray
        Solution ``theta`` of ``(X_b^T X_b) theta = X_b^T y``.
    """
    # design matrix with a leading bias column
    design = np.c_[np.ones((X.shape[0], 1)), X]
    gram = design.T.dot(design)
    rhs = design.T.dot(y)

    # Cholesky factor (numpy built-in): gram = chol @ chol.T
    chol = np.linalg.cholesky(gram)

    # first solve chol @ z = rhs, then substitute into chol.T @ theta = z
    return np.linalg.solve(chol.T, np.linalg.solve(chol, rhs))


# Linear regression via the SVD pseudo-inverse
def linear_regression_svd(X, y):
    """Fit least-squares weights with the Moore-Penrose pseudo-inverse.

    A column of ones is prepended to ``X``, so the first element of the
    returned vector is the bias term and the rest are the feature weights.

    Singular values that are numerically zero are not inverted (their
    reciprocal is set to 0). For a rank-deficient design matrix this yields
    the minimum-norm least-squares solution instead of NaN/inf produced by
    dividing by zero.

    Parameters
    ----------
    X : numpy.ndarray, shape (n_samples, n_features)
    y : array_like, shape (n_samples,) or (n_samples, n_targets)

    Returns
    -------
    numpy.ndarray
        Weight vector ``theta = pinv(X_b) @ y``.
    """
    # prepend the bias column
    X_b = np.c_[np.ones((X.shape[0], 1)), X]

    # thin SVD: X_b = U @ diag(S) @ Vt
    U, S, Vt = np.linalg.svd(X_b, full_matrices=False)

    # Reciprocal of the singular values, skipping those below a relative
    # tolerance (machine epsilon scaled by matrix size and largest value).
    cutoff = np.finfo(S.dtype).eps * max(X_b.shape) * S.max(initial=0.0)
    significant = S > cutoff
    S_inv = np.zeros_like(S)
    S_inv[significant] = 1.0 / S[significant]

    # pinv(X_b) @ y evaluated right-to-left, so neither diag(S_inv) nor the
    # full pseudo-inverse matrix has to be materialised.
    projected = U.T @ y
    theta = Vt.T @ (projected.T * S_inv).T
    return theta

# Build the feature matrix and target right here: this cell sits above the
# cells that otherwise create X_train / Y_train, so on a fresh kernel
# (Restart & Run All) those names would not exist yet.
X_train = taxiDB.drop(['trip_duration', 'pickup_datetime'], axis=1).values
Y_train = taxiDB['trip_duration'].values

print("Cholesky:", linear_regression_cholesky(X_train, Y_train).ravel())
print("SVD:", linear_regression_svd(X_train, Y_train).ravel())

Cholesky: [171.65658145 198.46313674   0.29631295  56.46912165 115.27353763]
SVD: [171.65658146 198.46313674   0.29631295  56.46912165 115.27353763]


In [30]:
# Feature matrix (everything except the target and the raw timestamp) and target vector
X_train = taxiDB.drop(columns=['trip_duration', 'pickup_datetime']).to_numpy()
Y_train = taxiDB['trip_duration'].to_numpy()

# weights from the hand-written Gauss-Jordan inverse; the intercept is the last element
Manual_inverse_regression(X_train, Y_train)

array([198.46313674,   0.29631295,  56.46912165, 115.27353763,
       171.65658145])

In [31]:
# Closed-form (normal equations) solution of linear regression using numpy built-ins
def LinearRegressionByMatrix(X, Y, fit_intercept=True):
    """Solve ordinary least squares through the normal equations.

    Solves ``(X^T X) beta = X^T Y`` with ``np.linalg.solve`` rather than
    forming the explicit inverse, which is cheaper and numerically more
    accurate while giving the same mathematical result.

    Parameters
    ----------
    X : array_like, shape (n_samples, n_features)
        Feature matrix.
    Y : array_like, shape (n_samples,) or (n_samples, n_targets)
        Target values.
    fit_intercept : bool, default True
        When True a column of ones is appended as the LAST column of the
        design matrix, so the intercept is the last element of the result.
        When False no column is added and only feature weights are returned.

    Returns
    -------
    numpy.ndarray
        Feature weights, followed by the intercept if ``fit_intercept``.

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X^T X`` is singular.
    """
    X = np.asarray(X)
    if fit_intercept:
        X = np.c_[X, np.ones(X.shape[0])]
    else:
        # np.c_ keeps a 2-D input as is and turns a 1-D input into a column
        X = np.c_[X]

    return np.linalg.solve(np.dot(X.T, X), np.dot(X.T, Y))

In [32]:
# The values approximately match those obtained from the off-the-shelf linear regression
X_train = taxiDB.drop(columns=['trip_duration', 'pickup_datetime']).to_numpy()
Y_train = taxiDB['trip_duration'].to_numpy()

# the intercept is the last element of the returned vector
LinearRegressionByMatrix(X_train, Y_train)

array([198.46313674,   0.29631295,  56.46912165, 115.27353763,
       171.65658145])

In [33]:
# Fitted weights (feature coefficients followed by the intercept)
betas = LinearRegressionByMatrix(X_train, Y_train)

# In-sample predictions: features plus a trailing ones column, times the weights
np.c_[X_train, np.ones(len(X_train))] @ betas

array([ 818.7747282 ,  655.65912268, 1382.6469154 , ..., 1548.74134353,
        573.4306718 ,  578.2338068 ])