In [1]:
import pandas as pd
from sklearn.model_selection import KFold
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_validate


In [5]:
initial_data = pd.read_csv('../data/cross_validation/initial_data.csv', index_col='id')

initial_cols = ['vendor_id', 'passenger_count', 'pickup_longitude',
                'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
                'trip_duration']

initial_data = initial_data[initial_cols]

initial_data = initial_data.assign(log_trip_duration=np.log1p(initial_data['trip_duration']))
initial_data = initial_data.drop('trip_duration', axis=1)


X = initial_data.drop('log_trip_duration', axis=1)
y = initial_data['log_trip_duration']

X.head()

Unnamed: 0_level_0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
id2875421,2,1,-73.982155,40.767937,-73.96463,40.765602
id2377394,1,1,-73.980415,40.738564,-73.999481,40.731152
id3858529,2,1,-73.979027,40.763939,-74.005333,40.710087
id3504673,2,1,-74.01004,40.719971,-74.012268,40.706718
id2181028,2,1,-73.973053,40.793209,-73.972923,40.78252


In [6]:
splitter = KFold(n_splits=20, shuffle=True, random_state=33)

losses_test = []

for train_index, test_index in splitter.split(X):

    X_train, X_test = X.values[train_index], X.values[test_index]
    y_train, y_test = y.values[train_index], y.values[test_index]
    
    model = LinearRegression()
    model.fit(X_train, y_train)
    
    losses_test.append(    
        mean_squared_error(model.predict(X_test), y_test)
    )

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.2, 
                                                    random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

test_error = np.mean(
    mean_squared_error(model.predict(X_test), y_test)
)

In [8]:
print(f'{np.mean(losses_test):.3f}, {test_error:.3f}')

0.613, 0.606


## Processed data

In [3]:
processed_data = pd.read_csv('../data/cross_validation/processed_data.csv', index_col='id')

processed_data = processed_data.assign(log_trip_duration=np.log1p(processed_data['trip_duration']))
processed_data = processed_data.drop('trip_duration', axis=1)


X_2 = processed_data.drop('log_trip_duration', axis=1)
y_2 = processed_data['log_trip_duration']

X_2.head()

Unnamed: 0_level_0,vendor_id,passenger_count,store_and_fwd_flag,distance_km
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
id2875421,1,930.399753,0,1.500479
id2377394,0,930.399753,0,1.807119
id3858529,1,930.399753,0,6.39208
id3504673,1,930.399753,0,1.487155
id2181028,1,930.399753,0,1.189925


In [9]:
### Важно! Когда сравниваем модели по их качеству
### на валидации и на тесте, не шаффлим данные заново!


test_indexes = X_test.index
train_indexes = X_train.index

X_train_2 = X_2[X_2.index.isin(train_indexes)]
y_train_2 = y_2[y_2.index.isin(train_indexes)]

X_test_2 = X_2[X_2.index.isin(test_indexes)]
y_test_2 = y_2[y_2.index.isin(test_indexes)]

### Linear Regression on all train in refreshed dataset

In [11]:
model_2 = LinearRegression()
model_2.fit(X_train_2, y_train_2)

model_2_error = mean_squared_error(model_2.predict(X_test_2), y_test_2)
print(f'{model_2_error:.3f}')

0.407


### Cross-validation

In [18]:
losses_test_2 = []

for train_index, test_index in splitter.split(X):

    X_train_2, X_test_2 = X_2.values[train_index], X_2.values[test_index]
    y_train_2, y_test_2 = y_2.values[train_index], y_2.values[test_index]
    
    model = LinearRegression()
    model.fit(X_train_2, y_train_2)
    
    losses_test_2.append(    
        mean_squared_error(model.predict(X_test_2), y_test_2)
    )

In [19]:
print(f'{np.mean(losses_test_2):.3f}, {model_2_error:.3f}')

0.427, 0.407


### lib function to cross-validation

In [21]:
from sklearn.model_selection import cross_validate

In [36]:
cv_scores = cross_validate(
    LinearRegression(), 
    X_train_2, y_train_2, cv=20, 
    return_train_score=True, 
    scoring='neg_mean_squared_error'
)

In [39]:
cv_scores['test_score'] * -1

array([0.39546461, 0.39248636, 0.67007654, 0.40600352, 0.40181993,
       0.43428758, 0.39126649, 0.38625133, 0.40853436, 0.39534413,
       0.38617427, 0.397283  , 0.50004688, 0.55669315, 0.40678959,
       0.44119478, 0.3911642 , 0.39376648, 0.40206466, 0.3885038 ])

In [40]:
predictions = cv_scores['test_score'] * -1
np.mean(predictions)

0.4272607823880811

In [35]:
import sklearn
sklearn.metrics.get_scorer_names()

['accuracy',
 'adjusted_mutual_info_score',
 'adjusted_rand_score',
 'average_precision',
 'balanced_accuracy',
 'completeness_score',
 'explained_variance',
 'f1',
 'f1_macro',
 'f1_micro',
 'f1_samples',
 'f1_weighted',
 'fowlkes_mallows_score',
 'homogeneity_score',
 'jaccard',
 'jaccard_macro',
 'jaccard_micro',
 'jaccard_samples',
 'jaccard_weighted',
 'matthews_corrcoef',
 'max_error',
 'mutual_info_score',
 'neg_brier_score',
 'neg_log_loss',
 'neg_mean_absolute_error',
 'neg_mean_absolute_percentage_error',
 'neg_mean_gamma_deviance',
 'neg_mean_poisson_deviance',
 'neg_mean_squared_error',
 'neg_mean_squared_log_error',
 'neg_median_absolute_error',
 'neg_negative_likelihood_ratio',
 'neg_root_mean_squared_error',
 'normalized_mutual_info_score',
 'positive_likelihood_ratio',
 'precision',
 'precision_macro',
 'precision_micro',
 'precision_samples',
 'precision_weighted',
 'r2',
 'rand_score',
 'recall',
 'recall_macro',
 'recall_micro',
 'recall_samples',
 'recall_weighted',