In [35]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/autotel-shared-car-locations/2020_02_25.csv


**Building a three-dimensional linear model**

In [73]:
import plotly.express as px
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline, make_pipeline

import plotly.graph_objects as go

In [37]:
df= pd.read_csv("/kaggle/input/autotel-shared-car-locations/2020_02_25.csv")
print("df shape:", df.shape)
print(df.info())
df.head()

df shape: (20049198, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20049198 entries, 0 to 20049197
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   latitude    float64
 1   longitude   float64
 2   total_cars  int64  
 3   cars_list   object 
 4   timestamp   object 
dtypes: float64(2), int64(1), object(2)
memory usage: 764.8+ MB
None


Unnamed: 0,latitude,longitude,total_cars,cars_list,timestamp
0,32.09004,34.79679,1,[138],2020-02-25 10:48:02 UTC
1,32.11614,34.79464,1,[64],2020-02-25 10:48:02 UTC
2,32.11876,34.82229,1,[120],2020-02-25 10:48:02 UTC
3,32.11226,34.83799,1,[110],2020-02-25 10:48:02 UTC
4,32.14279,34.79343,1,[190],2020-02-25 10:48:02 UTC


Split Data into X_train (feature matrix) and y_train(target vector)

In [38]:
features= ["latitude", "longitude"]
X_train= df[features]
X_train.head()

Unnamed: 0,latitude,longitude
0,32.09004,34.79679
1,32.11614,34.79464
2,32.11876,34.82229
3,32.11226,34.83799
4,32.14279,34.79343


In [39]:
target= "total_cars"
y_train= df[target]
y_train.head(10)

0    1
1    1
2    1
3    1
4    1
5    0
6    0
7    0
8    0
9    0
Name: total_cars, dtype: int64

**Build model**

In [41]:
#target vector mean
y_mean= y_train.mean()
y_mean

0.6403020210583984

**Baseline**

In [42]:
y_pred_baseline= [y_mean]* len(y_train)
print("y_pred_baseline:", y_pred_baseline[:5])

y_pred_baseline: [0.6403020210583984, 0.6403020210583984, 0.6403020210583984, 0.6403020210583984, 0.6403020210583984]


**Baseline mean absolute error as compared to y_train**


In [43]:
Baseline_mae= mean_absolute_error(y_train, y_pred_baseline)
print("Baseline mae:", round(Baseline_mae, 2))

Baseline mae: 0.59


**Instantiate SimpleImputer to take care of missing values**

In [44]:
imputer= SimpleImputer()

In [45]:
df[["longitude", "longitude"]].isnull().sum()

longitude    0
longitude    0
dtype: int64

In [46]:
#Fit X_train to transformer
imputer.fit(X_train)

In [47]:
#Tranform the Feature Matrix as the result to XT_train
XT_train= imputer.transform(X_train)
#convert transformed X_train to a DataFrame
pd.DataFrame(XT_train, columns= X_train.columns).info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20049198 entries, 0 to 20049197
Data columns (total 2 columns):
 #   Column     Dtype  
---  ------     -----  
 0   latitude   float64
 1   longitude  float64
dtypes: float64(2)
memory usage: 305.9 MB


**Build a model**

In [48]:
#Should contain both imputer and predictor(LinearRegression)
model= make_pipeline(
    SimpleImputer(),
    LinearRegression()

)
print("model type:", type(model))
print(model)

model type: <class 'sklearn.pipeline.Pipeline'>
Pipeline(steps=[('simpleimputer', SimpleImputer()),
                ('linearregression', LinearRegression())])


In [49]:
#Fit the model to both X_train and y_train
model.fit(X_train, y_train)

In [51]:
#Create a list of predictions for X_train, assign it to y_pred_training
y_pred_training= model.predict(X_train)
y_pred_training

array([0.65804233, 0.61892134, 0.70871674, ..., 0.63569535, 0.60941912,
       0.6209162 ])

In [52]:
#Training mean absolute error for y_pred_training as compared to the y_train
Training_mae= mean_absolute_error(y_train, y_pred_training)
Training_mae

0.5861343330372385

In [53]:
#Read X_train into a DataFrame
X_train= pd.read_csv("/kaggle/input/autotel-shared-car-locations/2020_02_25.csv")[features]
X_train.head()

Unnamed: 0,latitude,longitude
0,32.09004,34.79679
1,32.11614,34.79464
2,32.11876,34.82229
3,32.11226,34.83799
4,32.14279,34.79343


In [55]:
#Generate a Series of Predictions using the newly created model
y_pred_training= pd.Series(model.predict(X_train))
print("predictions type:", type(y_pred_training))

predictions type: <class 'pandas.core.series.Series'>


In [63]:
#Extract Intercept and Coefficient for your model
intercept= model.named_steps["linearregression"].intercept_.round(2)
coefficients= model.named_steps["linearregression"].coef_.round(2)
coefficients

array([-1.22,  3.36])

In [64]:
#Print the equation that your model has determined for predicting the total number of cars
#based on latitudes and longitudes
print(f"cars= {intercept} + ({coefficients}* longitude)+ ({coefficients}* latitude)")

cars= -77.17 + ([-1.22  3.36]* longitude)+ ([-1.22  3.36]* latitude)


In [71]:
#y and x coordinates for model representation
x_plane = np.linspace(df["longitude"].min(), df["longitude"].max(), 10)
y_plane = np.linspace(df["latitude"].min(), df["latitude"].max(), 10)
xx, yy = np.meshgrid(x_plane, y_plane)
#use model to predict z coordinates
z_plane = model.predict(pd.DataFrame({"latitude": x_plane, "longitude": y_plane}))
zz = np.tile(z_plane, (10, 1))

array([[-11.99960144, -11.97470242, -11.94980339, -11.92490436,
        -11.90000533, -11.87510631, -11.85020728, -11.82530825,
        -11.80040922, -11.7755102 ],
       [-11.99960144, -11.97470242, -11.94980339, -11.92490436,
        -11.90000533, -11.87510631, -11.85020728, -11.82530825,
        -11.80040922, -11.7755102 ],
       [-11.99960144, -11.97470242, -11.94980339, -11.92490436,
        -11.90000533, -11.87510631, -11.85020728, -11.82530825,
        -11.80040922, -11.7755102 ],
       [-11.99960144, -11.97470242, -11.94980339, -11.92490436,
        -11.90000533, -11.87510631, -11.85020728, -11.82530825,
        -11.80040922, -11.7755102 ],
       [-11.99960144, -11.97470242, -11.94980339, -11.92490436,
        -11.90000533, -11.87510631, -11.85020728, -11.82530825,
        -11.80040922, -11.7755102 ],
       [-11.99960144, -11.97470242, -11.94980339, -11.92490436,
        -11.90000533, -11.87510631, -11.85020728, -11.82530825,
        -11.80040922, -11.7755102 ],
       [-1