# Predict the price of an Uber ride from a given pickup point to the agreed drop-off location. Perform the following tasks:

Pre-process the dataset. Identify outliers. Check the correlation.

Implement

i) Linear Regression, ii) Random Forest Regression

iii) XGBRegressor, iv) Elastic Net Regression

v) Logistic Regression, vi) Poisson Regression

vii) Negative Binomial Regression

Evaluate the models and compare their respective scores (R2, MSE, RMSE) for each algorithm.

Plot a graph of accuracy versus model. Discuss which model is the most accurate.

In [2]:
pip install numpy pandas seaborn 

Collecting seaborn
  Downloading seaborn-0.13.2-py3-none-any.whl (294 kB)
                                              0.0/294.9 kB ? eta -:--:--
                                              0.0/294.9 kB ? eta -:--:--
                                              0.0/294.9 kB ? eta -:--:--
     -----                                    41.0/294.9 kB ? eta -:--:--
     -------                               61.4/294.9 kB 656.4 kB/s eta 0:00:01
     ----------                            81.9/294.9 kB 657.6 kB/s eta 0:00:01
     -------------                        112.6/294.9 kB 595.3 kB/s eta 0:00:01
     --------------                       122.9/294.9 kB 554.9 kB/s eta 0:00:01
     --------------                       122.9/294.9 kB 554.9 kB/s eta 0:00:01
     -------------------                  163.8/294.9 kB 490.7 kB/s eta 0:00:01
     ---------------------                174.1/294.9 kB 456.4 kB/s eta 0:00:01
     -----------------------              194.6/294.9 kB 454.0 kB/s eta 0


[notice] A new release of pip is available: 23.1.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.linear_model import LinearRegression, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

Matplotlib is building the font cache; this may take a moment.


ModuleNotFoundError: No module named 'sklearn'

In [2]:
# Load the ride records from 'uber.csv' (path relative to the working directory).
df = pd.read_csv(filepath_or_buffer='uber.csv')

In [None]:
# List the column labels of the loaded DataFrame.
print(df.columns)

In [None]:
# Print pandas' summary of the DataFrame.
df.info()

In [None]:
# Show the loaded DataFrame as the cell's output.
df

In [None]:
# 1. Pre-process the dataset

# Drop the 'Unnamed: 0' and 'key' columns (presumably row identifiers; they are
# not used as model features). errors='ignore' keeps the cell re-runnable once
# the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'key'], errors='ignore')

# Report missing values BEFORE handling them, so the report reflects the raw
# data rather than an already-filled frame.
missing_values = df.isnull().sum()
print("Missing values in the dataset:")
print(missing_values)

# Handle missing values by dropping the affected rows.
# Forward-filling is deliberately not used: the rows appear to be independent
# rides, so copying values from the previous row would fabricate data, and
# fillna(method='ffill') is deprecated in recent pandas releases.
# Alternative (not applied): fill numeric gaps with the column mean.
df = df.dropna()

# Keep only rides with a strictly positive fare and at most 10 passengers.
# .copy() gives an independent frame so the column assignments below do not
# operate on a view of the unfiltered data.
valid_rows = (df['fare_amount'] > 0) & (df['passenger_count'] <= 10)
df = df[valid_rows].copy()

# Parse the pickup timestamp and derive the day of the week from it.
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek

# Ensure there are no more missing values
missing_values = df.isnull().sum()
print("Missing values after handling:")
print(missing_values)

# 2. Identify outliers
# Box plot of the fare to make outliers visible.
sns.boxplot(x=df["fare_amount"])
plt.show()


In [None]:
# Summary statistics of the DataFrame, shown as the cell's output.
df.describe()

In [8]:
# Keep strictly positive fares only (Poisson can't handle fares <= 0), then
# convert +/- infinity to NaN and discard every row that still has a missing
# value.
df = (
    df.loc[df['fare_amount'].gt(0)]
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

In [None]:
# Box plot of the fare_amount column of the current DataFrame.
sns.boxplot(data=df, x="fare_amount")
plt.show()

In [10]:
# Fill any remaining gaps in the drop-off coordinates with a column statistic
# (mean for latitude, median for longitude).
# The result is assigned back to df instead of calling fillna(inplace=True) on
# a column selection: that chained form acts on an intermediate object, emits a
# pandas FutureWarning and leaves df unchanged under Copy-on-Write.
df = df.assign(
    dropoff_latitude=df['dropoff_latitude'].fillna(df['dropoff_latitude'].mean()),
    dropoff_longitude=df['dropoff_longitude'].fillna(df['dropoff_longitude'].median()),
)

In [None]:
# Scatter the pickup coordinates; the axes span the full longitude/latitude range.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(df['pickup_longitude'], df['pickup_latitude'], alpha=0.5)
ax.set(
    title='Pickup Locations',
    xlabel='Pickup Longitude',
    ylabel='Pickup Latitude',
    xlim=[-180, 180],  # longitude limits
    ylim=[-90, 90],    # latitude limits
)
ax.grid()
plt.show()

In [None]:
# Collect the rows whose pickup latitude lies outside [-90, 90] and the rows
# whose pickup longitude lies outside [-180, 180], then print both selections.
latitude_out_of_range = df['pickup_latitude'].lt(-90) | df['pickup_latitude'].gt(90)
longitude_out_of_range = df['pickup_longitude'].lt(-180) | df['pickup_longitude'].gt(180)

invalid_latitude = df.loc[latitude_out_of_range]
invalid_longitude = df.loc[longitude_out_of_range]

print(f"Invalid Latitude Records:\n{invalid_latitude}")
print(f"Invalid Longitude Records:\n{invalid_longitude}")


In [13]:
# Keep only rides whose pickup and drop-off coordinates are valid:
# latitude within [-90, 90] and longitude within [-180, 180], bounds included.
latitudes_ok = (
    df['pickup_latitude'].between(-90, 90)
    & df['dropoff_latitude'].between(-90, 90)
)
longitudes_ok = (
    df['pickup_longitude'].between(-180, 180)
    & df['dropoff_longitude'].between(-180, 180)
)
df = df.loc[latitudes_ok & longitudes_ok]

In [None]:
# Re-check the coordinates after filtering.
# Pickup AND drop-off columns are inspected, because the coordinate filter
# constrains all four of them; checking the pickup columns alone would leave
# the drop-off side of the filter unverified. Both selections should be empty.
latitude_columns = ['pickup_latitude', 'dropoff_latitude']
longitude_columns = ['pickup_longitude', 'dropoff_longitude']

# |value| > limit is the same test as (value < -limit) or (value > limit).
invalid_latitude = df[(df[latitude_columns].abs() > 90).any(axis=1)]
invalid_longitude = df[(df[longitude_columns].abs() > 180).any(axis=1)]

print(f"Invalid Latitude Records:\n{invalid_latitude}")
print(f"Invalid Longitude Records:\n{invalid_longitude}")

In [None]:
# Quartiles and interquartile range (IQR) of the 'fare_amount' column.
Q1, Q3 = df["fare_amount"].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fares further than `threshold` IQRs from the quartiles count as outliers.
threshold = 1.5
lower_bound, upper_bound = Q1 - threshold * IQR, Q3 + threshold * IQR

# Rows whose fare lies inside the fences (bounds included).
# NOTE(review): only data_no_outliers is trimmed; df itself is not modified here.
fare_within_fences = df["fare_amount"].between(lower_bound, upper_bound)
data_no_outliers = df.loc[fare_within_fences]

# Box plot of the fares that remain once the outliers are excluded.
sns.boxplot(x=data_no_outliers["fare_amount"])
plt.show()

In [None]:
# Box plots drawn as separate subplots on a 7x2 grid.
df.plot.box(subplots=True, layout=(7, 2), figsize=(15, 20))

In [None]:
# 3. Check the correlation
# Pairwise correlations between the features and the target (fare_amount);
# the pickup_datetime column is left out of the matrix.
df_for_corr = df.drop('pickup_datetime', axis=1)
correlation_matrix = df_for_corr.corr()

# Annotated heatmap of the correlation matrix.
sns.heatmap(correlation_matrix, annot=True)
plt.show()

In [None]:
# Model inputs: the four trip coordinates plus the passenger count.
X = df.loc[:, [
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]]
# Target: the fare to predict.
y = df.loc[:, 'fare_amount']

y

In [19]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
(X_train, X_test,
 y_train, y_test) = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Build a linear regression model and fit it on the training split;
# the fitted estimator is shown as the cell's output.
lr_model = LinearRegression().fit(X_train, y_train)
lr_model

In [21]:
# 4. Implementing Models

# Metrics of each model are collected in this (initially empty) dictionary.
results = dict()

In [22]:
# (i) Linear Regression
# Fit on the training split and predict the held-out fares.
linear_model = LinearRegression().fit(X_train, y_train)
y_pred_lr = linear_model.predict(X_test)

# Score the predictions; RMSE is the square root of the MSE.
mse_lr = mean_squared_error(y_test, y_pred_lr)
results['Linear Regression'] = dict(
    R2=r2_score(y_test, y_pred_lr),
    MSE=mse_lr,
    RMSE=np.sqrt(mse_lr),
)

In [23]:
# (ii) Random Forest Regressor
# random_state pins the forest's randomness so the reported metrics are
# reproducible between runs (same seed as the train/test split);
# n_jobs=-1 trains the trees on all available cores without changing results.
rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Compute the MSE once and derive the RMSE from it.
mse_rf = mean_squared_error(y_test, y_pred_rf)
results['Random Forest'] = {
    'R2': r2_score(y_test, y_pred_rf),
    'MSE': mse_rf,
    'RMSE': np.sqrt(mse_rf),
}

In [24]:
# (iii) XGBRegressor
# Train with the library defaults, then predict the held-out fares.
xgb_model = xgb.XGBRegressor()
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

# Build the metric record step by step; RMSE is the square root of the MSE.
xgb_scores = {'R2': r2_score(y_test, y_pred_xgb)}
xgb_scores['MSE'] = mean_squared_error(y_test, y_pred_xgb)
xgb_scores['RMSE'] = np.sqrt(xgb_scores['MSE'])
results['XGB Regressor'] = xgb_scores

In [25]:
# (iv) Elastic Net Regression
# Fit with the default hyper-parameters and predict the held-out fares.
elastic_net = ElasticNet()
elastic_net.fit(X_train, y_train)
y_pred_en = elastic_net.predict(X_test)

# Score the predictions; RMSE is derived from the MSE.
r2_en = r2_score(y_test, y_pred_en)
mse_en = mean_squared_error(y_test, y_pred_en)
results['Elastic Net'] = {'R2': r2_en, 'MSE': mse_en, 'RMSE': np.sqrt(mse_en)}

In [None]:
# 5. Evaluate and compare models
# Turn the nested dict into a table with one row per model and one column
# per metric, then print it.
results_df = pd.DataFrame(results).transpose()
print(results_df)

In [None]:
# 6. Plot the results
# One bar chart per metric. R2, MSE and RMSE live on different scales, so
# drawing them on a single shared y-axis flattens the smaller-valued metric and
# hides the differences between models. The models are categories, so bars are
# used instead of lines connecting them.
metrics = ['R2', 'MSE', 'RMSE']
fig, axes = plt.subplots(1, len(metrics), figsize=(15, 5))
for ax, metric in zip(axes, metrics):
    ax.bar(results_df.index, results_df[metric])
    ax.set_title(f'{metric} by model')
    ax.set_xlabel('Model')
    ax.set_ylabel(metric)
    ax.tick_params(axis='x', rotation=30)  # keep long model names legible
    ax.grid(True, axis='y')
fig.suptitle('Model Comparison')
fig.tight_layout()
plt.show()