In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for path in (os.path.join(root, name) for name in files):
        print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the dynamic-pricing dataset into the working frame used by every later cell.
df = pd.read_csv('/kaggle/input/dynamicpricing-data/dynamic_pricing.csv')

# Structural sanity check: (rows, columns) first, then a preview of the leading rows.
print("Shape of the dataset:", df.shape)
print("First few rows:", df.head(), sep="\n")

In [None]:
# Describe every column, numeric and non-numeric alike (include='all'),
# and print the table under a heading.
print("Summary Statistics:", df.describe(include='all'), sep="\n")

In [None]:
# Boolean mask of missing entries, reused for both the counts and the heatmap.
null_mask = df.isnull()

print("Missing Values:")
print(null_mask.sum())

# Visualize missing data (requires seaborn)
import seaborn as sns
import matplotlib.pyplot as plt

# Render the mask itself as a heatmap; the colour bar is disabled because
# the mask only takes two values.
fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(null_mask, cbar=False, cmap="viridis", ax=ax)
ax.set_title("Missing Values Heatmap")
plt.show()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Use seaborn's white-grid theme for every figure drawn from here on.
sns.set_theme(style="whitegrid")

# Numeric summary table; displayed at the end of this cell.
summary_stats = df.describe()

# Numeric columns to explore. The target (Historical_Cost_of_Ride) is kept
# last on purpose: later code slices it off with numerical_cols[:-1].
numerical_cols = [
    'Number_of_Riders',
    'Number_of_Drivers',
    'Number_of_Past_Rides',
    'Average_Ratings',
    'Expected_Ride_Duration',
    'Historical_Cost_of_Ride',
]

# One histogram per numeric column, 30 bins, with a KDE curve overlaid.
for col in numerical_cols:
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.histplot(df[col], kde=True, bins=30, ax=ax)
    ax.set(title=f"Distribution of {col}", xlabel=col, ylabel="Frequency")
    plt.show()

# Pairwise correlations between the numeric columns (DataFrame.corr with its
# default settings), drawn as an annotated heatmap with two-decimal labels.
corr_matrix = df[numerical_cols].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(corr_matrix, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

# Categorical columns to explore.
categorical_cols = [
    'Location_Category',
    'Customer_Loyalty_Status',
    'Time_of_Booking',
    'Vehicle_Type',
]

# One count plot per categorical column, with bars laid out in the order
# returned by value_counts().
for col in categorical_cols:
    level_order = df[col].value_counts().index
    fig, ax = plt.subplots(figsize=(8, 4))
    sns.countplot(data=df, x=col, order=level_order, ax=ax)
    ax.set_title(f"Distribution of {col}")
    plt.xticks(rotation=45)
    plt.show()

# Scatter each numeric feature against the ride cost.
# numerical_cols[:-1] skips the final entry, which is the target itself.
for col in numerical_cols[:-1]:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.scatterplot(data=df, x=col, y='Historical_Cost_of_Ride', alpha=0.7, ax=ax)
    ax.set(
        title=f"Historical Cost of Ride vs {col}",
        xlabel=col,
        ylabel="Historical Cost of Ride",
    )
    plt.show()

# Ride-cost distribution within each level of every categorical column.
for col in categorical_cols:
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=df, x=col, y='Historical_Cost_of_Ride', ax=ax)
    ax.set_title(f"Historical Cost of Ride vs {col}")
    plt.xticks(rotation=45)
    plt.show()

# Last expression of the cell: show the numeric summary table computed earlier.
summary_stats


In [None]:
# Feature Engineering

# Supply/demand signal: available drivers per requesting rider.
df['Driver_to_Rider_Ratio'] = df['Number_of_Drivers'] / df['Number_of_Riders']

# Ride cost per minute. This column is computed FROM the target, so it is kept
# on `df` for exploration only and must never reach the feature matrix:
# Cost_per_Minute * Expected_Ride_Duration reproduces Historical_Cost_of_Ride
# exactly, which would let the model "predict" the target by multiplication.
df['Cost_per_Minute'] = df['Historical_Cost_of_Ride'] / df['Expected_Ride_Duration']

# Encode categorical variables using one-hot encoding.
# drop_first=True drops one level per variable to avoid redundant columns.
encoded_df = pd.get_dummies(df, columns=['Location_Category', 'Customer_Loyalty_Status',
                                         'Time_of_Booking', 'Vehicle_Type'], drop_first=True)

# Remove the target and every column derived from it, so that `encoded_df`
# contains only information that is available before the price is known.
encoded_df = encoded_df.drop(columns=['Historical_Cost_of_Ride', 'Cost_per_Minute'])

# Display the first few rows of the engineered dataset
encoded_df.head()


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Column the model learns to predict.
target = 'Historical_Cost_of_Ride'

# Feature matrix is the engineered frame; labels come from the original frame.
X = encoded_df
y = df[target]

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Baseline: a seeded 100-tree random forest, fitted on the training rows.
model = RandomForestRegressor(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Score the baseline on the held-out rows.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5  # root of the MSE, back in the units of the target
r2 = r2_score(y_test, y_pred)

# Last expression of the cell: (MAE, RMSE, R^2).
mae, rmse, r2


In [None]:
# Pair each feature name with the importance reported by the fitted baseline
# forest, most important first.
feature_importances = model.feature_importances_
importance_df = pd.DataFrame(
    {'Feature': X.columns, 'Importance': feature_importances}
).sort_values(by='Importance', ascending=False)

# Horizontal bar chart of the ranking.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=importance_df, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set(
    title='Feature Importance for Dynamic Pricing Model',
    xlabel='Importance',
    ylabel='Feature',
)
fig.tight_layout()
plt.show()

# Last expression of the cell: the ranked table itself.
importance_df


In [None]:
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 candidate settings.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Exhaustive 3-fold search (108 candidates x 3 folds = 324 fits) over a seeded
# random forest, using every available core. The scorer is negated MSE
# because scikit-learn always maximises the score.
grid_search = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Run the search on the training rows only.
grid_search.fit(X_train, y_train)

# Winning setting and its cross-validated MSE (sign flipped back to positive).
best_params = grid_search.best_params_
best_score = -grid_search.best_score_

# Last expression of the cell: (best parameters, best CV MSE).
best_params, best_score


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Retrain the model using the best parameters found by the grid search.
# The values are read from `grid_search.best_params_` instead of being typed
# in by hand, so this cell stays correct whenever the data, the features or
# the parameter grid change and the search picks a different winner.
best_model = RandomForestRegressor(random_state=42, **grid_search.best_params_)

# Train the model
best_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred_best = best_model.predict(X_test)

# Evaluate the tuned model on the same held-out rows as the baseline
mae_best = mean_absolute_error(y_test, y_pred_best)
mse_best = mean_squared_error(y_test, y_pred_best)
rmse_best = mse_best ** 0.5  # root of the MSE, back in the units of the target
r2_best = r2_score(y_test, y_pred_best)

# Last expression of the cell: (MAE, RMSE, R^2) of the tuned model.
mae_best, rmse_best, r2_best


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter the tuned model's predictions against the true costs. Points on the
# dashed red diagonal are predicted exactly.
lo, hi = y_test.min(), y_test.max()

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=y_test, y=y_pred_best, alpha=0.7, ax=ax)
ax.plot([lo, hi], [lo, hi], color='red', linestyle='--', label='Perfect Prediction')
ax.set(
    title='Predicted vs. Actual Ride Costs',
    xlabel='Actual Ride Cost',
    ylabel='Predicted Ride Cost',
)
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import joblib

# Persist the tuned model to the working directory.
# joblib files are pickle-based, so only load them back from a trusted source.
joblib.dump(value=best_model, filename='dynamic_pricing_model.pkl')

