<a href="https://colab.research.google.com/github/kainat5008/ML-Assignment-01/blob/main/Assignment1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dataset Summary

- **Dataset Name**: New York Taxi Trip enriched by Mathematica
- **Source**: Kaggle
- **Domain**: Transportation
- **Target Variable**: `tripDuration` (regression)
- **Number of Features**: 24
- **Number of Records**: ~1.05 million
- **Problem Type**: Regression
- **Dataset Link**: [Kaggle Dataset Link](https://www.kaggle.com/datasets/wol4aravio/ny-taxi-trip-duration-enriched-by-mathematica)

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/kainat5008/ML-Assignment1/blob/main/Assignment1.ipynb)

In [1]:
import pandas as pd

# Load the dataset from Google Drive
file_url = 'https://drive.google.com/uc?export=download&id=1-kupu4E9ObvD3zN_5AEaP8jArUVgH8PD'
df = pd.read_csv(file_url)

# Display the first 5 rows
print(df.head())

Empty DataFrame
Index: []


In [2]:
# Basic information
print("Number of records:", df.shape[0])
print("Number of features:", df.shape[1])
print("\nData types:")
print(df.dtypes)

# Summary statistics for numeric features
print("\nSummary statistics:")
print(df.describe())

Number of records: 0
Number of features: 3

Data types:
sans-serif;margin:0}.grecaptcha-badge{visibility:hidden}.uc-main{padding-top:50px;text-align:center}#uc-dl-icon{display:inline-block;margin-top:16px;padding-right:1em;vertical-align:top}#uc-text{display:inline-block;max-width:68ex;text-align:left}.uc-error-caption                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        

In [3]:
import matplotlib.pyplot as plt
import seaborn as sns

# Plot histograms for numeric features
numeric_features = df.select_dtypes(include=['float64', 'int64']).columns
df[numeric_features].hist(bins=30, figsize=(15, 10))
plt.suptitle("Histograms of Numeric Features")
plt.show()

ValueError: hist method requires numerical or datetime columns, nothing to plot.

In [None]:
# Scatter plot for numeric features vs target variable
target_variable = 'trip_duration'  # Replace with your target variable
for feature in numeric_features:
    if feature != target_variable:
        plt.figure(figsize=(6, 4))
        sns.scatterplot(x=df[feature], y=df[target_variable])
        plt.title(f"{feature} vs {target_variable}")
        plt.show()

In [None]:
# Correlation matrix
plt.figure(figsize=(12, 8))
sns.heatmap(df[numeric_features].corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix")
plt.show()

In [None]:
# Boxplots for numeric features
plt.figure(figsize=(15, 8))
df[numeric_features].boxplot()
plt.title("Boxplots of Numeric Features")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Check for missing values
print("Missing values per column:")
print(df.isnull().sum())

# Handle missing values (example: fill with median for numeric features)
df.fillna(df.median(), inplace=True)

# Verify no missing values remain
print("\nMissing values after handling:")
print(df.isnull().sum())

In [None]:
# Function to detect outliers using IQR
def detect_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = column[(column < lower_bound) | (column > upper_bound)]
    return outliers

# Detect outliers in numeric features
for feature in numeric_features:
    outliers = detect_outliers(df[feature])
    print(f"Outliers in {feature}: {len(outliers)}")

In [None]:
# Correlation with target variable
correlation_with_target = df[numeric_features].corr()[target_variable].sort_values(ascending=False)
print("Correlation with target variable:")
print(correlation_with_target)

# Visualize feature importance
plt.figure(figsize=(10, 6))
sns.barplot(x=correlation_with_target.index, y=correlation_with_target.values)
plt.title("Feature Importance (Correlation with Target)")
plt.xticks(rotation=45)
plt.show()