In [13]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, levene, ttest_ind

In [4]:
# first 5 rows of dataset
car_data = pd.read_csv("../car_details_v4.csv")

car_data.head()

Unnamed: 0,Make,Model,Price,Year,Kilometer,Fuel Type,Transmission,Location,Color,Owner,Seller Type,Engine,Max Power,Max Torque,Drivetrain,Length,Width,Height,Seating Capacity,Fuel Tank Capacity
0,Honda,Amaze 1.2 VX i-VTEC,505000,2017,87150,Petrol,Manual,Pune,Grey,First,Corporate,1198 cc,87 bhp @ 6000 rpm,109 Nm @ 4500 rpm,FWD,3990.0,1680.0,1505.0,5.0,35.0
1,Maruti Suzuki,Swift DZire VDI,450000,2014,75000,Diesel,Manual,Ludhiana,White,Second,Individual,1248 cc,74 bhp @ 4000 rpm,190 Nm @ 2000 rpm,FWD,3995.0,1695.0,1555.0,5.0,42.0
2,Hyundai,i10 Magna 1.2 Kappa2,220000,2011,67000,Petrol,Manual,Lucknow,Maroon,First,Individual,1197 cc,79 bhp @ 6000 rpm,112.7619 Nm @ 4000 rpm,FWD,3585.0,1595.0,1550.0,5.0,35.0
3,Toyota,Glanza G,799000,2019,37500,Petrol,Manual,Mangalore,Red,First,Individual,1197 cc,82 bhp @ 6000 rpm,113 Nm @ 4200 rpm,FWD,3995.0,1745.0,1510.0,5.0,37.0
4,Toyota,Innova 2.4 VX 7 STR [2016-2020],1950000,2018,69000,Diesel,Manual,Mumbai,Grey,First,Individual,2393 cc,148 bhp @ 3400 rpm,343 Nm @ 1400 rpm,RWD,4735.0,1830.0,1795.0,7.0,55.0


In [None]:
schema = {
    "Price": {"type": "int", "min": 50000, "max": 5000000},
    "Year": {"type": "int", "min": 1990, "max": 2025},
    "Kilometer": {"type": "int", "min": 0, "max": 500000},
    "Length": {"type": "float", "min": 2000, "max": 6000},
    "Width": {"type": "float", "min": 1000, "max": 2500},
    "Height": {"type": "float", "min": 1000, "max": 2500},
    "Seating Capacity": {"type": "int", "min": 2, "max": 10},
    "Fuel Tank Capacity": {"type": "float", "min": 10, "max": 100}
}


In [7]:


def validate_schema(df, schema):
    errors = []

    for col, rules in schema.items():

        # Check column existence
        if col not in df.columns:
            errors.append(f"Missing column: {col}")
            continue

        # Convert type
        expected_type = rules["type"]
        if expected_type == "int":
            df[col] = pd.to_numeric(df[col], errors="coerce").astype("Int64")
        elif expected_type == "float":
            df[col] = pd.to_numeric(df[col], errors="coerce")

        # Check for invalid values (NaN after conversion)
        if df[col].isna().sum() > 0:
            errors.append(f"{col}: contains non-numeric values")

        # Range check
        min_val = rules["min"]
        max_val = rules["max"]

        if (df[col] < min_val).any():
            errors.append(f"{col}: values smaller than {min_val}")

        if (df[col] > max_val).any():
            errors.append(f"{col}: values larger than {max_val}")

    # Report
    if len(errors) == 0:
        print("Schema validation PASSED")
    else:
        print("Schema validation FAILED")
        for e in errors:
            print(e)


In [8]:
validate_schema(car_data, schema)

Schema validation FAILED
Price: values smaller than 50000
Price: values larger than 5000000
Year: values smaller than 1990
Kilometer: values larger than 500000
Engine: contains non-numeric values
Length: contains non-numeric values
Width: contains non-numeric values
Height: contains non-numeric values
Seating Capacity: contains non-numeric values
Fuel Tank Capacity: contains non-numeric values
Fuel Tank Capacity: values larger than 100


In [10]:
petrol = car_data[car_data["Fuel Type"] == "Petrol"]["Price"]
diesel = car_data[car_data["Fuel Type"] == "Diesel"]["Price"]

In [15]:
shap_p_petrol = shapiro(petrol)
shap_p_diesel = shapiro(diesel)

print("Shapiro Petrol:", shap_p_petrol)
print("Shapiro Diesel:", shap_p_diesel)

if shap_p_petrol.pvalue > 0.05:
    print("Petrol prices are normally distributed")
else:
    print("Petrol prices are not normally distributed")

if shap_p_diesel.pvalue > 0.05:
    print("Diesel prices are normally distributed")
else:
    print("Diesel prices are not normally distributed")

Shapiro Petrol: ShapiroResult(statistic=np.float64(0.3958002747489967), pvalue=np.float64(3.057769323937258e-48))
Shapiro Diesel: ShapiroResult(statistic=np.float64(0.6936999618669818), pvalue=np.float64(6.665968224194606e-40))
Petrol prices are not normally distributed
Diesel prices are not normally distributed


In [16]:
# levene test
lev = levene(petrol, diesel)
print("Levene:", lev)
if lev.pvalue > 0.05:
    print("Variances are equal")
else:
    print("Variances are not equal")

Levene: LeveneResult(statistic=np.float64(19.36537916231321), pvalue=np.float64(1.1366137521920666e-05))
Variances are not equal


In [17]:
# t-test
tstat, pvalue = ttest_ind(petrol, diesel, equal_var=(lev.pvalue > 0.05))
print("T-test:", tstat, pvalue)
alpha = 0.05
if pvalue < alpha:
    print("Reject null hypothesis: Mean prices are different")
else:
    print("Fail to reject null hypothesis: Mean prices are the same")



T-test: -7.155793725422751 1.191166800584683e-12
Reject null hypothesis: Mean prices are different
