# P2M3_jyotis_sugata_GX.ipynb

This notebook performs data validation using **Great Expectations** on the cleaned dataset `data/P2M3_jyotis_sugata_data_clean.csv`.

**Requirements met:** 7 Expectations total; includes `to be unique`, `to be between`, `to be in set`, `to be in type list` plus 3 extra expectations. Each expectation lives in its own cell and prints the expectation result (which should be `success: true` when run against the provided cleaned dataset.

**How to run:**
1. Install Great Expectations if not already installed: `pip install great_expectations`
2. Place `data/P2M3_jyotis_sugata_data_clean.csv` in the working directory (or adjust the path in the notebook).
3. Run the notebook cells sequentially.

In [1]:
!pip install great_expectations



In [2]:
# Imports and load cleaned CSV
import pandas as pd
import great_expectations as ge

DATA_PATH = "data/P2M3_jyotis_sugata_data_clean.csv"  # adjust path if needed

df = pd.read_csv(DATA_PATH)
print(f"Loaded dataframe with shape: {df.shape}")
df.head()

Loaded dataframe with shape: (1436, 39)


Unnamed: 0,id,model,price,age_08_04,mfg_month,mfg_year,km,fuel_type,hp,met_color,...,powered_windows,power_steering,radio,mistlamps,sport_model,backseat_divider,metallic_rim,radio_cassette,parking_assistant,tow_bar
0,1,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13500,23,10,2002,46986,Diesel,90,1,...,1,1,0,0,0,1,0,0,0,0
1,2,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13750,23,10,2002,72937,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
2,3,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,13950,24,9,2002,41711,Diesel,90,1,...,0,1,0,0,0,1,0,0,0,0
3,4,TOYOTA Corolla 2.0 D4D HATCHB TERRA 2/3-Doors,14950,26,7,2002,48000,Diesel,90,0,...,0,1,0,0,0,1,0,0,0,0
4,5,TOYOTA Corolla 2.0 D4D HATCHB SOL 2/3-Doors,13750,30,3,2002,38500,Diesel,90,0,...,1,1,0,1,0,1,0,0,0,0


In [3]:
# Create a Great Expectations dataset wrapper (PandasDataset)
ge_df = ge.from_pandas(df)
print('Columns:', list(ge_df.columns))

Columns: ['id', 'model', 'price', 'age_08_04', 'mfg_month', 'mfg_year', 'km', 'fuel_type', 'hp', 'met_color', 'color', 'automatic', 'cc', 'doors', 'cylinders', 'gears', 'quarterly_tax', 'weight', 'mfr_guarantee', 'bovag_guarantee', 'guarantee_period', 'abs', 'airbag_1', 'airbag_2', 'airco', 'automatic_airco', 'boardcomputer', 'cd_player', 'central_lock', 'powered_windows', 'power_steering', 'radio', 'mistlamps', 'sport_model', 'backseat_divider', 'metallic_rim', 'radio_cassette', 'parking_assistant', 'tow_bar']


In [12]:
ge_df.describe()

Unnamed: 0,id,price,age_08_04,mfg_month,mfg_year,km,hp,met_color,automatic,cc,...,powered_windows,power_steering,radio,mistlamps,sport_model,backseat_divider,metallic_rim,radio_cassette,parking_assistant,tow_bar
count,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,...,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0,1436.0
mean,721.555014,10730.824513,55.947075,5.548747,1999.625348,68533.259749,101.502089,0.674791,0.05571,1576.85585,...,0.561978,0.977716,0.14624,0.256964,0.300139,0.770195,0.204735,0.145543,0.002786,0.277855
std,416.47689,3626.964585,18.599988,3.354085,1.540722,37506.448872,14.98108,0.468616,0.229441,424.38677,...,0.496317,0.147657,0.353469,0.437111,0.458478,0.420854,0.403649,0.35277,0.052723,0.448098
min,1.0,4350.0,1.0,1.0,1998.0,1.0,69.0,0.0,0.0,1300.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,361.75,8450.0,44.0,3.0,1998.0,43000.0,90.0,0.0,0.0,1400.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,721.5,9900.0,61.0,5.0,1999.0,63389.5,110.0,1.0,0.0,1600.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
75%,1081.25,11950.0,70.0,8.0,2001.0,87020.75,110.0,1.0,0.0,1600.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,1.0
max,1442.0,32500.0,80.0,12.0,2004.0,243000.0,192.0,1.0,1.0,16000.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [4]:
# Expectation 1: expect_column_values_to_be_unique
if 'id' in ge_df.columns:
    col_unique = 'id'
else:
    cols = list(ge_df.columns)[:2]
    ge_df['unique_id'] = ge_df[cols].astype(str).agg('_'.join, axis=1)
    col_unique = 'unique_id'

res1 = ge_df.expect_column_values_to_be_unique(col_unique)
print(res1)
res1

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "kwargs": {
      "column": "id",
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1436,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "element_count": 1436,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [None]:
# Expectation 2: expect_column_values_to_be_between
num_cols = ge_df.select_dtypes(include=['number']).columns.tolist()
if len(num_cols) == 0:
    raise ValueError('No numeric columns available for between expectation')
col_between = 'mfg_month' if 'mfg_month' in num_cols else num_cols[0]
min_val = 1
max_val = 12
print(f"Applying between expectation on column '{col_between}' with min={min_val} and max={max_val}")
res2 = ge_df.expect_column_values_to_be_between(col_between, min_value=min_val, max_value=max_val)
print(res2)
res2

Applying between expectation on column 'price' with min=4350 and max=32500
{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "price",
      "min_value": 4350,
      "max_value": 32500,
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1436,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "element_count": 1436,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [7]:
# Expectation 3: expect_column_values_to_be_in_set
cat_cols = ge_df.select_dtypes(include=['object']).columns.tolist()
if len(cat_cols) == 0:
    raise ValueError('No categorical columns available for in_set expectation')
col_in_set = 'fuel_type' if 'fuel_type' in cat_cols else cat_cols[0]
allowed_set = sorted(ge_df[col_in_set].dropna().unique().tolist())
print(f"Applying in_set expectation on '{col_in_set}' with allowed set: {allowed_set}")
res3 = ge_df.expect_column_values_to_be_in_set(col_in_set, value_set=allowed_set)
print(res3)
res3

Applying in_set expectation on 'fuel_type' with allowed set: ['CNG', 'Diesel', 'Petrol']
{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_in_set",
    "kwargs": {
      "column": "fuel_type",
      "value_set": [
        "CNG",
        "Diesel",
        "Petrol"
      ],
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1436,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "element_count": 1436,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 4: expect_column_values_to_be_in_type_list
col_type = 'hp' if 'hp' in ge_df.columns else num_cols[0]
dtype = str(ge_df[col_type].dtype)
type_list = ['int64', 'float64', 'Int64']
print(f"Applying type_list expectation on '{col_type}' (dtype={dtype}). Allowed types: {type_list}")
res4 = ge_df.expect_column_values_to_be_in_type_list(col_type, type_list=type_list)
print(res4)
res4

Applying type_list expectation on 'hp' (dtype=int64). Allowed types: ['int64', 'float64', 'Int64']
{
  "success": true,
  "expectation_config": {
    "expectation_type": "_expect_column_values_to_be_in_type_list__aggregate",
    "kwargs": {
      "column": "hp",
      "type_list": [
        "int64",
        "float64",
        "Int64"
      ],
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [9]:
# Extra Expectation A: expect_table_row_count_to_be_between
row_count = len(ge_df)
res5 = ge_df.expect_table_row_count_to_be_between(min_value=1, max_value=row_count)
print(res5)
res5

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_table_row_count_to_be_between",
    "kwargs": {
      "min_value": 1,
      "max_value": 1436,
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 1436
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "observed_value": 1436
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [10]:
# Extra Expectation B: expect_column_mean_to_be_between
col_mean = col_between
mean_val = float(ge_df[col_mean].mean())
res6 = ge_df.expect_column_mean_to_be_between(col_mean, min_value=mean_val - 1e-9, max_value=mean_val + 1e-9)
print('mean:', mean_val)
print(res6)
res6

mean: 10730.824512534818
{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_mean_to_be_between",
    "kwargs": {
      "column": "price",
      "min_value": 10730.824512533818,
      "max_value": 10730.824512535819,
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 10730.824512534818,
    "element_count": 1436,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "observed_value": 10730.824512534818,
    "element_count": 1436,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Extra Expectation C: expect_column_proportion_of_unique_values_to_be_between
col_prop = col_in_set
prop = ge_df[col_prop].nunique(dropna=True) / len(ge_df)
res7 = ge_df.expect_column_proportion_of_unique_values_to_be_between(col_prop, min_value=prop, max_value=prop)
print('proportion unique:', prop)
print(res7)
res7

proportion unique: 0.0020891364902506965
{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_proportion_of_unique_values_to_be_between",
    "kwargs": {
      "column": "fuel_type",
      "min_value": 0.0020891364902506965,
      "max_value": 0.0020891364902506965,
      "result_format": "BASIC"
    },
    "meta": {}
  },
  "result": {
    "observed_value": 0.0020891364902506965,
    "element_count": 1436,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}


{
  "success": true,
  "result": {
    "observed_value": 0.0020891364902506965,
    "element_count": 1436,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

## Notes

- Each expectation cell prints the result (a dictionary) with a key `success` expected to be `True` when run on the cleaned dataset.
- If any expectation fails, inspect the printed result for details and adjust either the expectation parameters or perform additional cleaning.
