In [1]:
## DESCRIBE DATASET (FIRST 10 ENTRIES)

import pandas as pd

# Load Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\raw_master_dataset_all_varieties.parquet"
df = pd.read_parquet(file_path)

# View first 5 rows
print(df.head())

# View summary info
print(df.info())

# Quick stats
print(df.describe())


      State Name District Name                                Market Name  \
0         Punjab       Fazilka                                     Abohar   
1  Uttar Pradesh          Agra                                    Achnera   
2         Punjab     Jalandhar                                    Adampur   
3        Gujarat     Ahmedabad  Ahmedabad(Chimanbhai Patal Market Vasana)   
4        Gujarat     Ahmedabad  Ahmedabad(Chimanbhai Patal Market Vasana)   

  Variety       Group  Arrivals (Tonnes)  Min Price (Rs./Quintal)  \
0   Onion  Vegetables               15.0                   2500.0   
1     Red  Vegetables                0.8                   3200.0   
2   Other  Vegetables                1.7                   2900.0   
3   Local  Vegetables              496.6                   2500.0   
4   Nasik  Vegetables              496.6                   3000.0   

   Max Price (Rs./Quintal)  Modal Price (Rs./Quintal) Reported Date Grade  \
0                   3200.0                   

In [10]:
import pandas as pd

# === 1️⃣ Load dataset (original untouched) ===
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_parquet\master_dataset_distinct_continuous_flagged.parquet"
df_original = pd.read_parquet(file_path)
df_original['date'] = pd.to_datetime(df_original['date'])

# === 2️⃣ Filter out imputed prices/quantities (new dataset) ===
df_filtered = df_original[
    (df_original['price_missing_flag'] == 0) &
    (df_original['quantity_missing_flag'] == 0)
].copy()

# === 3️⃣ Pivot to daily M×N matrices (new datasets) ===
price_df = df_filtered.pivot(index='date', columns='mandi', values='modal_price').copy()
arrival_df = df_filtered.pivot(index='date', columns='mandi', values='quantity').copy()

price_df.sort_index(inplace=True)
arrival_df.sort_index(inplace=True)

# === 4️⃣ Save pivoted datasets as new Parquet files ===
price_output_file = "price_matrix.parquet"
arrival_output_file = "arrival_matrix.parquet"

price_df.to_parquet(price_output_file, index=True)
arrival_df.to_parquet(arrival_output_file, index=True)

print(f"✅ Price matrix saved to {price_output_file}")
print(f"✅ Arrival matrix saved to {arrival_output_file}")


✅ Price matrix saved to price_matrix.parquet
✅ Arrival matrix saved to arrival_matrix.parquet


In [None]:
## SORT DATASET 

import pandas as pd

# Load the Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset.parquet"
df = pd.read_parquet(file_path)

# Ensure 'date' column is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Sort by date, state, district, mandi
df.sort_values(by=['state', 'district', 'mandi', 'date'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Save sorted file back (Parquet or Excel)
df.to_parquet(file_path, index=False)  # overwrite the original
# Or save as Excel (beware Excel row limits)
# df.to_excel(file_path.replace('.parquet', '_sorted.xlsx'), index=False)

print("✅ Dataset sorted by date, state, district, mandi.")
print(df.head())


In [None]:
## SORT DATASET BY DATE, STATE, DISTRICT, MANDI
## ADD NEW UNIQUE COLUMN IN 0TH POSITION
## REMOVE SOURCE COLUMN IF IT EXISTS
## CONVERT DATE TO DATETIME

import pandas as pd

# Load merged Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset.parquet"
df = pd.read_parquet(file_path)

# Remove 'source' column if it exists
if 'source' in df.columns:
    df.drop(columns=['source'], inplace=True)

# Ensure 'date' is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Extract year from date
df['year'] = df['date'].dt.year.fillna(0).astype(int)  # fallback to 0 if date is missing

# Add a unique ID column as YEAR_000001, YEAR_000002, …
df.insert(0, 'unique_id', [f"{year}_{i:06d}" for year, i in zip(df['year'], range(1, len(df) + 1))])

# Drop temporary 'year' column
df.drop(columns=['year'], inplace=True)

# Sort by date, state, district, mandi
df.sort_values(by=['date', 'state', 'district', 'mandi'], inplace=True)
df.reset_index(drop=True, inplace=True)

# Save back to Parquet
df.to_parquet(file_path, index=False)

print("✅ Unique ID added with year, 'source' column removed, dataset sorted and saved.")
print(df.head())


✅ Unique ID added with year, 'source' column removed, dataset sorted and saved.
     unique_id              state    district                   mandi  \
0  2018_000001  Arunachal Pradesh  Papum Pore              Naharlagun   
1  2018_000002              Assam     Barpeta                   Howly   
2  2018_000003              Assam      Dhubri                Gauripur   
3  2018_000004              Assam   Dibrugarh               Dibrugarh   
4  2018_000005              Assam      Kamrup  P.O. Uparhali Guwahati   

   modal_price  quantity       date  
0       6000.0    5750.0 2018-01-01  
1       4200.0    4100.0 2018-01-01  
2       4300.0    4000.0 2018-01-01  
3       4000.0    3600.0 2018-01-01  
4       4100.0    4050.0 2018-01-01  


In [25]:
## SEARCH FOR DUPLICATE ROWS

import pandas as pd

# Load the Parquet file
df = pd.read_parquet(r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\national_price_daily_wo_fill.parquet")

# Count duplicates per mandi-date pair
duplicate_counts = df.duplicated(subset=['mandi', 'date'], keep=False)

# Total number of duplicate rows
total_duplicates = duplicate_counts.sum()

print("Total duplicate mandi-date rows:", total_duplicates)


KeyError: Index(['mandi'], dtype='object')

In [15]:
## SERIES 1 - FIND MISSING DATES FOR EACH MANDI
## SERIES 1.1

## CONVERT DATE TO DATETIME
## REMOVE ROWS WITH INVALID OR MISSING MANDI

import pandas as pd

# Load Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_continuous_flagged.parquet"
df = pd.read_parquet(file_path)

# Ensure date is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Remove rows with invalid or missing mandi
df = df[df['mandi'].notna()]


In [16]:
## SERIES 1.2

# Full date range
full_dates = pd.date_range(start='2018-01-01', end='2024-12-31', freq='D')

In [17]:
## SERIES 1.3

# Get unique mandis
mandis = df['mandi'].unique()

# Create a DataFrame with all combinations of mandi and full date range
all_combinations = pd.MultiIndex.from_product([mandis, full_dates], names=['mandi', 'date']).to_frame(index=False)


In [18]:
## SERIES 1.4

# Merge with existing data to find which combinations are missing
merged = pd.merge(all_combinations, df[['mandi', 'date']], on=['mandi', 'date'], how='left', indicator=True)

# Rows missing in the dataset
missing_dates = merged[merged['_merge'] == 'left_only']

print(f"Total missing mandi-date entries: {len(missing_dates)}")


Total missing mandi-date entries: 0


In [20]:
## SERIES 1.5

missing_per_mandi = missing_dates.groupby('mandi').size().sort_values(ascending=False)
print("\nMissing dates per mandi:")
print(missing_per_mandi)



Missing dates per mandi:
Series([], dtype: int64)


In [30]:
import pandas as pd

# Load merged Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset.parquet"
df = pd.read_parquet(file_path)

# Ensure 'date' is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Remove rows with missing mandi
df = df[df['mandi'].notna()]

# Columns to keep for filling
cols_to_fill = ['modal_price', 'quantity']  # rename to your actual column names if different

# Result list
all_mandis_df = []

# Full date range
full_dates = pd.date_range(start='2018-01-01', end='2024-12-31', freq='D')

# Process each mandi separately
for mandi in df['mandi'].unique():
    mandi_df = df[df['mandi'] == mandi].copy()
    
    # Reindex by full date range
    mandi_df = mandi_df.set_index('date').reindex(full_dates)
    
    # Keep mandi info
    mandi_df['mandi'] = mandi
    if 'state' in df.columns:
        mandi_df['state'] = mandi_df['state'].ffill()  # forward-fill state
    if 'district' in df.columns:
        mandi_df['district'] = mandi_df['district'].ffill()  # forward-fill district
    
    # Fill price and arrival
    if 'modal_price' in mandi_df.columns:
        mandi_df['modal_price'] = mandi_df['modal_price'].ffill()  # forward-fill price
    if 'quantity' in mandi_df.columns:
        mandi_df['quantity'] = mandi_df['quantity'].interpolate(method='linear').fillna(0)  # interpolate arrival, fill remaining 0
    
    # Reset index to have date as column
    mandi_df.reset_index(inplace=True)
    mandi_df.rename(columns={'index': 'date'}, inplace=True)
    
    # Append to list
    all_mandis_df.append(mandi_df)

# Concatenate all mandis
continuous_df = pd.concat(all_mandis_df, ignore_index=True)

# Sort by date, state, district, mandi
sort_cols = [c for c in ['date', 'state', 'district', 'mandi'] if c in continuous_df.columns]
continuous_df.sort_values(by=sort_cols, inplace=True)
continuous_df.reset_index(drop=True, inplace=True)

# Save to Parquet
output_file = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_continuous.parquet"
continuous_df.to_parquet(output_file, index=False)

print("✅ Continuous time series created and saved.")
print(continuous_df.head())


ValueError: cannot reindex on an axis with duplicate labels

In [20]:
import pandas as pd

# Load merged Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset.parquet"
df = pd.read_parquet(file_path)

# Ensure 'date' is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Remove rows with missing mandi
df = df[df['mandi'].notna()]

# Full date range
full_dates = pd.date_range(start='2018-01-01', end='2024-12-31', freq='D')

all_mandis_df = []

# Process each mandi separately
for mandi in df['mandi'].unique():
    mandi_df = df[df['mandi'] == mandi].copy()
    
    # Aggregate duplicates by date (mean for numeric, first for strings)
    agg_dict = {col: 'first' for col in ['state', 'district']}  # adjust if needed
    for col in ['modal_price', 'quantity']:
        if col in mandi_df.columns:
            agg_dict[col] = 'mean'
    
    mandi_df = mandi_df.groupby('date', as_index=False).agg(agg_dict)
    
    # Reindex by full date range
    mandi_df = mandi_df.set_index('date').reindex(full_dates)
    
    # Fill mandi info
    mandi_df['mandi'] = mandi
    if 'state' in mandi_df.columns:
        mandi_df['state'] = mandi_df['state'].ffill()
    if 'district' in mandi_df.columns:
        mandi_df['district'] = mandi_df['district'].ffill()
    
    # Fill numeric columns
    if 'modal_price' in mandi_df.columns:
        mandi_df['modal_price'] = mandi_df['modal_price'].ffill()
    if 'quantity' in mandi_df.columns:
        mandi_df['quantity'] = mandi_df['quantity'].interpolate(method='linear').fillna(0)
    
    # Reset index
    mandi_df.reset_index(inplace=True)
    mandi_df.rename(columns={'index': 'date'}, inplace=True)
    
    all_mandis_df.append(mandi_df)

# Concatenate all mandis
continuous_df = pd.concat(all_mandis_df, ignore_index=True)

# Sort
sort_cols = [c for c in ['date', 'state', 'district', 'mandi'] if c in continuous_df.columns]
continuous_df.sort_values(by=sort_cols, inplace=True)
continuous_df.reset_index(drop=True, inplace=True)

# Save
output_file = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_continuous.parquet"
continuous_df.to_parquet(output_file, index=False)

print("✅ Continuous time series created and saved.")
print(continuous_df.head())


  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df['district'].ffill()
  mandi_df['state'] = mandi_df['state'].ffill()
  mandi_df['district'] = mandi_df[

✅ Continuous time series created and saved.
        date              state    district  modal_price  quantity  \
0 2018-01-01  Arunachal Pradesh  Papum Pore       6000.0    5750.0   
1 2018-01-01              Assam     Barpeta       4200.0    4100.0   
2 2018-01-01              Assam      Dhubri       4300.0    4000.0   
3 2018-01-01              Assam   Dibrugarh       4000.0    3600.0   
4 2018-01-01              Assam      Kamrup       4100.0    4050.0   

                    mandi  
0              Naharlagun  
1                   Howly  
2                Gauripur  
3               Dibrugarh  
4  P.O. Uparhali Guwahati  


In [28]:
import pandas as pd

# Load the Parquet file
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset.parquet"
df = pd.read_parquet(file_path)

# Check original shape
print("Original shape:", df.shape)

# Aggregate duplicates by mandi and date, taking the mean of modal_price and quantity
df_agg = df.groupby(['mandi', 'date'], as_index=False).agg({
    'modal_price': 'mean',  # take mean of modal_price
    'quantity': 'mean',     # take mean of quantity
    'state': 'first',       # keep state info
    'district': 'first',    # keep district info
    'unique_id': 'first'    # keep one unique_id
})

# Check new shape
print("Shape after aggregation:", df_agg.shape)

# Optional: sort by mandi and date
df_agg = df_agg.sort_values(by=['mandi', 'date']).reset_index(drop=True)

# Save the aggregated data to a new Parquet file
output_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_aggregated.parquet"
df_agg.to_parquet(output_path, index=False)

print(f"Aggregated file saved as '{output_path}'")


Original shape: (1457550, 7)
Shape after aggregation: (1374963, 7)
Aggregated file saved as 'C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_aggregated.parquet'


In [32]:
# Strip leading/trailing spaces and lowercase strings for consistency
df['mandi'] = df['mandi'].str.strip().str.lower()
df['state'] = df['state'].str.strip().str.lower()
df['district'] = df['district'].str.strip().str.lower()


In [33]:
df_agg = df.groupby(['mandi', 'date'], as_index=False).agg({
    'modal_price': 'mean',
    'quantity': 'mean',
    'state': 'first',
    'district': 'first',
    'unique_id': 'first'
})


In [34]:
# Count any remaining mandi-date duplicates
remaining_duplicates = df_agg.duplicated(subset=['mandi', 'date']).sum()
print("Remaining mandi-date duplicates:", remaining_duplicates)


Remaining mandi-date duplicates: 0


In [35]:
output_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_aggregated_clean.parquet"
df_agg.to_parquet(output_path, index=False)
print(f"Cleaned and aggregated file saved as '{output_path}'")


Cleaned and aggregated file saved as 'C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_aggregated_clean.parquet'


In [12]:
## FLAGGED CONTINUOUS 
## 7 DAYS FFILL
## 2 DAYS BFILL

import pandas as pd

# === Load merged Parquet file ===
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_distinct.parquet"
df = pd.read_parquet(file_path)

# Ensure 'date' is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Remove rows with missing mandi
df = df[df['mandi'].notna()]

# Full date range for all mandis
full_dates = pd.date_range(start='2018-01-01', end='2024-12-31', freq='D')

# Container for all processed mandis
all_mandis_df = []

# === Process each mandi separately ===
for mandi in df['mandi'].unique():
    mandi_df = df[df['mandi'] == mandi].copy()
    mandi_df = mandi_df.set_index('date').sort_index()

    # Reindex with full date range
    mandi_df = mandi_df.reindex(full_dates)

    # Restore mandi info
    mandi_df['mandi'] = mandi
    if 'state' in df.columns:
        mandi_df['state'] = mandi_df['state'].ffill()
    if 'district' in df.columns:
        mandi_df['district'] = mandi_df['district'].ffill()

    # === Price Imputation ===
    if 'modal_price' in mandi_df.columns:
        # Keep flag for missing values before fill
        mandi_df['price_missing_flag'] = mandi_df['modal_price'].isna().astype(int)

        # Forward-fill up to 7 days, then stop
        mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
            method='ffill', limit=7
        )

        # Optional: backfill short early gaps
        mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
            method='bfill', limit=2
        )

    # === Quantity Imputation ===
    if 'quantity' in mandi_df.columns:
        mandi_df['quantity_missing_flag'] = mandi_df['quantity'].isna().astype(int)

        # Interpolate linearly (for short gaps)
        mandi_df['quantity'] = mandi_df['quantity'].interpolate(method='linear', limit=7)
        mandi_df['quantity'] = mandi_df['quantity'].fillna(0)  # long gaps -> 0

    # Reset index to have 'date' column again
    mandi_df.reset_index(inplace=True)
    mandi_df.rename(columns={'index': 'date'}, inplace=True)

    all_mandis_df.append(mandi_df)

# === Combine all mandis ===
continuous_df = pd.concat(all_mandis_df, ignore_index=True)

# Sort cleanly
sort_cols = [c for c in ['date', 'state', 'district', 'mandi'] if c in continuous_df.columns]
continuous_df.sort_values(by=sort_cols, inplace=True)
continuous_df.reset_index(drop=True, inplace=True)

# === Save final continuous dataset ===
output_file = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_continuous_flagged.parquet"
continuous_df.to_parquet(output_file, index=False)

print("✅ Continuous time series (with flags & limited fill) created and saved.")
print(continuous_df.head())


  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['modal_price'].fillna(
  mandi_df['modal_price'] = mandi_df['mo

✅ Continuous time series (with flags & limited fill) created and saved.
        date                   mandi  modal_price  quantity  \
0 2018-01-01              Naharlagun       6000.0    5750.0   
1 2018-01-01                   Howly       4200.0    4100.0   
2 2018-01-01                Gauripur       4300.0    4000.0   
3 2018-01-01               Dibrugarh       4000.0    3600.0   
4 2018-01-01  P.O. Uparhali Guwahati       4100.0    4050.0   

               state    district    unique_id  price_missing_flag  \
0  Arunachal Pradesh  Papum Pore  2018_000001                   0   
1              Assam     Barpeta  2018_000002                   0   
2              Assam      Dhubri  2018_000003                   0   
3              Assam   Dibrugarh  2018_000004                   0   
4              Assam      Kamrup  2018_000005                   0   

   quantity_missing_flag  
0                      0  
1                      0  
2                      0  
3                      0  

In [None]:
## DAILY NATIONAL PRICE INDEX WITH FILL

import pandas as pd

# === Load your continuous dataset ===
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_distinct_continuous_flagged.parquet"
df = pd.read_parquet(file_path)

# Ensure proper types
df['date'] = pd.to_datetime(df['date'], errors='coerce')
df = df[df['modal_price'].notna() & df['quantity'].notna()]

# === Compute the daily weighted national price index ===
# Formula: national_price_index = Σ(price_i × arrival_i) / Σ(arrival_i)
daily_index = (
    df.groupby('date')
      .apply(lambda x: (x['modal_price'] * x['quantity']).sum() / x['quantity'].sum())
      .reset_index(name='national_price_index')
)

# === Save the result ===
output_file = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\national_price_index_daily.parquet"
daily_index.to_parquet(output_file, index=False)

print("✅ Daily weighted national price index created and saved.")
print(daily_index.head())


  .apply(lambda x: (x['modal_price'] * x['quantity']).sum() / x['quantity'].sum())
  .apply(lambda x: (x['modal_price'] * x['quantity']).sum() / x['quantity'].sum())


✅ Daily weighted national price index created and saved.
        date  national_price_index
0 2018-01-01           3685.328059
1 2018-01-02           3719.947232
2 2018-01-03           3786.077593
3 2018-01-04           3830.038413
4 2018-01-05           3915.649068


In [24]:
## DAILY NATIONAL PRICE INDEX WITHOUT FILL

import pandas as pd

# === Load the continuous flagged dataset ===
file_path = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\master_dataset_distinct_continuous_flagged.parquet"
df = pd.read_parquet(file_path)

# Ensure 'date' is datetime
df['date'] = pd.to_datetime(df['date'], errors='coerce')

# Remove rows with missing prices or quantities
df = df[df['modal_price'].notna() & df['quantity'].notna()]

# === Exclude imputed prices ===
df = df[df['price_missing_flag'] == 0]

# === Compute daily weighted national price index ===
# Formula: national_price_index = Σ(price_i × arrival_i) / Σ(arrival_i)
daily_index = (
    df.groupby('date')
      .apply(lambda x: (x['modal_price'] * x['quantity']).sum() / x['quantity'].sum())
      .reset_index(name='national_price_index')
)

# === Save the result ===
output_file = r"C:\Users\PESU-RF\Downloads\Onions\Onions\processed_data\national_price_daily_wo_fill.parquet"
daily_index.to_parquet(output_file, index=False)

print("✅ Daily weighted national price index (excluding imputed prices) created and saved.")
print(daily_index.head())


✅ Daily weighted national price index (excluding imputed prices) created and saved.
        date  national_price_index
0 2018-01-01           3685.328059
1 2018-01-02           3600.185105
2 2018-01-03           3792.483300
3 2018-01-04           3837.074650
4 2018-01-05           3909.109786


  .apply(lambda x: (x['modal_price'] * x['quantity']).sum() / x['quantity'].sum())
