In [None]:
# NAME KHUSHI KAMBLE
# ROLL NO 391022
# ASSIGNMENT 4
# Implement a custom iterator to process the Mall Customer Segmentation dataset
# (https://www.kaggle.com/vjchoudhary7/customer-segmentation-tutorial-in-python).
# The iterator should stream data in chunks and allow for incremental analysis.

# Requirements:
# - Create a `ChunkIterator` class to iterate over chunks of the dataset.
# - Implement a method to calculate and return basic statistics for each chunk.

In [None]:
import pandas as pd

class ChunkIterator:
    """Stream a CSV file as a sequence of pandas DataFrame chunks.

    The file is opened when the object is constructed and is read lazily,
    ``chunk_size`` rows at a time. The object is a single-pass iterator:
    ``__iter__`` returns the object itself, so once it is exhausted a second
    loop over the same instance yields nothing; build a new instance to read
    the file again.

    The object can also be used as a context manager, which closes the
    underlying pandas reader even when iteration stops early.
    """

    def __init__(self, file_path: str, chunk_size: int = 100):
        """Open ``file_path`` for chunked reading.

        Args:
            file_path: Path of the CSV file to stream.
            chunk_size: Maximum number of rows per yielded chunk.

        Errors raised by ``pd.read_csv`` propagate from here, because the
        reader is created in the constructor.
        """
        self.file_path = file_path
        self.chunk_size = chunk_size
        # With ``chunksize`` set, read_csv returns a lazy reader instead of a DataFrame.
        self.iterator = pd.read_csv(self.file_path, chunksize=self.chunk_size)

    def __iter__(self):
        """Return the iterator itself (standard iterator protocol)."""
        return self

    def __next__(self) -> pd.DataFrame:
        """Return the next chunk; ``StopIteration`` propagates when none remain."""
        return next(self.iterator)

    def close(self) -> None:
        """Close the underlying pandas reader and release its file handle."""
        self.iterator.close()

    def __enter__(self):
        """Enter a ``with`` block; the iterator itself is the managed object."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Close the reader on leaving a ``with`` block; exceptions are not suppressed."""
        self.close()

    def calculate_statistics(self, chunk: pd.DataFrame) -> dict:
        """Compute basic descriptive statistics for every numeric column.

        Args:
            chunk: DataFrame to summarise.

        Returns:
            A dict mapping each numeric column name to a dict with the keys
            ``"mean"``, ``"median"``, ``"std_dev"``, ``"min"`` and ``"max"``.
            Non-numeric columns are skipped, so the result is empty when the
            frame has no numeric columns.
        """
        statistics = {}
        # "number" selects every numeric dtype (int32, float32, ...), not only
        # the 64-bit ones.
        for column in chunk.select_dtypes(include="number").columns:
            series = chunk[column]
            statistics[column] = {
                "mean": series.mean(),
                "median": series.median(),
                # pandas' default (sample) standard deviation; NaN for a one-row chunk.
                "std_dev": series.std(),
                "min": series.min(),
                "max": series.max(),
            }
        return statistics


In [None]:
# Dataset location: keep the CSV next to the notebook, or replace this with an absolute path.
file_path = "Mall_Customers.csv"

# Stream the file 50 rows at a time.
chunk_iterator = ChunkIterator(file_path, 50)

# Report the per-column statistics of every chunk in turn.
for chunk in chunk_iterator:
    stats = chunk_iterator.calculate_statistics(chunk)
    print("Statistics for current chunk:")
    # One output line per summarised column.
    for col in stats:
        stat = stats[col]
        print(f"{col}: {stat}")
    print("\n--- End of Chunk ---\n")


Chunk statistics:
CustomerID: {'mean': 25.5, 'median': 25.5, 'std_dev': 14.577379737113251, 'min': 1, 'max': 50}
Age: {'mean': 35.28, 'median': 31.0, 'std_dev': 13.751497135562651, 'min': 18, 'max': 67}
Annual Income (k$): {'mean': 27.4, 'median': 28.0, 'std_dev': 8.369039151929606, 'min': 15, 'max': 40}
Spending Score (1-100): {'mean': 49.48, 'median': 44.5, 'std_dev': 30.217740430659173, 'min': 3, 'max': 99}

---

Chunk statistics:
CustomerID: {'mean': 75.5, 'median': 75.5, 'std_dev': 14.577379737113251, 'min': 51, 'max': 100}
Age: {'mean': 44.22, 'median': 47.0, 'std_dev': 16.232104746144163, 'min': 18, 'max': 70}
Annual Income (k$): {'mean': 51.72, 'median': 54.0, 'std_dev': 6.023897985422355, 'min': 42, 'max': 61}
Spending Score (1-100): {'mean': 50.38, 'median': 50.5, 'std_dev': 5.8340660375601034, 'min': 40, 'max': 60}

---

Chunk statistics:
CustomerID: {'mean': 125.5, 'median': 125.5, 'std_dev': 14.577379737113251, 'min': 101, 'max': 150}
Age: {'mean': 38.58, 'median': 36.5, '