<a href="https://colab.research.google.com/github/iramishra1/VC/blob/main/week6.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# Install dependencies
# NOTE: the leading "!" is a notebook shell escape, so this line only runs
# inside an IPython/Colab cell; it is not valid in a plain Python script.
!pip install pandas dask modin[all] pyyaml pandera fastparquet

# Mount Google Drive
# Every path used later in this cell lives under /content/drive/MyDrive, so
# the mount has to succeed before any file is read or written.
from google.colab import drive
drive.mount('/content/drive')

# Import libraries
import os
import pandas as pd
import time
import yaml
import pandera as pa
from pandera import Column, DataFrameSchema
import ray
import modin.config as modin_cfg
import dask.dataframe as dd
import modin.pandas as mpd

# Make sure the target folder on the mounted Drive exists (no error if it
# is already there).
folder_path = '/content/drive/MyDrive/datasets'
os.makedirs(folder_path, exist_ok=True)

# Tiny e-commerce orders sample, written row by row and then pivoted into
# the column -> values mapping that the DataFrame constructor receives.
_order_columns = (
    'order_id', 'customer_id', 'product', 'price', 'quantity', 'order_date',
)
_order_rows = [
    (1001, 501, 'Phone Case', 12.99, 2, '2024-11-01'),
    (1002, 502, 'Wireless Mouse', 25.00, 1, '2024-11-02'),
    (1003, 501, 'Monitor', 120.00, 1, '2024-11-02'),
]
ecommerce_data = {
    column: list(values)
    for column, values in zip(_order_columns, zip(*_order_rows))
}

ecommerce_df = pd.DataFrame(ecommerce_data)

# Write the sample out as CSV (without the index column); the readers
# below load it back from this path.
file_path = os.path.join(folder_path, 'ecommerce_orders.csv')
ecommerce_df.to_csv(file_path, index=False)

# Setup Modin with Ray
# Select Ray as Modin's execution engine, then start Ray. With
# ignore_reinit_error=True, re-running this cell does not raise when Ray
# is already initialised.
modin_cfg.Engine.put("ray")
ray.init(ignore_reinit_error=True)

# The three timings below use time.perf_counter(): a monotonic,
# high-resolution clock meant for measuring short durations. time.time()
# is wall-clock time and can jump if the system clock is adjusted. Each
# timer starts after the status message so console output is not measured.

# Timing: Pandas
print("Reading with pandas...")
start = time.perf_counter()
pandas_df = pd.read_csv(file_path)
print(f"Pandas done in {time.perf_counter() - start:.2f} seconds")

# Timing: Dask
print("Reading with dask...")
start = time.perf_counter()
dask_df = dd.read_csv(file_path)
dask_df_head = dask_df.head()  # Trigger compute
print(f"Dask done in {time.perf_counter() - start:.2f} seconds")

# Timing: Modin
print("Reading with modin...")
start = time.perf_counter()
modin_df = mpd.read_csv(file_path)
print(f"Modin done in {time.perf_counter() - start:.2f} seconds")

# Example: Validate columns (change according to your file)
# Expected dtype for every column of the orders file.
schema = DataFrameSchema({
    "order_id": Column(int),
    "customer_id": Column(int),
    "product": Column(str),
    "price": Column(float),
    "quantity": Column(int),
    "order_date": Column(str),
})

# Convert dtypes if needed
# Every numeric column the schema checks is cast here, including
# customer_id, so the casts and the schema stay in agreement. Keeping the
# casts in one mapping means a new numeric column is added in one place.
_numeric_casts = {
    "order_id": int,
    "customer_id": int,
    "price": float,
    "quantity": int,
}
for _column, _dtype in _numeric_casts.items():
    pandas_df[_column] = pandas_df[_column].astype(_dtype)

# Apply schema validation
# The result of validate() is what the later export and summary steps use.
print("Validating schema...")
validated_df = schema.validate(pandas_df)
print("Schema valid.")

# Export the validated frame as a gzip-compressed, pipe-separated text file
# (no index column).
output_path = "/content/drive/MyDrive/datasets/cleaned_output.psv.gz"
_psv_options = {"sep": '|', "compression": 'gzip', "index": False}
print("Saving gzipped pipe-separated file...")
validated_df.to_csv(output_path, **_psv_options)
print("Saved.")

# Generate summary
# describe(include='all') covers numeric and non-numeric columns. It is
# computed once and rendered to text once, so the console output and the
# saved file are guaranteed to show the same table.
print("Generating summary:")
summary_text = str(validated_df.describe(include='all'))
print(summary_text)

# Optional: Save summary to a file
# The encoding is given explicitly so the file does not depend on the
# platform's default locale encoding.
summary_output = "/content/drive/MyDrive/datasets/summary.txt"
with open(summary_output, "w", encoding="utf-8") as f:
    f.write(summary_text)
print(f"Summary saved to {summary_output}")





Collecting pandera
  Downloading pandera-0.23.1-py3-none-any.whl.metadata (18 kB)
Collecting fastparquet
  Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.2 kB)
Collecting typing_inspect>=0.6.0 (from pandera)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting mypy-extensions>=0.3.0 (from typing_inspect>=0.6.0->pandera)
  Downloading mypy_extensions-1.0.0-py3-none-any.whl.metadata (1.1 kB)
Downloading pandera-0.23.1-py3-none-any.whl (264 kB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 264.2/264.2 kB 6.7 MB/s eta 0:00:00
Downloading fastparquet-2024.11.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.8 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 34.6 MB/s eta 0:00:00
Downloading typing_inspect-0.9.0-py3-none-any.whl (8.8 kB)
Downloading mypy_extensions-1.0.0-py3-none-any.whl (4.7 kB)

2025-04-16 00:31:09,338	INFO worker.py:1684 -- Calling ray.init() again after it has already been called.


Reading with pandas...
Pandas done in 0.01 seconds
Reading with dask...
Dask done in 1.58 seconds
Reading with modin...
Modin done in 2.16 seconds
Validating schema...
Schema valid.
Saving gzipped pipe-separated file...
Saved.
Generating summary:
        order_id  customer_id     product       price  quantity  order_date
count        3.0     3.000000           3    3.000000  3.000000           3
unique       NaN          NaN           3         NaN       NaN           2
top          NaN          NaN  Phone Case         NaN       NaN  2024-11-02
freq         NaN          NaN           1         NaN       NaN           2
mean      1002.0   501.333333         NaN   52.663333  1.333333         NaN
std          1.0     0.577350         NaN   58.623630  0.577350         NaN
min       1001.0   501.000000         NaN   12.990000  1.000000         NaN
25%       1001.5   501.000000         NaN   18.995000  1.000000         NaN
50%       1002.0   501.000000         NaN   25.000000  1.000000      