In [10]:
# Colab-only setup: attach Google Drive at /content/drive so that paths under
# that mount point can be opened like ordinary local files.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install dask

In [12]:
import pandas as pd
import numpy as np
import dask.dataframe as dd
from multiprocessing import Pool, cpu_count
from time import time

In [14]:
def preprocess_chunk(chunk):
    """Clean one chunk of data and add a z-scored ``temperature_2m`` column.

    The caller's DataFrame is never modified: each step builds a new frame
    with ``assign``/``dropna`` instead of writing columns into ``chunk``.
    This also avoids assigning into the result of ``dropna``, which is the
    pattern that triggers pandas' ``SettingWithCopyWarning``.

    Parameters
    ----------
    chunk : pandas.DataFrame
        Must contain a ``date`` column and a ``temperature_2m`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame in which ``date`` is parsed to datetimes (values that
        cannot be parsed become ``NaT``), rows with a missing
        ``temperature_2m`` are removed, and a
        ``temperature_2m_normalized`` column is appended.

    Notes
    -----
    The mean and standard deviation come from this chunk only, so values
    normalized in different chunks are not on a common scale. When the
    chunk's standard deviation is zero or undefined (constant or
    single-row chunk) the normalized values are not meaningful
    (typically NaN).
    """
    # Parse dates on the full chunk first, exactly as the row-dropping step
    # expects, so date parsing sees the same values regardless of which rows
    # are later removed.
    with_dates = chunk.assign(date=pd.to_datetime(chunk['date'], errors='coerce'))

    # Rows without a temperature cannot be normalized; drop them.
    cleaned = with_dates.dropna(subset=['temperature_2m'])

    temperatures = cleaned['temperature_2m']
    z_scores = (temperatures - temperatures.mean()) / temperatures.std()

    # assign() returns a fresh frame, leaving `cleaned` (and `chunk`) intact.
    return cleaned.assign(temperature_2m_normalized=z_scores)

def parallel_process_multiprocessing(
    file_path,
    chunksize,
    output_path="/content/drive/MyDrive/P&DC /Theory/project/processed_multiprocessing.csv",
):
    """Preprocess a CSV in parallel with a ``multiprocessing`` worker pool.

    The file is read in chunks of ``chunksize`` rows, each chunk is passed to
    ``preprocess_chunk`` in a pool with one worker per CPU, and the processed
    chunks are concatenated and written to ``output_path`` as a single CSV.
    The elapsed wall-clock time is printed.

    Parameters
    ----------
    file_path : str
        Path of the input CSV file.
    chunksize : int
        Number of rows per chunk handed to each worker task.
    output_path : str, optional
        Destination CSV path. Defaults to the location this notebook has
        always written to, so existing calls behave as before.

    Returns
    -------
    None
    """
    start_time = time()

    # The chunked reader keeps the input file open; using it as a context
    # manager guarantees the handle is closed even if a worker raises.
    with pd.read_csv(file_path, chunksize=chunksize) as reader:
        with Pool(cpu_count()) as pool:
            # Pool.map consumes the whole iterable before dispatching, so all
            # chunks are read here, while the reader is still open.
            processed_chunks = pool.map(preprocess_chunk, reader)

    # Concatenate all processed chunks
    result = pd.concat(processed_chunks)

    # Save the processed data to a CSV file
    result.to_csv(output_path, index=False)

    print(f"Multiprocessing completed in {time() - start_time:.2f} seconds")

def parallel_process_dask(
    file_path,
    output_path="/content/drive/MyDrive/P&DC /Theory/project/processed_dask.csv",
):
    """Preprocess a CSV in parallel with Dask.

    The file is loaded as a Dask DataFrame, ``preprocess_chunk`` is applied to
    every partition, and the computed result is written to ``output_path`` as
    a single CSV. The elapsed wall-clock time is printed.

    Parameters
    ----------
    file_path : str
        Path of the input CSV file.
    output_path : str, optional
        Destination CSV path. Defaults to the location this notebook has
        always written to, so existing calls behave as before.

    Returns
    -------
    None
    """
    start_time = time()

    partitions = dd.read_csv(file_path)
    processed = partitions.map_partitions(preprocess_chunk)

    # compute() collects every partition into one pandas DataFrame, so the
    # whole result is held in memory before the single CSV is written.
    processed.compute().to_csv(output_path, index=False)

    print(f"Dask completed in {time() - start_time:.2f} seconds")

if __name__ == "__main__":
    # Both back ends read the same source CSV.
    input_file = "/content/drive/MyDrive/P&DC /Theory/project/data.csv"

    # Back end 1: pandas chunks fanned out over a multiprocessing pool.
    print("Running multiprocessing...")
    parallel_process_multiprocessing(file_path=input_file, chunksize=10000)

    # Back end 2: Dask partitions.
    print("Running Dask...")
    parallel_process_dask(file_path=input_file)


Running multiprocessing...
Multiprocessing completed in 9.15 seconds
Running Dask...
Dask completed in 6.24 seconds
