In [None]:
from google.colab import files

# Bind the result instead of leaving the call as the cell's last expression:
# files.upload() returns a dict of {filename: raw file bytes}, and echoing it
# would write the contents of kaggle.json (the API token) into the notebook's
# saved output.
uploaded = files.upload()
print("Uploaded:", ", ".join(uploaded))


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"<REDACTED>","key":"<REDACTED>"}'}

In [None]:
# Create ~/.kaggle if it does not exist yet (-p: no error when already present).
!mkdir -p ~/.kaggle
# Move the uploaded kaggle.json from the working directory into ~/.kaggle.
!mv kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json


In [None]:
!pip install kaggle
!kaggle datasets download -d kimdaegyeom/5g-traffic-datasets -p /content --unzip


Dataset URL: https://www.kaggle.com/datasets/kimdaegyeom/5g-traffic-datasets
License(s): unknown
Downloading 5g-traffic-datasets.zip to /content
 98% 3.13G/3.21G [00:40<00:02, 35.0MB/s]
100% 3.21G/3.21G [00:40<00:00, 84.3MB/s]


# **Method 1 — Chunksize**

In [None]:
# --- Setup ---
!pip install pandas

# --- Imports ---
import pandas as pd
import time
import psutil  # for memory usage

# --- File path ---
file_path = r"/content/5G_Traffic_Datasets/Video_Conferencing/Google_Meet/Google_Meet_1.csv"  # adjust if needed

# --- Memory before reading ---
process = psutil.Process()
mem_before = process.memory_info().rss / (1024 * 1024)  # in MB

# --- Pandas with chunksize ---
chunk_size = 100000
means = []
start = time.time()

# Try reading with a different encoding
try:
    for chunk in pd.read_csv(file_path, chunksize=chunk_size, encoding='latin-1'):
        means.append(chunk['Length'].mean())  # using 'Length' column

    pandas_mean = sum(means) / len(means)
    pandas_time = time.time() - start

    # --- Memory after reading ---
    mem_after = process.memory_info().rss / (1024 * 1024)  # in MB
    mem_used = mem_after - mem_before

    # --- Results ---
    print(f"‚úÖ Mean (Length): {pandas_mean}")
    print(f"‚è±Ô∏è Time with Pandas (chunksize): {pandas_time:.2f} seconds")
    print(f"üíæ Memory used: {mem_used:.2f} MB")

except UnicodeDecodeError:
    print("‚ùå UnicodeDecodeError: Could not decode the file with latin-1 encoding. Consider trying another encoding or inspecting the file content.")
except KeyError:
    print("‚ùå KeyError: The 'Length' column was not found in the DataFrame. Please check the column name.")

✅ Mean (Length): 462.7395455907966
⏱️ Time with Pandas (chunksize): 92.93 seconds
💾 Memory used: 22.40 MB


# **Method 2 — Dask**

In [None]:
import dask.dataframe as dd
import time, psutil, os

# --- Define file path ---
file_path = r"/content/5G_Traffic_Datasets/Video_Conferencing/Google_Meet/Google_Meet_1.csv"

# --- Memory before ---
# Resident set size of this process, in MB, before any data is touched.
process = psutil.Process()
mem_before = process.memory_info().rss / (1024 * 1024)

# --- Start timer ---
start = time.time()

# --- Read CSV using Dask ---
# No blocksize argument is passed, so Dask's default partitioning applies.
df = dd.read_csv(file_path, encoding='latin-1', assume_missing=True)

# --- Compute mean of 'Length' column ---
# .compute() runs the reduction and returns the concrete value.
dask_mean = df['Length'].mean().compute()

# --- Calculate time and memory ---
# The timer covers both the read_csv call and the compute step above.
dask_time = time.time() - start
mem_after = process.memory_info().rss / (1024 * 1024)
dask_mem = mem_after - mem_before

# --- Print results ---
print("✅ Dask finished")
print(f"Mean: {dask_mean:.4f}")
print(f"Time: {dask_time:.2f} seconds")
print(f"Memory Used: {dask_mem:.2f} MB")


✅ Dask finished
Mean: 462.7372
Time: 125.72 seconds
Memory Used: 219.90 MB


# **Method 3 — Compressed CSV (gzip)**

In [None]:
import gzip, shutil, time, psutil, os
import pandas as pd

# --- Define file paths ---
file_path = r"/content/5G_Traffic_Datasets/Video_Conferencing/Google_Meet/Google_Meet_1.csv"
compressed_file = file_path + ".gz"

# --- Compress file (only once) ---
# Write to a temporary name and rename on success, so an interrupted run never
# leaves a truncated .gz behind that the exists() guard would then keep reusing.
if not os.path.exists(compressed_file):
    partial_file = compressed_file + ".part"
    with open(file_path, 'rb') as f_in, gzip.open(partial_file, 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)
    os.replace(partial_file, compressed_file)

# --- Measure memory before ---
# Taken after compression, so only the read below is attributed to the result.
process = psutil.Process()
mem_before = process.memory_info().rss / (1024 * 1024)

# --- Read and process the entire compressed CSV ---
# The timer likewise starts after compression and covers only the read + mean.
start = time.time()

try:
    # Full (non-chunked) read: the whole decompressed table is held in `df`.
    df = pd.read_csv(compressed_file, compression='gzip', encoding='latin-1')
    compressed_mean = df['Length'].mean()

    compressed_time = time.time() - start
    mem_after = process.memory_info().rss / (1024 * 1024)
    compressed_mem = mem_after - mem_before

    # --- File size info ---
    original_size = os.path.getsize(file_path) / (1024 * 1024)
    compressed_size = os.path.getsize(compressed_file) / (1024 * 1024)

    # --- Print results ---
    print("✅ Compressed CSV (gzip, full read) finished")
    print(f"Mean: {compressed_mean:.4f}")
    print(f"Time: {compressed_time:.2f} seconds")
    print(f"Memory Used: {compressed_mem:.2f} MB")
    print(f"Original Size: {original_size:.2f} MB → Compressed Size: {compressed_size:.2f} MB")

except UnicodeDecodeError:
    print("❌ UnicodeDecodeError: Could not decode the file with latin-1 encoding either. The file might be in a different encoding or contain binary data.")
    print("Consider trying another encoding or inspecting the file content.")
except KeyError:
    # Same handling as the chunked-pandas cell, so a missing column is
    # reported the same way in both.
    print("❌ KeyError: The 'Length' column was not found in the DataFrame. Please check the column name.")

✅ Compressed CSV (gzip, full read) finished
Mean: 462.7372
Time: 122.74 seconds
Memory Used: 8007.27 MB
Original Size: 4526.62 MB → Compressed Size: 420.04 MB
