# Data Analysis Notebook

This notebook is designed to handle and analyze the large datasets found in this directory (`data1.csv` and `data2.csv`).

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Set plot style: apply seaborn's "whitegrid" theme to the plots created below.
# set_theme() is the preferred interface; sns.set() is only an alias for it
# that seaborn documents as a candidate for removal.
sns.set_theme(style="whitegrid")

## Define File Paths

In [None]:
DATA_DIR = "."

# Each CSV is assumed to sit inside a directory that carries the same name as
# the file itself (e.g. data1.csv/data1.csv), based on previous exploration.
FILE_1 = os.path.join(DATA_DIR, "data1.csv", "data1.csv")
FILE_2 = os.path.join(DATA_DIR, "data2.csv", "data2.csv")

# Report, one line per file, whether the expected path exists on disk.
print(
    "\n".join(
        ["Checking files:"]
        + [f"{csv_path}: {os.path.exists(csv_path)}" for csv_path in (FILE_1, FILE_2)]
    )
)

## Load Data Sample
Since the files are very large (~10 GB), we first load only the first 1,000 rows of each file to inspect its structure.

In [None]:
def load_sample(file_path, n_rows=1000):
    """Read the first ``n_rows`` rows of a CSV file.

    Args:
        file_path: Path of the CSV file to read.
        n_rows: Maximum number of data rows to load (default 1000).

    Returns:
        A DataFrame holding the sample, or ``None`` when reading fails.
        Failures are not raised; the error is printed instead.
    """
    sample = None
    try:
        sample = pd.read_csv(file_path, nrows=n_rows)
    except Exception as e:
        # Best-effort load: report the problem and fall through to return None.
        print(f"Error loading {file_path}: {e}")
    return sample

# Load a small sample of each file; load_sample returns None when a read fails.
df1_sample = load_sample(FILE_1)
df2_sample = load_sample(FILE_2)

if df1_sample is not None:
    print("Data 1 Sample:")
    display(df1_sample.head())
    # DataFrame.info() prints its report itself and returns None, so it must
    # not be wrapped in print() (that would emit a stray "None" line).
    df1_sample.info()

if df2_sample is not None:
    print("\nData 2 Sample:")
    display(df2_sample.head())
    df2_sample.info()

## Process Data in Chunks
To analyze the full dataset without loading it all into memory, we can iterate through it in fixed-size chunks, so that only one chunk is held in memory at a time.

In [None]:
# Module-level chunk size. Note that process_in_chunks does not read this
# variable; it has its own default of the same value.
chunk_size = 100000

def process_in_chunks(file_path, chunk_size=100000, max_chunks=None):
    """Stream a CSV file chunk by chunk and count the rows processed.

    Args:
        file_path: Path of the CSV file to read.
        chunk_size: Number of rows per chunk handed to pandas.
        max_chunks: Optional positive number of chunks to process before
            stopping, for a quick partial run. ``None`` (the default)
            processes the whole file.

    Returns:
        The total number of rows processed.

    Raises:
        ValueError: If ``max_chunks`` is given but is smaller than 1.
    """
    if max_chunks is not None and max_chunks < 1:
        raise ValueError(f"max_chunks must be a positive integer or None, got {max_chunks!r}")

    print(f"Processing {file_path} in chunks of {chunk_size}...")
    # Example: Count total rows or compute a simple statistic
    total_rows = 0
    # The chunked reader is used as a context manager so the underlying file
    # handle is closed even when iteration stops early or an error is raised.
    with pd.read_csv(file_path, chunksize=chunk_size) as reader:
        for chunk_number, chunk in enumerate(reader, start=1):
            # Perform your analysis here on 'chunk'
            total_rows += len(chunk)
            if max_chunks is not None and chunk_number >= max_chunks:
                break
    print(f"Total rows processed: {total_rows}")
    return total_rows

# Uncomment to run (this might take a while for 10GB files)
# process_in_chunks(FILE_1)

## Analyze Time Interval
The `TimeInterval` column contains Unix timestamps in milliseconds. Let's convert them to readable dates and verify the interval size.

In [None]:
# Convert TimeInterval (interpreted as epoch milliseconds via unit='ms') to datetime
if df1_sample is not None:
    # Adds a 'datetime' column to the sample frame; re-running simply overwrites it.
    df1_sample['datetime'] = pd.to_datetime(df1_sample['TimeInterval'], unit='ms')
    print("Converted Datetimes (First 5):")
    display(df1_sample[['TimeInterval', 'datetime']].head())

    # Calculate interval difference.
    # Missing values are dropped first: NaN does not compare consistently, so
    # sorted() over values that include NaN can return an unreliable order.
    unique_times = sorted(df1_sample['TimeInterval'].dropna().unique())
    if len(unique_times) > 1:
        # Spacing between the two smallest distinct timestamps in the sample.
        diff = unique_times[1] - unique_times[0]
        print(f"\nTime difference between steps: {diff} ms ({diff/1000/60} minutes)")

        # Verify the spacing over every consecutive pair, not just the first one.
        observed_steps = {later - earlier for earlier, later in zip(unique_times, unique_times[1:])}
        if len(observed_steps) > 1:
            step_list = ", ".join(str(step) for step in sorted(observed_steps))
            print(f"Warning: interval is not uniform within the sample; observed steps (ms): {step_list}")