# ***Complete Guide to Generative AI for Data Analysis and Data Science***

---


### **Chapter 6:** *Sampling of Data*

a) Sampling of Data

In [2]:
import pandas as pd

# ==== SETTINGS ====
file_name = '/content/06_02_healthcare_dataset.csv'   # CSV the sample is drawn from
num_rows = 100                                        # size of the random sample
output_file = 'sampled_output.csv'                    # where the sample is written

# ==== LOAD, SAMPLE, SAVE ====
# Any problem is reported as a printed message rather than a traceback.
try:
    df = pd.read_csv(file_name)

    if num_rows <= len(df):
        # A fixed random_state returns the same rows on every run.
        sampled_df = df.sample(n=num_rows, random_state=42)
        # index=False keeps the original row labels out of the saved file.
        sampled_df.to_csv(output_file, index=False)
        print(f"Random Sample of {num_rows} rows from '{file_name}':\n")
        print(sampled_df)
    else:
        # More rows were requested than the file contains: report and skip sampling.
        print(f"Requested {num_rows} rows, but file only has {len(df)} rows.")

except FileNotFoundError:
    print(f"File '{file_name}' not found.")
except Exception as err:
    print(f"An error occurred: {err}")

Random Sample of 100 rows from '/content/06_02_healthcare_dataset.csv':

     Type of Admission  Admission_Deposit
1501            Trauma               4052
2586            Urgent               4294
2653         Emergency               4784
1055            Trauma               6580
705             Urgent               5270
...                ...                ...
4740            Urgent               5534
2940            Trauma               4988
3456            Trauma               4500
373             Trauma               7207
79           Emergency               4385

[100 rows x 2 columns]


b) Comparing the Population and the Sample Statistics

In [3]:
import pandas as pd

# ==== SETTINGS ====
input_file = '/content/06_02_healthcare_dataset.csv'   # full dataset (the "source")
num_rows = 1000                                        # size of the random sample
output_file = 'sampled_output_updated.csv'             # destination for the sample
target_column = 'Admission_Deposit'                    # column whose stats are compared

# ==== LOAD, VALIDATE, SAMPLE, COMPARE ====
# Any problem is reported as a printed message rather than a traceback.
try:
    df = pd.read_csv(input_file)

    has_column = target_column in df.columns
    has_enough_rows = num_rows <= len(df)

    if has_column and has_enough_rows:
        # Draw the sample (fixed random_state => same rows every run) and persist it.
        sampled_df = df.sample(n=num_rows, random_state=42)
        sampled_df.to_csv(output_file, index=False)

        # Mean and standard deviation of the target column, for the full file
        # and for the sample. Both use pandas' default .std() settings.
        source_values = df[target_column]
        sample_values = sampled_df[target_column]
        source_mean, source_std = source_values.mean(), source_values.std()
        sample_mean, sample_std = sample_values.mean(), sample_values.std()

        # Side-by-side report, two decimals each.
        print(f"Stats for '{target_column}':")
        print(f"  Source File   → Mean: {source_mean:.2f}, Std Dev: {source_std:.2f}")
        print(f"  Sampled File  → Mean: {sample_mean:.2f}, Std Dev: {sample_std:.2f}")
        print(f"Sample saved to '{output_file}'.")
    elif not has_column:
        # The missing-column message takes priority when both checks fail.
        print(f"Column '{target_column}' not found in the input file.")
    else:
        print(f"Requested {num_rows} rows, but file only has {len(df)} rows.")

except FileNotFoundError:
    print(f"File '{input_file}' not found.")
except Exception as err:
    print(f"An error occurred: {err}")

Stats for 'Admission_Deposit':
  Source File   → Mean: 5135.85, Std Dev: 970.72
  Sampled File  → Mean: 5169.51, Std Dev: 962.73
Sample saved to 'sampled_output_updated.csv'.


**Challenge:**

*   Create a sensor dataset
*   Randomly sample 100 rows from the sensor data
*   Save the sample to a file

[Create a dataset of sensor data. Each row should have a sensor_id, a timestamp, a temperature in Celsius, relative humidity, and pressure in millibars. There should be 10 different sensors, and each sensor should have 1 row per minute, starting at 01:00 UTC on July 1, 2025 and ending at 02:00 UTC on July 1, 2025.]



In [5]:
import numpy as np

# Parameters
SEED = 42  # fixed seed so the synthetic readings are identical on every run
start_time = pd.Timestamp("2025-07-01 01:00:00", tz="UTC")
end_time = pd.Timestamp("2025-07-01 02:00:00", tz="UTC")
# date_range includes both endpoints, so each sensor gets 61 rows (01:00 ... 02:00).
timestamps = pd.date_range(start=start_time, end=end_time, freq="1min")

sensor_ids = [f"sensor_{i}" for i in range(1, 11)]

# Dedicated, seeded generator: without it the dataset (and every sample drawn
# from it later) would change on each Restart & Run All.
rng = np.random.default_rng(SEED)

# Generate rows: one record per (sensor, minute); the DataFrame is built once
# from the finished list rather than grown row by row.
data = []

for sensor_id in sensor_ids:
    for timestamp in timestamps:
        temperature = np.round(rng.normal(loc=22, scale=2), 2)       # Around 22°C
        humidity = np.round(rng.uniform(low=40, high=70), 2)         # 40%–70%
        pressure = np.round(rng.normal(loc=1013, scale=5), 2)        # Around 1013 mbar
        data.append({
            "sensor_id": sensor_id,
            "timestamp": timestamp,
            "temperature_C": temperature,
            "humidity_percent": humidity,
            "pressure_mbar": pressure
        })

# Create DataFrame
df = pd.DataFrame(data)

# Preview
print(df.head())

# Save to CSV (written relative to the current working directory)
df.to_csv("sensor_data.csv", index=False)
print("Sensor data saved to 'sensor_data.csv'")

  sensor_id                 timestamp  temperature_C  humidity_percent  \
0  sensor_1 2025-07-01 01:00:00+00:00          23.13             53.64   
1  sensor_1 2025-07-01 01:01:00+00:00          21.96             51.01   
2  sensor_1 2025-07-01 01:02:00+00:00          25.80             46.29   
3  sensor_1 2025-07-01 01:03:00+00:00          22.56             53.15   
4  sensor_1 2025-07-01 01:04:00+00:00          20.85             52.35   

   pressure_mbar  
0        1014.74  
1        1012.26  
2        1010.93  
3        1004.16  
4        1012.14  
Sensor data saved to 'sensor_data.csv'


In [6]:
# Build the ten sensor labels ("sensor_1" ... "sensor_10") and echo them as the cell output.
sensor_ids = list(map("sensor_{}".format, range(1, 11)))
sensor_ids

['sensor_1',
 'sensor_2',
 'sensor_3',
 'sensor_4',
 'sensor_5',
 'sensor_6',
 'sensor_7',
 'sensor_8',
 'sensor_9',
 'sensor_10']

In [7]:

# --- Configuration ---
# The input path is relative, matching where the sensor data was saved
# ("sensor_data.csv" in the working directory), so this cell no longer depends
# on the working directory being /content.
source_file = "sensor_data.csv"       # Input file name
sample_size = 100                     # Number of rows to sample
output_file = "sensor_sample.csv"     # Output file name

# --- Load the CSV ---
df = pd.read_csv(source_file)

if sample_size > len(df):
    # Same guard as the earlier sampling cells: report and skip when more rows
    # are requested than the file contains.
    print(f"Requested {sample_size} rows, but file only has {len(df)} rows.")
else:
    # --- Randomly sample the rows (fixed random_state => same rows every run) ---
    sample_df = df.sample(n=sample_size, random_state=32)

    # --- Save the sample to a new CSV (index=False omits the row labels) ---
    sample_df.to_csv(output_file, index=False)

    print(f"{sample_size} rows sampled from '{source_file}' and saved to '{output_file}'.")

100 rows sampled from '/content/sensor_data.csv' and saved to 'sensor_sample.csv'.
