# Infectious Disease Prediction Test

This notebook tests the download and loading of the Infectious Disease Prediction dataset from Kaggle.

In [1]:
# Install required packages
%pip install kagglehub pandas matplotlib seaborn numpy

Note: you may need to restart the kernel to use updated packages.


In [2]:
import kagglehub
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import shutil

# Download the dataset to current directory
print("Downloading Infectious Disease Prediction dataset...")
cache_path = kagglehub.dataset_download("haithemhermessi/infectious-disease-prediction")

# Copy to current folder
current_dir = os.getcwd()
local_data_dir = current_dir
os.makedirs(local_data_dir, exist_ok=True)

# Copy files from cache to local directory
for file in os.listdir(cache_path):
    src = os.path.join(cache_path, file)
    dst = os.path.join(local_data_dir, file)
    shutil.copy2(src, dst)
    print(f"Copied {file} to local data folder")

path = local_data_dir
print(f"Dataset available locally at: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Downloading Infectious Disease Prediction dataset...
Copied train.csv to local data folder
Copied test.csv to local data folder
Dataset available locally at: /home/tamim/ai4ph/data/infectious_disease_prediction


In [3]:
# Check dataset size
total_size = 0
files = os.listdir(path)
for file in files:
    file_path = os.path.join(path, file)
    size = os.path.getsize(file_path)
    total_size += size
    print(f"- {file}: {size/1024:.1f} KB")

print(f"\nTotal dataset size: {total_size/(1024*1024):.2f} MB")

- train.csv: 5414.1 KB
- README.md: 1.0 KB
- test.csv: 1347.3 KB
- dataset_download.ipynb: 9.2 KB

Total dataset size: 6.61 MB


In [4]:
# Load and analyze the dataset
csv_files = [f for f in files if f.endswith('.csv')]
if csv_files:
    df = pd.read_csv(os.path.join(path, csv_files[0]))
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"Sample data:\n{df.head()}")
else:
    print("No CSV files found")

Dataset shape: (75614, 10)
Columns: ['Unnamed: 0', 'Disease', 'County', 'Year', 'Sex', 'Count', 'Population', 'Rate', 'CI.lower', 'CI.upper']
Sample data:
   Unnamed: 0        Disease       County  Year     Sex  Count  Population  \
0       88976      Pertussis         Mono  2001  Female      0        6031   
1      127888   Tuberculosis  Santa Clara  2004  Female     85      840509   
2       62206            HIV       Merced  2003  Female      0      114558   
3       99748  Rabies, human     Siskiyou  2007    Male      0       22227   
4       37088         Dengue        Butte  2013  Female      0      111548   

     Rate  CI.lower  CI.upper  
0   0.000     0.000    61.147  
1  10.113     8.078    12.505  
2   0.000     0.000     3.220  
3   0.000     0.000    16.595  
4   0.000     0.000     3.307  


In [5]:
print("✅ Dataset download and setup completed successfully!")
print("\nDataset Summary:")
print(f"- Location: {path}")
print(f"- Size: {total_size/(1024*1024):.2f} MB")
if csv_files:
    print(f"- Records: {df.shape[0]:,}")
    print(f"- Features: {df.shape[1]}")
print("- Use case: Disease outbreak prediction, epidemiological modeling")
print("- Perfect for: Module 7 (Public Health Modeling), Module 8 (Surveillance)")

✅ Dataset download and setup completed successfully!

Dataset Summary:
- Location: /home/tamim/ai4ph/data/infectious_disease_prediction
- Size: 6.61 MB
- Records: 75,614
- Features: 10
- Use case: Disease outbreak prediction, epidemiological modeling
- Perfect for: Module 7 (Public Health Modeling), Module 8 (Surveillance)
