Step 1: Initial Dataset Download

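Objective: Download and extract a version of the PlantVillage dataset from Kaggle
(`vipoooool/new-plant-diseases-dataset`). The original PlantVillage collection
contains over 54,000 images of healthy and diseased plant leaves; the Kaggle
version downloaded here serves as our foundational data for training the initial model.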

In [1]:
import os
import zipfile
from kaggle.api.kaggle_api_extended import KaggleApi

In [2]:
# --- Configuration ---
# Kaggle identifier of the dataset to fetch, in "<owner>/<dataset-slug>" form.
# This is a published version of the PlantVillage dataset.
DATASET_NAME = 'vipoooool/new-plant-diseases-dataset'
# Root folder for project data; the relative path is resolved against the
# working directory the notebook runs in.
DATA_DIR = '../data/'
# Folder that receives the downloaded archive and its extracted contents.
DOWNLOAD_PATH = os.path.join(DATA_DIR, 'plantvillage_raw')

# --- Main Execution ---

In [3]:
# 1. Create the Kaggle API client and log in.
# NOTE(review): credentials are presumably read from the local Kaggle
# configuration (e.g. a kaggle.json file or environment variables) — confirm.
api = KaggleApi()
api.authenticate()
# Reaching this line means authenticate() did not raise.
print("Kaggle API authenticated successfully.")

Kaggle API authenticated successfully.


In [4]:
# 2. Make sure the download folder is present.
# exist_ok=True turns this into a no-op when the folder already exists,
# so the cell can be re-run without raising.
os.makedirs(name=DOWNLOAD_PATH, exist_ok=True)
print(f"Directory '{DOWNLOAD_PATH}' created or already exists.")

Directory '../data/plantvillage_raw' created or already exists.


In [5]:
# 3. Fetch the dataset archive from Kaggle into DOWNLOAD_PATH
print(f"Downloading dataset '{DATASET_NAME}'...")
api.dataset_download_files(
    DATASET_NAME,
    path=DOWNLOAD_PATH,
    quiet=False,  # keep the progress bar visible during the multi-GB transfer
)
print("Download complete.")

Downloading dataset 'vipoooool/new-plant-diseases-dataset'...
Dataset URL: https://www.kaggle.com/datasets/vipoooool/new-plant-diseases-dataset
Downloading new-plant-diseases-dataset.zip to ../data/plantvillage_raw


100%|██████████| 2.70G/2.70G [00:05<00:00, 565MB/s]



Download complete.


In [3]:
# 4. Unzip the downloaded file
# First locate the archive: it is expected to be called "<dataset-slug>.zip",
# where the slug is the part of DATASET_NAME after the owner prefix.
zip_file_name = f"{DATASET_NAME.split('/')[1]}.zip"
zip_file_path = os.path.join(DOWNLOAD_PATH, zip_file_name)

In [4]:
print(f"Unzipping '{zip_file_path}'...")
# Open the archive read-only and unpack every member next to it;
# the context manager closes the file handle even if extraction fails.
with zipfile.ZipFile(zip_file_path, mode='r') as archive:
    archive.extractall(path=DOWNLOAD_PATH)
print("Unzipping complete.")

Unzipping '../data/plantvillage_raw\new-plant-diseases-dataset.zip'...
Unzipping complete.


In [5]:
# 5. Delete the archive now that its contents have been extracted,
# reclaiming the disk space it occupied.
os.unlink(zip_file_path)
print(f"Removed zip file: '{zip_file_path}'.")

print("\nStep 1 finished. The PlantVillage dataset is ready.")

Removed zip file: '../data/plantvillage_raw\new-plant-diseases-dataset.zip'.

Step 1 finished. The PlantVillage dataset is ready.
