# Download and unzip the data 

We use the dataset hosted on [Mendeley data](https://data.mendeley.com/datasets/rscbjbr9sj/2). We download it to Google Drive (gdrive) and unzip it there for our use. 


After this notebook has been run once, any other notebook that needs to verify the file counts can run the following code after mounting gdrive:

```
%%bash
cd gdrive/MyDrive/fourthbrain-capstone 
PYTHONPATH=. pytest tests/test_file_count.py
```




In [None]:
# Standard library
import os
import tarfile
import zipfile

# Third-party
import pandas as pd
from google.colab import drive

# Mount Google Drive at /content/gdrive; the paths used below live under it.
drive.mount('/content/gdrive')


Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
def unzip_file(extract_path, zip_path):
  """Extract the archive at ``zip_path`` into ``extract_path``.

  The archive format is chosen from the file-name suffix (case-insensitive):
  ``.zip`` is handled with ``zipfile``; ``.tar.gz`` / ``.tgz`` with ``tarfile``.

  Args:
    extract_path: Directory the archive members are extracted into.
    zip_path: Path of the archive file.

  Raises:
    ValueError: If ``zip_path`` does not end in a supported suffix.
  """
  # Look at the suffix only: a substring test would treat a tarball stored
  # under e.g. 'backup.zip.d/data.tar.gz' as a zip file.
  archive_name = str(zip_path).lower()
  if archive_name.endswith('.zip'):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
      zip_ref.extractall(extract_path)
  elif archive_name.endswith(('.tar.gz', '.tgz')):
    # The context manager closes the archive even if extraction fails.
    # NOTE(review): extractall trusts member paths in the archive; only use
    # this on archives from a trusted source.
    with tarfile.open(zip_path) as tar_ref:
      tar_ref.extractall(extract_path)
  else:
    # Fail loudly instead of silently extracting nothing.
    raise ValueError('unsupported archive type: {}'.format(zip_path))

def download_data_in_gdrive(path, download_link,zip_path):
  # This function will check for path in gdrive and creates it if it doesn't exist
  # It will then download the file at download_link with the name zip_path
  dir_exists = os.path.isdir(path)
  if dir_exists:
    print('dir exists; quitting...')
    print(dir_exists)
    pass
  else:
    os.mkdir(path)
    print('downloading the dataset...\n')
    !wget -cP $path  $download_link -O $zip_path

    print('unzipping...\n')
    unzip_file(path,zip_path)


## OCT data

In [None]:
# Absolute path under the Drive mount point ('/content/gdrive'), so this cell
# does not depend on the notebook's current working directory.
oct_path = "/content/gdrive/MyDrive/fourthbrain-capstone/data/oct/"
# Download URL for the OCT archive on data.mendeley.com.
oct_link = 'https://data.mendeley.com/public-files/datasets/rscbjbr9sj/files/5699a1d8-d1b6-45db-bb92-b61051445347/file_downloaded'
oct_zip_file = 'OCT2017.tar.gz'
oct_zip_path = os.path.join(oct_path, oct_zip_file)

# Creates oct_path, downloads the archive and extracts it; skipped if oct_path already exists.
download_data_in_gdrive(oct_path, oct_link, oct_zip_path)

### Check file counts

In [None]:
def test_file_count():
  BASE_DATA_PATH = '/content/gdrive/MyDrive/fourthbrain-capstone/data/oct/OCT2017/'
  TRAIN_BASE_PATH = os.path.join(BASE_DATA_PATH, 'train/')
  TEST_BASE_PATH = os.path.join(BASE_DATA_PATH, 'test/')

  file_count_dict = {'train/CNV':37205, 'train/DME': 11348, 'train/DRUSEN':8616, 'train/NORMAL': 26315, 
                     'test/CNV': 250, 'test/DME': 250, 'test/DRUSEN': 250, 'test/NORMAL': 250}


  parents = ['train', 'test']
  children = ['CNV', 'DME', 'DRUSEN', 'NORMAL']

  # create a base dataframe from the directory items

  for p in parents:
      for c in children:
          files = os.listdir(os.path.join(BASE_DATA_PATH, p, c))
          instance_type = [ p for i in range(len(files)) ]
          path = [ f'/OCT2017/{p}/{c}' for i in range(len(files))]
          assert file_count_dict[p+'/'+c]==len(files), '{} should have {} files but it has {} files'.format(p+'/'+c, file_count_dict[p+'/'+c], len(files))
  print('file counts are correct!')
  
# Run the check right away; an AssertionError here means a folder's file count is off.
test_file_count()

file counts are correct!


## Chest data (run later, when needed)

In [None]:
# Absolute path under the Drive mount point ('/content/gdrive'), so this cell
# does not depend on the notebook's current working directory.
chest_path = "/content/gdrive/MyDrive/fourthbrain-capstone/data/chest/"
# Download URL for the chest X-ray archive on data.mendeley.com.
chest_link = 'https://data.mendeley.com/public-files/datasets/rscbjbr9sj/files/f12eaf6d-6023-432f-acc9-80c9d7393433/file_downloaded'
chest_zip_file = 'ChestXRay2017.zip'
chest_zip_path = os.path.join(chest_path, chest_zip_file)

# Creates chest_path, downloads the archive and extracts it; skipped if chest_path already exists.
download_data_in_gdrive(chest_path, chest_link, chest_zip_path)

downloading the dataset...

--2022-06-21 00:41:28--  https://data.mendeley.com/public-files/datasets/rscbjbr9sj/files/f12eaf6d-6023-432f-acc9-80c9d7393433/file_downloaded
Resolving data.mendeley.com (data.mendeley.com)... 162.159.133.86, 162.159.130.86
Connecting to data.mendeley.com (data.mendeley.com)|162.159.133.86|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com/31ab5ede-ed34-46d4-b1bf-c63d70411497 [following]
--2022-06-21 00:41:29--  https://md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com/31ab5ede-ed34-46d4-b1bf-c63d70411497
Resolving md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com (md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com)... 52.218.46.154
Connecting to md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com (md-datasets-public-files-prod.s3.eu-west-1.amazonaws.com)|52.218.46.154|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: