## **Dataset 1**: Diabetes Health Records (2015 and 2016)
- Method: Fetch the records through the online database API, then store them in MongoDB
- 2015: https://rapidapi.com/adrienpelletierlaroche/api/diabetes-file-2015
- 2016: https://rapidapi.com/adrienpelletierlaroche/api/diabetes-dataset-2016

### Data Collection using API

In [1]:
# this code block calls the online database API and fetches the required records
import requests

# data API base
url_2015 = "https://diabetes-file-2015.p.rapidapi.com/records"
url_2016 = "https://diabetes-dataset-2016.p.rapidapi.com/records"

# max no. of rows per API iteration is 1000, so each single request below
# returns at most 1000 records
querystring = {"index": "0", "orderBy": "dataListIndex_asc", "limit": "1000"}

headers_2015 = {
    "x-rapidapi-key": "insert-api-key",
    "x-rapidapi-host": "diabetes-file-2015.p.rapidapi.com",
}

headers_2016 = {
    "x-rapidapi-key": "insert-api-key",
    "x-rapidapi-host": "diabetes-dataset-2016.p.rapidapi.com",
}

# seconds to wait on each API call; without a timeout requests.get can block
# indefinitely when the server stops responding
request_timeout = 30

# this variable will store the entire dataset (both years) as a list of dicts
patient_data = []

# construct the URL and send one GET request per year
response_2015 = requests.get(
    url_2015, headers=headers_2015, params=querystring, timeout=request_timeout
)
response_2016 = requests.get(
    url_2016, headers=headers_2016, params=querystring, timeout=request_timeout
)

# only merge the two years when both requests succeeded
if response_2015.status_code == 200 and response_2016.status_code == 200:
    # fall back to an empty list when the payload has no 'data' entry (or it
    # is null) so the len() and the loops below do not fail on None
    data_2015 = response_2015.json().get('data') or []
    data_2016 = response_2016.json().get('data') or []

    total_2015 = len(data_2015)

    for record in data_2015:
        record['year'] = '2015'
        patient_data.append(record)
    for record in data_2016:
        # shift the 2016 indices past the 2015 ones so that dataListIndex
        # stays unique in the combined dataset
        record['dataListIndex'] = str(int(record.get('dataListIndex')) + total_2015)
        record['year'] = '2016'
        patient_data.append(record)
else:
    print("2015 data status:", response_2015.status_code)
    print("2016 data status:", response_2016.status_code)

### Storing Data in MongoDB

In [2]:
# importing required libraries for the MongoDB connection
from pymongo import MongoClient
import pymongo

In [3]:
# initialize the connection string to MongoDB
connection_string = "mongodb://dap:dap@localhost:27017"

In [4]:
# function to establish MongoDB connection
def MongoDB_connection(connection_string):
    """Connect to MongoDB and return a handle to the 'diabetes_data' collection.

    The collection lives in the 'dap' database of the server addressed by
    ``connection_string``.

    Args:
        connection_string: MongoDB URI passed straight to ``MongoClient``.

    Returns:
        The ``diabetes_data`` collection object, or ``None`` when the
        connection or the authentication fails (the reason is printed).
    """
    collection_name = "diabetes_data"
    try:
        client = MongoClient(connection_string)
        # MongoClient connects lazily: building it never fails for an
        # unreachable server or bad credentials. Force one round trip here so
        # the success message is truthful and the handlers below can fire.
        client.admin.command('ping')
        db = client['dap']
        print("connected to MongoDB successfully")
        # this only obtains a handle; MongoDB creates the collection itself
        # when the first document is inserted
        collection = db[collection_name]
        print("diabetes_data collection created")
        return collection
    except pymongo.errors.OperationFailure as e:
        print("Authentication failed:", e)
    except pymongo.errors.ConnectionFailure as e:
        print("Connection failed:", e)
    # make the failure result explicit for callers
    return None

patient_collection = MongoDB_connection(connection_string)

connected to MongoDB successfully
diabetes_data collection created


In [5]:
# function to insert data into MongoDB collection
def insert_patient_data(collection, data_all):
    """Insert every record of ``data_all`` into ``collection`` in one call.

    Args:
        collection: Object exposing ``insert_many`` (the MongoDB collection),
            or ``None`` when no connection could be established.
        data_all: List of documents (dicts) to insert.

    Returns:
        Whatever ``collection.insert_many`` returns on success, otherwise
        ``None`` (no collection, nothing to insert, or the insert failed).
        Failures are reported with ``print`` instead of being raised.
    """
    # compare with None explicitly rather than relying on truthiness, which
    # collection objects may not support
    if collection is None:
        print("Error inserting data: no MongoDB collection available")
        return None

    # nothing was fetched: skip the call instead of sending an empty batch
    if not data_all:
        print("No data to insert into 'diabetes_data' collection.")
        return None

    try:
        # insert all items in data_all at once
        result = collection.insert_many(data_all)
    except Exception as e:
        # best-effort by design: report the problem and keep the notebook running
        print("Error inserting data:", e)
        return None

    print("JSON data successfully imported into 'diabetes_data' collection.")
    return result

# call the insert function with the collection handle and the fetched records;
# d_data receives whatever insert_patient_data returns
d_data = insert_patient_data(patient_collection, patient_data)

JSON data successfully imported into 'diabetes_data' collection.


### Data Definition
'dataListIndex'  # Unique identifier (2016 values are offset by the number of 2015 records so they stay unique in the combined dataset)

'cT'             # Computed tomography (medical imaging)

'bpad'           # Diastolic blood pressure (the lower number in a blood pressure reading)

'bpan'           # Systolic blood pressure (the upper number in a blood pressure reading)

'bpaN_bis'       # Normalized or bisected version of a blood pressure measure

'bwad'           # Blood work analysis data

'bwan'           # Variation of blood work analysis normalized

'bwaN_bis'       # Normalized or adjusted version of the blood work

'bmad'           # Body mass assessment data, such as body mass index (BMI) or another measure of body weight relative to height.

'bman'           # A variation of body mass data, possibly representing a different measure or adjusted value compared to bmad.

'bmaN_bis'       # Likely a normalized or adjusted version of body mass data after a processing step, like bisecting or averaging.

'year'           # Represents the year when the data was collected