<a href="https://colab.research.google.com/github/lamyse1/Data-Engineering-Projects/blob/main/week%203/DE_Week3_Exercise_1_Lamyse_Ammar.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Extract DATA**

In [40]:
import os

import pandas as pd

# Extract step: load the patient table straight from the raw CSV hosted on GitHub.
patients_df = pd.read_csv(
    'https://raw.githubusercontent.com/lamyse1/Data-Engineering-Projects/main/week%203/patients.csv'
)
# Header line followed by the frame's text representation on the next line.
print("Extracted Patient Data:", patients_df, sep="\n")


Extracted Patient Data:
    patient_id             name  age  gender
0         P001      James Smith   45    Male
1         P002     Mary Johnson   32  Female
2         P003  Robert Williams   56    Male
3         P004   Patricia Brown   29  Female
4         P005       John Jones   67    Male
..         ...              ...  ...     ...
195       P196     Emily Brooks   41  Female
196       P197      Jack Fisher   29    Male
197       P198       Judith Lee   50  Female
198       P199       Sean Kelly   38    Male
199       P200  Rebecca Sanders   57  Female

[200 rows x 4 columns]


In [41]:
# Stand-in for an API response: one dict per diagnostic record.
# Field names are paired with each row of values, in the order the keys must appear.
diagnostic_data = [
    dict(zip(("diagnosticid", "patientid", "test", "result"), row))
    for row in (
        ("D001", "P001", "Blood Test", "Normal"),
        ("D002", "P002", "X-Ray", "Fracture"),
        ("D003", "P003", "MRI", "Normal"),
    )
]
print("Extracted Diagnostic Data:", diagnostic_data, sep="\n")


Extracted Diagnostic Data:
[{'diagnosticid': 'D001', 'patientid': 'P001', 'test': 'Blood Test', 'result': 'Normal'}, {'diagnosticid': 'D002', 'patientid': 'P002', 'test': 'X-Ray', 'result': 'Fracture'}, {'diagnosticid': 'D003', 'patientid': 'P003', 'test': 'MRI', 'result': 'Normal'}]


# **2. Transform Data**
## Clean patient data

## Enrich diagnostic data with patient info

In [42]:
# Study cohort: keep only patients aged 40 or older (rows with age below 40 are dropped).
# The original row index is preserved, so gaps in the index mark the removed patients.
filtered_patients_df = patients_df.loc[patients_df['age'] >= 40]
print("Filtered Patient Data (40 years and older):", filtered_patients_df, sep="\n")

Filtered Patient Data (40 years and older):
    patient_id               name  age  gender
0         P001        James Smith   45    Male
2         P003    Robert Williams   56    Male
4         P005         John Jones   67    Male
5         P006       Linda Garcia   40  Female
7         P008      Barbara Davis   55  Female
..         ...                ...  ...     ...
193       P194  Dorothy Patterson   48  Female
194       P195      Benjamin Ward   55    Male
195       P196       Emily Brooks   41  Female
197       P198         Judith Lee   50  Female
199       P200    Rebecca Sanders   57  Female

[127 rows x 4 columns]


In [43]:
# Build the enriched diagnostic table in one pass:
#   1. turn the list of record dicts into a DataFrame;
#   2. rename the raw API field names to the snake_case names used by the patient
#      table. An explicit mapping is used instead of assigning `columns` positionally,
#      so a change in key order or an extra key cannot silently mislabel a column;
#   3. left-join the patient details on 'patient_id', so every diagnostic row is kept
#      even if no matching patient exists.
# The join uses the unfiltered `patients_df`, so diagnostics for patients of any age
# receive their name/age/gender.
diagnostic_datadf = (
    pd.DataFrame(diagnostic_data)
    .rename(columns={'diagnosticid': 'diagnostic_id', 'patientid': 'patient_id'})
    .merge(patients_df[['patient_id', 'name', 'age', 'gender']], on='patient_id', how='left')
)
print("Enriched Diagnostic Data:")
print(diagnostic_datadf)



Enriched Diagnostic Data:
  diagnostic_id patient_id        test    result             name  age  gender
0          D001       P001  Blood Test    Normal      James Smith   45    Male
1          D002       P002       X-Ray  Fracture     Mary Johnson   32  Female
2          D003       P003         MRI    Normal  Robert Williams   56    Male


# **3. Load Data into MongoDB**
## • Connect to MongoDB

## • Load Patient Data into MongoDB

## • Load Diagnostic Data into MongoDB


In [44]:
!pip install pymongo



In [45]:
from pymongo import MongoClient

# Connect to MongoDB.
# The connection string embeds the username and password, so it is read from the
# MONGODB_URI environment variable instead of being written into the notebook.
# Any credential that has ever been committed in a notebook should be rotated.
mongodb_uri = os.environ.get("MONGODB_URI")
if not mongodb_uri:
    raise RuntimeError(
        "Set the MONGODB_URI environment variable to your MongoDB connection string."
    )

client = MongoClient(mongodb_uri)

# Connect to a database
db = client['healthcare_db']


In [46]:
# Handle to the 'patient_data' collection (MongoDB creates it on first write).
patient_collection = db.patient_data

# One dict per row of the filtered patient frame, inserted in a single batch.
patient_data = filtered_patients_df.to_dict('records')
patient_collection.insert_many(patient_data)
print("Loaded Patient Data into MongoDB")


Loaded Patient Data into MongoDB


In [47]:
# Create or access a collection for diagnostic data
diagnostic_collection = db['diagnostic_data']

# Convert the enriched diagnostic DataFrame to a list of record dicts and insert it.
# The records get their own name so the raw `diagnostic_data` list from the extract
# step is not overwritten: the cell that builds `diagnostic_datadf` reads that list
# and assumes it has the four raw fields, so clobbering it breaks a re-run of that cell.
diagnostic_records = diagnostic_datadf.to_dict(orient='records')
diagnostic_collection.insert_many(diagnostic_records)
print("Loaded Diagnostic Data into MongoDB")

Loaded Diagnostic Data into MongoDB


# **4. Automate the ETL Process**

In [48]:
def extract_data(patients_source=None):
    """Extract the patient table and the (simulated) diagnostic records.

    Args:
        patients_source: Anything ``pandas.read_csv`` accepts (URL, path or
            file-like object). Defaults to the patients CSV hosted on GitHub,
            which is what a call with no arguments reads.

    Returns:
        tuple: ``(patients_df, diagnostics_df)`` where ``patients_df`` is the CSV
        content and ``diagnostics_df`` has the columns ``diagnostic_id``,
        ``patient_id``, ``test`` and ``result``.
    """
    if patients_source is None:
        patients_source = 'https://raw.githubusercontent.com/lamyse1/Data-Engineering-Projects/main/week%203/patients.csv'

    # Extract patient data from CSV
    patients_df = pd.read_csv(patients_source)

    # Simulated API response for diagnostic data
    diagnostic_data = [
        {"diagnostic_id": "D001", "patient_id": "P001", "test": "Blood Test", "result": "Normal"},
        {"diagnostic_id": "D002", "patient_id": "P002", "test": "X-Ray", "result": "Fracture"},
        {"diagnostic_id": "D003", "patient_id": "P003", "test": "MRI", "result": "Normal"}
    ]
    diagnostics_df = pd.DataFrame(diagnostic_data)
    return patients_df, diagnostics_df

def transform_data(patients_df, diagnostics_df, min_age=40):
    """Filter the patient cohort and enrich diagnostics with patient details.

    Args:
        patients_df: Patient table with at least the columns ``patient_id``,
            ``name``, ``age`` and ``gender``.
        diagnostics_df: Diagnostic table with a ``patient_id`` column.
        min_age: Inclusive lower age bound for the cohort. The default of 40
            keeps patients who are 40 or older.

    Returns:
        tuple: ``(filtered_patients_df, enriched_diagnostics_df)``. The filtered
        frame keeps the original row index. The enriched frame is a left join,
        so every diagnostic row is retained.
    """
    # Keep patients aged `min_age` or older (the bound itself is included)
    filtered_patients_df = patients_df[patients_df['age'] >= min_age]

    # Enrich from the *unfiltered* patient table so diagnostics for patients
    # outside the cohort still receive their name/age/gender.
    enriched_diagnostics_df = pd.merge(
        diagnostics_df,
        patients_df[['patient_id', 'name', 'age', 'gender']],
        on='patient_id',
        how='left',
    )
    return filtered_patients_df, enriched_diagnostics_df

def load_data_to_mongodb(filtered_patients_df, enriched_diagnostics_df):
    """Load the transformed frames into the ``healthcare_db`` MongoDB database.

    Patients go to the ``patients`` collection keyed by ``patient_id``;
    diagnostics go to the ``diagnostics`` collection keyed by ``diagnostic_id``.
    Each record is written as an upsert (replace if the key exists, insert
    otherwise), so calling this function repeatedly with the same data does not
    create duplicate documents.

    Args:
        filtered_patients_df: Frame with a ``patient_id`` column.
        enriched_diagnostics_df: Frame with a ``diagnostic_id`` column.

    Raises:
        RuntimeError: If the MONGODB_URI environment variable is not set.
    """
    # Local import: only this function needs the bulk-operation class.
    from pymongo import ReplaceOne

    # The connection string carries the credentials, so it comes from the
    # environment rather than from the notebook source.
    mongo_uri = os.environ.get("MONGODB_URI")
    if not mongo_uri:
        raise RuntimeError(
            "Set the MONGODB_URI environment variable to your MongoDB connection string."
        )

    # (collection name, key field used for the upsert filter, source frame)
    targets = (
        ('patients', 'patient_id', filtered_patients_df),
        ('diagnostics', 'diagnostic_id', enriched_diagnostics_df),
    )

    # Using the client as a context manager closes its connections on exit.
    with MongoClient(mongo_uri) as client:
        db = client['healthcare_db']
        for collection_name, key_field, frame in targets:
            operations = [
                ReplaceOne({key_field: record[key_field]}, record, upsert=True)
                for record in frame.to_dict('records')
            ]
            # bulk_write rejects an empty request list, so skip empty frames.
            if operations:
                db[collection_name].bulk_write(operations, ordered=False)

def run_etl_pipeline():
    """Run extract, transform and load back to back, then report completion."""
    # Each stage returns a 2-tuple that is unpacked as the next stage's arguments.
    load_data_to_mongodb(*transform_data(*extract_data()))
    print("ETL Pipeline executed successfully.")

# Run the ETL pipeline once: extract, transform, then load into MongoDB.
# This call reads the patient CSV over the network and writes to the database.
run_etl_pipeline()


ETL Pipeline executed successfully.


# Data Extraction Evidence:

In [49]:
# Re-run the extract step on its own and preview the first five rows of each frame.
patients_df, diagnostics_df = extract_data()
print("Extracted Patient Data:\n", patients_df.head(n=5))
print("Extracted Diagnostic Data:\n", diagnostics_df.head(n=5))


Extracted Patient Data:
   patient_id             name  age  gender
0       P001      James Smith   45    Male
1       P002     Mary Johnson   32  Female
2       P003  Robert Williams   56    Male
3       P004   Patricia Brown   29  Female
4       P005       John Jones   67    Male
Extracted Diagnostic Data:
   diagnostic_id patient_id        test    result
0          D001       P001  Blood Test    Normal
1          D002       P002       X-Ray  Fracture
2          D003       P003         MRI    Normal


# Data Transformation Evidence:

In [50]:
# Apply the transform step to the frames extracted above and preview both results.
filtered_patients_df, enriched_diagnostics_df = transform_data(patients_df, diagnostics_df)
print("Filtered Patient Data (40 years and older):\n", filtered_patients_df.head(n=5))
print("Enriched Diagnostic Data:\n", enriched_diagnostics_df.head(n=5))


Filtered Patient Data (40 years and older):
   patient_id             name  age  gender
0       P001      James Smith   45    Male
2       P003  Robert Williams   56    Male
4       P005       John Jones   67    Male
5       P006     Linda Garcia   40  Female
7       P008    Barbara Davis   55  Female
Enriched Diagnostic Data:
   diagnostic_id patient_id        test    result             name  age  gender
0          D001       P001  Blood Test    Normal      James Smith   45    Male
1          D002       P002       X-Ray  Fracture     Mary Johnson   32  Female
2          D003       P003         MRI    Normal  Robert Williams   56    Male


# Data Loading Evidence:

In [51]:
# Load the transformed frames into MongoDB.
# NOTE(review): run_etl_pipeline() was already executed earlier in this notebook and
# performs the same load; if load_data_to_mongodb appends rather than upserts, this
# call stores a second copy of every document.
load_data_to_mongodb(filtered_patients_df, enriched_diagnostics_df)


# Retrieve and Display data from MongoDB collections after the data has been loaded

In [52]:
from pymongo import MongoClient
import pandas as pd

def verify_data_in_mongodb():
    """Print up to five documents from the ``patients`` and ``diagnostics`` collections.

    Connects to the ``healthcare_db`` database using the connection string in the
    MONGODB_URI environment variable and prints each fetched document as a dict.

    Raises:
        RuntimeError: If the MONGODB_URI environment variable is not set.
    """
    # The connection string carries the credentials, so it comes from the
    # environment rather than from the notebook source.
    mongo_uri = os.environ.get("MONGODB_URI")
    if not mongo_uri:
        raise RuntimeError(
            "Set the MONGODB_URI environment variable to your MongoDB connection string."
        )

    # Using the client as a context manager closes its connections on exit;
    # the cursors are therefore consumed inside the block.
    with MongoClient(mongo_uri) as client:
        db = client['healthcare_db']

        # Fetch and display data from the 'patients' collection
        print("Verifying Patient Data in MongoDB:")
        for patient in db.patients.find().limit(5):
            print(patient)

        # Fetch and display data from the 'diagnostics' collection
        print("\nVerifying Diagnostic Data in MongoDB:")
        for diagnostic in db.diagnostics.find().limit(5):
            print(diagnostic)

# Print a sample of what is stored in MongoDB (at most five documents per collection)
verify_data_in_mongodb()


Verifying Patient Data in MongoDB:
{'_id': ObjectId('67a754e47563068ab0a6e86b'), 'patient_id': 'P001', 'name': 'James Smith', 'age': 45, 'gender': 'Male'}
{'_id': ObjectId('67a754e47563068ab0a6e86c'), 'patient_id': 'P003', 'name': 'Robert Williams', 'age': 56, 'gender': 'Male'}
{'_id': ObjectId('67a754e47563068ab0a6e86d'), 'patient_id': 'P005', 'name': 'John Jones', 'age': 67, 'gender': 'Male'}
{'_id': ObjectId('67a754e47563068ab0a6e86e'), 'patient_id': 'P006', 'name': 'Linda Garcia', 'age': 40, 'gender': 'Female'}
{'_id': ObjectId('67a754e47563068ab0a6e86f'), 'patient_id': 'P008', 'name': 'Barbara Davis', 'age': 55, 'gender': 'Female'}

Verifying Diagnostic Data in MongoDB:
{'_id': ObjectId('67a754e77563068ab0a6e8ea'), 'diagnostic_id': 'D001', 'patient_id': 'P001', 'test': 'Blood Test', 'result': 'Normal', 'name': 'James Smith', 'age': 45, 'gender': 'Male'}
{'_id': ObjectId('67a754e77563068ab0a6e8eb'), 'diagnostic_id': 'D002', 'patient_id': 'P002', 'test': 'X-Ray', 'result': 'Fracture