<a href="https://colab.research.google.com/github/kebscharry/CharityMomanyi/blob/main/Datawrangling2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

# Sample health data
data = {
    'Patient ID': [1, 2, 3, 4],
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Gender': ['Female', 'Male', 'Other', 'Unknown'],  # "Unknown" is invalid
    'Blood Type': ['A+', 'B-', 'O+', 'AB-'],  # All valid
    'Diagnosis Code': ['A00', 'B99', 'C30', 'XYZ']  # "XYZ" is invalid
}

df = pd.DataFrame(data)
print(df)

   Patient ID     Name   Gender Blood Type Diagnosis Code
0           1    Alice   Female         A+            A00
1           2      Bob     Male         B-            B99
2           3  Charlie    Other         O+            C30
3           4    David  Unknown        AB-            XYZ


In [2]:
# Define valid values for each field
valid_genders = {'Male', 'Female', 'Other'}
valid_blood_types = {'A+', 'A-', 'B+', 'B-', 'O+', 'O-', 'AB+', 'AB-'}
valid_diagnosis_codes = {'A00', 'B99', 'C30'}  # Example ICD-10 codes

In [3]:
# Custom function to check membership constraints
def check_membership(value, valid_set):
    if value not in valid_set:
        return False
    return True

# Function to apply constraints and find invalid rows
def validate_health_data(df):
    invalid_rows = []
    for index, row in df.iterrows():
        errors = {}
        if not check_membership(row['Gender'], valid_genders):
            errors['Gender'] = row['Gender']
        if not check_membership(row['Blood Type'], valid_blood_types):
            errors['Blood Type'] = row['Blood Type']
        if not check_membership(row['Diagnosis Code'], valid_diagnosis_codes):
            errors['Diagnosis Code'] = row['Diagnosis Code']

        if errors:
            invalid_rows.append({'index': index, 'errors': errors})

    return invalid_rows

# Validate the DataFrame
invalid_entries = validate_health_data(df)

# Print invalid entries
for entry in invalid_entries:
    print(f"Row {entry['index']} has invalid data: {entry['errors']}")

# Output:
# Row 3 has invalid data: {'Gender': 'Unknown', 'Diagnosis Code': 'XYZ'}


Row 3 has invalid data: {'Gender': 'Unknown', 'Diagnosis Code': 'XYZ'}


In [4]:
# Option 1: Remove invalid rows
df_cleaned = df.drop(index=[entry['index'] for entry in invalid_entries])

# Option 2: Replace invalid values with defaults
for entry in invalid_entries:
    for field in entry['errors']:
        if field == 'Gender':
            df.at[entry['index'], field] = 'Other'  # Default gender
        elif field == 'Diagnosis Code':
            df.at[entry['index'], field] = 'A00'  # Default diagnosis code

print("\nCleaned DataFrame:")
print(df_cleaned)



Cleaned DataFrame:
   Patient ID     Name  Gender Blood Type Diagnosis Code
0           1    Alice  Female         A+            A00
1           2      Bob    Male         B-            B99
2           3  Charlie   Other         O+            C30


In [5]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Sample health data
data = {
    'Age': [25, 30, 35, 40, 45],
    'Weight (kg)': [70, 80, 85, 90, 95],
    'Height (cm)': [175, 180, 170, 160, 165],
    'Blood Pressure (systolic)': [120, 130, 140, 150, 160]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Initialize the Min-Max Scaler
scaler = MinMaxScaler()

# Fit and transform the data
df_normalized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

print("Original Data:")
print(df)
print("\nNormalized Data:")
print(df_normalized)

Original Data:
   Age  Weight (kg)  Height (cm)  Blood Pressure (systolic)
0   25           70          175                        120
1   30           80          180                        130
2   35           85          170                        140
3   40           90          160                        150
4   45           95          165                        160

Normalized Data:
    Age  Weight (kg)  Height (cm)  Blood Pressure (systolic)
0  0.00          0.0         0.75                       0.00
1  0.25          0.4         1.00                       0.25
2  0.50          0.6         0.50                       0.50
3  0.75          0.8         0.00                       0.75
4  1.00          1.0         0.25                       1.00


Filter method variable selection

In [6]:
import pandas as pd
import numpy as np

# Sample data
data = {
    'Feature1': [1, 2, 3, 4, 5],
    'Feature2': [2, 3, 4, 5, 6],
    'Feature3': [5, 4, 3, 2, 1],
    'Target': [1, 0, 1, 0, 1]
}
df = pd.DataFrame(data)

# Compute correlation matrix
correlation_matrix = df.corr()
print("Correlation Matrix:")
print(correlation_matrix)

# Select features with high correlation with the target
target_corr = correlation_matrix['Target'].abs().sort_values(ascending=False)
print("\nFeatures by correlation with Target:")
print(target_corr)


Correlation Matrix:
          Feature1  Feature2  Feature3  Target
Feature1       1.0       1.0      -1.0     0.0
Feature2       1.0       1.0      -1.0     0.0
Feature3      -1.0      -1.0       1.0     0.0
Target         0.0       0.0       0.0     1.0

Features by correlation with Target:
Target      1.0
Feature1    0.0
Feature2    0.0
Feature3    0.0
Name: Target, dtype: float64
