## Detect Schema Mismatches in Data Pipelines
**Objective**: Identify and resolve schema mismatches that commonly occur in data pipelines.

**Task**: Missing Column

1. Load the source DataFrame with the following schema:
    - id : Integer
    - email : String
    - signup_date : Date
2. Load the target DataFrame with the following schema:
    - id : Integer
    - email : String
3. Implement a check to identify any columns that are present in the source DataFrame but missing in the target.
4. Add the missing `signup_date` column to the target DataFrame.

In [2]:
import pandas as pd
import unittest

def detect_missing_columns(source: pd.DataFrame, target: pd.DataFrame):
    """Return the set of column labels found in ``source`` but not in ``target``.

    Both arguments are validated before the comparison, in this order:
    ``None`` check, type check, then emptiness (source first, target second).

    Args:
        source: DataFrame whose columns define the expected schema.
        target: DataFrame that is checked against ``source``.

    Returns:
        A ``set`` of the column labels present only in ``source``; an empty
        set when ``target`` already has every source column.

    Raises:
        ValueError: if either argument is ``None`` or reports ``.empty``.
        TypeError: if either argument is not a ``pandas.DataFrame``.
    """
    frames = (source, target)

    # None is rejected first so it surfaces as ValueError rather than TypeError.
    if any(frame is None for frame in frames):
        raise ValueError("Source and target DataFrames must not be None")
    if not all(isinstance(frame, pd.DataFrame) for frame in frames):
        raise TypeError("Both source and target must be pandas DataFrame objects")

    # NOTE: DataFrame.empty is also True for a frame that has columns but no
    # rows, so such frames are rejected here as well.
    emptiness_checks = (
        (source, "Source DataFrame is empty"),
        (target, "Target DataFrame is empty"),
    )
    for frame, message in emptiness_checks:
        if frame.empty:
            raise ValueError(message)

    return set(source.columns).difference(target.columns)

def add_missing_columns_from_source(source: pd.DataFrame, target: pd.DataFrame, missing_cols: set):
    """Copy the columns named in ``missing_cols`` from ``source`` into ``target``.

    Columns are appended in the order they appear in ``source`` so the
    resulting layout is reproducible; iterating the set directly would add
    them in an arbitrary order.

    ``target`` is modified in place and is also returned. Pass a copy if the
    caller's frame must stay untouched.

    Note:
        Values are assigned with ``target[col] = source[col]``, so pandas
        aligns them on index labels. Rows of ``target`` whose index label is
        absent from ``source`` receive NaN/NaT.

    Args:
        source: DataFrame that provides the column values.
        target: DataFrame that receives the columns (mutated in place).
        missing_cols: Labels of the source columns to copy over.

    Returns:
        The same ``target`` object, now containing the requested columns.

    Raises:
        KeyError: if any requested label is not a column of ``source``. The
            check runs before any assignment, so ``target`` is left unchanged.
    """
    requested = set(missing_cols)

    # Validate everything up front so a bad label cannot leave target
    # half-updated.
    unknown = requested.difference(source.columns)
    if unknown:
        raise KeyError(
            f"Columns not found in source DataFrame: {sorted(unknown, key=str)}"
        )

    # dict.fromkeys de-duplicates labels while keeping the source column order.
    for col in dict.fromkeys(source.columns):
        if col in requested:
            target[col] = source[col]
    return target

# Example usage: build a source frame with three columns and a target frame
# that lacks `signup_date`, then detect and back-fill the gap.
source_df = pd.DataFrame(dict(
    id=[1, 2, 3],
    email=['a@example.com', 'b@example.com', 'c@example.com'],
    signup_date=pd.to_datetime(['2023-01-01', '2023-01-02', '2023-01-03']),
))

target_df = pd.DataFrame(dict(
    id=[1, 2, 3],
    email=['a@example.com', 'b@example.com', 'c@example.com'],
))

try:
    missing = detect_missing_columns(source_df, target_df)
    # Only touch the target when the check actually reported a gap.
    target_df = (
        add_missing_columns_from_source(source_df, target_df, missing)
        if missing
        else target_df
    )
    print("Final target DataFrame:\n", target_df)
except Exception as exc:
    # Demo-level boundary: report any failure instead of aborting the cell.
    print(f"Error: {exc}")


# Unit test class
class TestSchemaMismatch(unittest.TestCase):
    """Tests for detect_missing_columns and add_missing_columns_from_source."""

    def setUp(self):
        # Fresh frames for every test: the source carries `signup_date`,
        # the target does not.
        shared = {
            'id': [1, 2],
            'email': ['a@example.com', 'b@example.com'],
        }
        self.source = pd.DataFrame({
            **shared,
            'signup_date': pd.to_datetime(['2023-01-01', '2023-01-02']),
        })
        self.target = pd.DataFrame(shared)

    def test_missing_columns_detection(self):
        # The only column the target lacks is `signup_date`.
        self.assertSetEqual(
            detect_missing_columns(self.source, self.target),
            {'signup_date'},
        )

    def test_no_missing_columns(self):
        # A target with the same schema as the source has nothing missing.
        same_schema = self.source.copy()
        self.assertSetEqual(
            detect_missing_columns(self.source, same_schema),
            set(),
        )

    def test_add_missing_columns(self):
        # Work on a copy so the fixture target is not modified by the call.
        gaps = detect_missing_columns(self.source, self.target)
        filled = add_missing_columns_from_source(
            self.source, self.target.copy(), gaps
        )
        self.assertIn('signup_date', filled.columns)
        pd.testing.assert_series_equal(
            filled['signup_date'], self.source['signup_date']
        )

    def test_empty_source(self):
        self.assertRaises(
            ValueError, detect_missing_columns, pd.DataFrame(), self.target
        )

    def test_none_input(self):
        # None on either side is rejected with ValueError.
        for args in ((None, self.target), (self.source, None)):
            self.assertRaises(ValueError, detect_missing_columns, *args)

    def test_wrong_type_input(self):
        # Non-DataFrame arguments on either side are rejected with TypeError.
        for args in (("not_a_df", self.target), (self.source, 123)):
            self.assertRaises(TypeError, detect_missing_columns, *args)

# Run the test suite when this code is executed directly.
# argv=[''] gives unittest an argument list with no options, so it does not
# parse the host process's command-line arguments; exit=False stops
# unittest.main from calling sys.exit() when the run finishes.
# NOTE(review): presumably chosen so the tests can run inside a notebook
# cell without killing the kernel — confirm.
if __name__ == '__main__':
    unittest.main(argv=[''], exit=False)


......
----------------------------------------------------------------------
Ran 6 tests in 0.010s

OK


Final target DataFrame:
    id          email signup_date
0   1  a@example.com  2023-01-01
1   2  b@example.com  2023-01-02
2   3  c@example.com  2023-01-03
