# Data Merging & Integration

## Overview
Combining multiple datasets is a key step in data analysis. This notebook demonstrates how to efficiently merge datasets using different join techniques, ensuring consistency and completeness in the final dataset.

Key steps include:
- Understanding different types of joins (`inner`, `left`, `right`, `outer`)
- Merging datasets based on common keys
- Handling duplicate records after merging
- Checking for inconsistencies in merged datasets

---

## Importing Required Libraries
This section loads the necessary libraries for merging datasets efficiently.

In [None]:
# Import Libraries
import os
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import scipy

from IPython.display import display, HTML

# Configure how pandas renders DataFrames so wide tables are shown in full:
# no column limit, a wide line width, untruncated cell contents, and floats
# formatted with thousands separators and two decimals.
_display_settings = {
    'display.max_columns': None,
    'display.width': 1000,
    'display.max_colwidth': None,
    'display.float_format': '{:,.2f}'.format,
}
for option_name, option_value in _display_settings.items():
    pd.set_option(option_name, option_value)

In [None]:
# Resolve every folder relative to the current working directory
# (the directory the notebook is running from).
script_dir = os.getcwd()

# "02 Data" sits next to the notebook's folder, i.e. one level up.
data_folder = os.path.join(script_dir, '..', '02 Data')

# Inputs are read from "Processed_data"; results and the text report are
# written to "Merged_data".
input_path, output_path = (
    os.path.join(data_folder, sub_folder)
    for sub_folder in ('Processed_data', 'Merged_data')
)
summary_report_path = os.path.join(output_path, 'summary_report.txt')

# Make sure the destination exists before anything tries to write to it.
os.makedirs(output_path, exist_ok=True)

# Echo the resolved locations so a wrong working directory is easy to spot.
for label, location in (
    ("Data folder:", data_folder),
    ("Input path:", input_path),
    ("Output path:", output_path),
    ("Summary report path:", summary_report_path),
):
    print(label, location)

In [None]:
# Verify the input folder exists, list its contents, and let the user choose
# which files to process (by number) or accept all of them.
#
# `selected_files` is bound up front so that every path through this cell
# (missing folder, invalid input, no valid numbers) leaves it defined as a
# list; the following cells read it unconditionally.
selected_files = []

if not os.path.exists(input_path):
    print(f"Error: The folder '{input_path}' does not exist. Please ensure the base folder is correct.")
else:
    # os.listdir returns entries in arbitrary order; sorting keeps the menu
    # numbering stable between runs.
    available_files = sorted(os.listdir(input_path))
    print("Available files in the input folder:")
    for idx, f in enumerate(available_files, start=1):
        print(f"{idx}. {f}")

    file_numbers_input = input(
        "\nEnter the file numbers to process (comma-separated), or leave blank to process all files: "
    ).strip()

    if file_numbers_input:
        try:
            indices = [int(num.strip()) for num in file_numbers_input.split(',') if num.strip()]
        except ValueError:
            print("Error: Please enter valid numbers separated by commas.")
        else:
            # Keep only numbers that refer to a listed file (menu is 1-based).
            valid_range = range(1, len(available_files) + 1)
            ignored = [i for i in indices if i not in valid_range]
            if ignored:
                print(f"Ignoring out-of-range file numbers: {ignored}")
            selected_files = [available_files[i - 1] for i in indices if i in valid_range]
            if not selected_files:
                print("No valid file numbers were entered.")
    else:
        selected_files = available_files

    print("\nFiles selected for processing:", selected_files)

## Loading Datasets
Here, we import the datasets that will be merged. Each dataset contains unique attributes that will be combined based on common identifiers.

In [None]:
# Load every selected file into `df`, a dict mapping file name -> DataFrame.
# Files that are missing or in an unsupported format are reported and skipped.
if not selected_files:
    # Fail with an actionable message instead of an IndexError on [0] below.
    raise ValueError("No files were selected; re-run the file selection cell first.")

# The first selected file is remembered under `current_file` for later cells.
current_file = selected_files[0]

df = {}
for file in selected_files:
    file_path = os.path.join(input_path, file)
    if not os.path.exists(file_path):
        print(f"File {file} not found and will be skipped.")
        continue

    # Match the extension case-insensitively so "DATA.CSV" is loaded too.
    lowered_name = file.lower()
    if lowered_name.endswith('.csv'):
        df[file] = pd.read_csv(file_path)
    elif lowered_name.endswith('.pkl'):
        # SECURITY NOTE: unpickling can execute arbitrary code; only load
        # pickle files that come from a trusted source.
        df[file] = pd.read_pickle(file_path)
    else:
        print(f"Skipping unsupported file format: {file}")
        continue  # Skip unsupported formats

    print(f"Loaded file:\n{file} (rows: {df[file].shape[0]}, columns: {df[file].shape[1]})")
    print("="*100 + "\n")

In [None]:
# Prompt user to choose operation type. Later cells branch on exactly
# "merge" or "concatenate", so keep asking until one of the two is entered
# (same re-prompt pattern as the output-format question further down).
operation_type = input("Choose operation (merge/concatenate): ").strip().lower()
while operation_type not in ('merge', 'concatenate'):
    operation_type = input("Invalid operation. Please enter 'merge' or 'concatenate': ").strip().lower()

In [None]:
# Show a short preview (at most 2 rows / 30 columns) of each loaded DataFrame,
# titled with its file name and wrapped in a horizontally scrollable box.
for file_name, data in df.items():
    html = data.to_html(max_rows=2, max_cols=30)
    heading = f'<h4>{file_name}</h4>'
    scroll_box = f'<div style="overflow-x: auto; width:100%;">{html}</div>'
    display(HTML(heading + scroll_box))

## Types of Joins in Pandas
The following merging methods are used:
- **Inner Join**: Includes only records whose key appears in both datasets
- **Left Join**: Includes all records from the left dataset and matching ones from the right
- **Right Join**: Includes all records from the right dataset and matching ones from the left
- **Outer Join**: Includes all records from both datasets, filling in missing values where necessary

In [None]:
if operation_type == "merge":
    # Ask for the join key and join type; the join type is normalised to
    # lower case and re-prompted until it is one pandas accepts here.
    common_key = input("Enter the key column to use for merging: ").strip()
    merge_type = input("Choose merge type (inner, outer, left, right): ").strip().lower()
    while merge_type not in ('inner', 'outer', 'left', 'right'):
        merge_type = input("Invalid merge type. Please enter inner, outer, left or right: ").strip().lower()

    # Fold all loaded DataFrames into one, left to right, starting from the
    # first loaded file. Stays None when nothing was loaded.
    merged_df = None

    for idx, (file, df_subset) in enumerate(df.items()):
        if idx == 0:
            merged_df = df_subset  # First file remains as base
        else:
            # pandas raises if indicator=True is requested while a '_merge'
            # column already exists, which happens from the third file on.
            # Drop the previous flag first; the surviving '_merge' column
            # therefore describes the most recent merge step only.
            if '_merge' in merged_df.columns:
                merged_df = merged_df.drop(columns='_merge')
            merged_df = merged_df.merge(df_subset, on=common_key, how=merge_type, indicator=True)

from IPython.display import display, HTML

# Check merge effectiveness: preview the merged result and summarise the
# '_merge' indicator column (how many rows matched on both sides vs. one side).
# Guarded on the operation type because `merged_df` only exists after a merge;
# without the guard this cell raises NameError when "concatenate" was chosen.
if operation_type == "merge" and merged_df is not None and '_merge' in merged_df.columns:
    print("\nMerged dataset preview:")

    # Convert DataFrame to HTML with horizontal scrolling
    html = merged_df.to_html(max_rows=2, max_cols=20)  # Adjust limits as needed
    display(HTML(f'<div style="overflow-x: auto; width:100%;">{html}</div>'))

    # Calculate merge statistics (absolute counts and share of all rows)
    merge_counts = merged_df['_merge'].value_counts()
    merge_percentages = merged_df['_merge'].value_counts(normalize=True) * 100

    # Display merge match percentage along with count
    print("\n" + "="*100)
    print("\nMerge match percentage and count:\n")
    merge_stats = pd.DataFrame({'Count': merge_counts, 'Percentage (%)': merge_percentages})
    print(merge_stats)

    # The '_merge' flag is kept for analysis; to remove it after the check run:
    # merged_df.drop(columns=['_merge'], inplace=True)

In [None]:
if operation_type == "concatenate":
    # Stack every loaded DataFrame vertically and renumber the index from 0.
    frames = list(df.values())
    concatenated_df = pd.concat(frames, ignore_index=True)

    # Report the resulting shape and show the first rows.
    n_rows, n_cols = concatenated_df.shape
    print(f"Concatenated DataFrame (rows: {n_rows}, columns: {n_cols})")
    print(concatenated_df.head())

## Post-Merge Data Integrity Check
After merging, it’s important to check:
- The total number of rows and columns
- Whether duplicate records have been created
- If any data is missing due to the join operation

In [None]:
# Compare the row count of the main imported dataset with the final dataset.
# The "main" dataset is the first one that was actually loaded: `df` is filled
# in selection order, so this equals selected_files[0] unless that file was
# skipped (missing / unsupported format), in which case indexing df with it
# would raise KeyError.
if not df:
    raise ValueError("No datasets were loaded; nothing to check.")
main_file = next(iter(df))
total_imported_rows = df[main_file].shape[0]

if operation_type == "merge":
    exported_rows = merged_df.shape[0]
elif operation_type == "concatenate":
    # NOTE(review): a concatenation of several non-empty files always has more
    # rows than the main file alone, so the warning below is expected here.
    exported_rows = concatenated_df.shape[0]
else:
    raise ValueError(f"Unknown operation type: {operation_type!r} (expected 'merge' or 'concatenate').")

# Display the row counts
print(f"Total rows in main imported dataset: {total_imported_rows}")
print(f"Final dataset rows: {exported_rows}\n")

# Warn if there is a mismatch
if total_imported_rows != exported_rows:
    print(f"⚠️ Warning: Shape mismatch! \nMain imported rows: {total_imported_rows} \nExported rows: {exported_rows}")
else:
    print(f"✅ Shape consistency check passed:\nMain imported rows: {total_imported_rows} \nExported rows: {exported_rows}")

In [None]:
# Ask user for file format preference: CSV or pkl
file_format = input("Enter desired output file format (csv or pkl): ").strip().lower()
while file_format not in ['csv', 'pkl']:
    file_format = input("Invalid format. Please enter 'csv' or 'pkl': ").strip().lower()

# Prompt the user for the file name (without extension). An empty name would
# produce a file called just ".csv" / ".pkl", so ask again until one is given.
output_filename = input("Enter the desired file name (without extension): ").strip()
while not output_filename:
    output_filename = input("File name cannot be empty. Please enter a file name: ").strip()
output_file = os.path.join(output_path, f"{output_filename}.{file_format}")

# Pick the DataFrame produced by the chosen operation. (Relying on a leftover
# `data` loop variable from the preview cell would save the last *input* file
# instead of the merged/concatenated result.)
if operation_type == "merge":
    data = merged_df
elif operation_type == "concatenate":
    data = concatenated_df
else:
    raise ValueError(f"Unknown operation type: {operation_type!r} (expected 'merge' or 'concatenate').")
if data is None:
    raise ValueError("There is no result to save; no datasets were loaded.")

# Save the processed DataFrame in the selected format
if file_format == 'csv':
    data.to_csv(output_file, index=False)
elif file_format == 'pkl':
    data.to_pickle(output_file)

print(f"\n✅ Processed file saved to: {output_file}")

# Collect report lines; create the list on first use if no earlier cell did.
try:
    report_details
except NameError:
    report_details = []
report_details.append(f"Processed file saved to: {output_file}")
report_details.append(f"Total rows in the exported file: {len(data)}")

# The loaded inputs in `df` are intentionally left untouched so the merge /
# concatenate cells can be re-run against the original data.

In [None]:
# Generate summary report: a small plain-text file recording which files were
# processed, the chosen operation (plus merge settings and match statistics
# when applicable) and where the result was saved.
# The encoding is set explicitly because file names/paths may contain
# non-ASCII characters that the platform default encoding cannot represent.
with open(summary_report_path, 'w', encoding='utf-8') as report:
    report.write("Data Processing Report\n")
    report.write("============================\n")
    report.write(f"Files processed: {', '.join(selected_files)}\n")
    report.write(f"Operation type: {operation_type}\n")

    if operation_type == "merge":
        report.write(f"Key column: {common_key}\n")
        report.write(f"Merge type: {merge_type}\n")
        # Statistics exist only when a merge produced the '_merge' indicator.
        if merged_df is not None and '_merge' in merged_df.columns:
            # Write counts and percentages together so the heading matches
            # the content (previously only the counts were written).
            report.write("Merge match percentage and count:\n")
            report.write(merge_stats.to_string() + "\n")

    report.write(f"File saved at: {output_file}\n")

print(f"Summary report saved: {summary_report_path}")