# **Interrupted Time Series Analysis Data Preprocessing**


## Preparation of LSOA-Level Data for Interrupted Time Series Analysis
This script transforms the spatially allocated LSOA prescription data into a structured format suitable for interrupted time series (ITS) analysis. The preprocessing pipeline establishes July 2022 as the intervention point, enabling evaluation of the effect of policy or clinical guideline changes on diabetes prescribing patterns across West Yorkshire.

Key preprocessing steps include:

(1) temporal variable construction with continuous time points and intervention indicators;

(2) derivation of three core clinical outcome measures: Insulin Dependency Ratio (IDR), Advanced Therapy Utilization (ATU), and Hypoglycemia Risk Management Ratio (HMR);

(3) comprehensive data quality validation including missing value assessment and time series completeness verification;

(4) pre-post intervention comparative analysis to identify preliminary trends;

(5) generation of both LSOA-level detailed data and monthly aggregated summaries.

The script creates standardized clinical indicators that reflect diabetes care quality and treatment patterns, while ensuring data integrity for subsequent statistical modeling. The resulting datasets enable robust evaluation of healthcare interventions using interrupted time series methodology, providing both individual LSOA trajectories and population-level trends for comprehensive policy impact assessment.

In [None]:
import pandas as pd
import numpy as np
from datetime import datetime

# Load LSOA-level data.
# The path is held in one variable so the initial read, the diagnostic peek at
# the header line and the separator fallback all refer to the same file.
data_path = 'WestYorkshire_Diabetes_Prescriptions_by_LSOA_Monthly.csv'
df = pd.read_csv(data_path, sep='\t')

# Check basic data information
print("="*60)
print("Original data check")
print("="*60)
print(f"Data dimensions: {df.shape}")
print(f"Number of columns: {len(df.columns)}")
print(f"First few column names: {df.columns[:5].tolist()}")

# A single parsed column means the tab separator did not split the rows, so
# show the raw header line and probe other common delimiters.
if len(df.columns) == 1:
    print("Warning: All data detected in one column, attempting to re-read...")
    # Check original data format
    with open(data_path, 'r', encoding='utf-8') as f:
        first_line = f.readline().strip()
        print(f"First line of file: {first_line}")

    # Try different separators
    for sep in [',', '\t', ';', '|']:
        try:
            # Parse a single row first: a cheap way to test a separator
            # before committing to a full read.
            df_test = pd.read_csv(data_path, sep=sep, nrows=1)
            if len(df_test.columns) > 1:
                print(f"Correct separator found: '{sep}', columns: {len(df_test.columns)}")
                df = pd.read_csv(data_path, sep=sep)
                break
        except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError):
            # Only parse/decoding failures mean "this separator does not fit".
            # Anything else (including Ctrl-C) must propagate instead of being
            # silently swallowed.
            continue
    else:
        # No candidate split the file; say so rather than carrying on silently
        # with the single-column frame.
        print("Warning: No separator produced more than one column; keeping the single-column read")

print(f"Final data dimensions: {df.shape}")
print(f"Column names: {df.columns.tolist()}")

# Display first few rows
print("\nFirst 5 rows:")
print(df.head())

# Check if key columns exist
required_columns = ['LSOA_CODE', 'date', 'Short-acting insulins_items', 'Intermediate and long-acting insulins_items']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"Warning: Missing key columns: {missing_columns}")
    print("Please check data file format")
else:
    print("Key columns present")
    print(f"Time range: {df['date'].min()} to {df['date'].max()}")
    print(f"Number of LSOAs: {df['LSOA_CODE'].nunique()}")

# Time variable processing
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(['LSOA_CODE', 'date'])

# 'year' and 'month' are selected into the final dataset later in this script
# but are never created by it; derive them from the date only when the input
# file did not already supply them.
if 'year' not in df.columns:
    df['year'] = df['date'].dt.year
if 'month' not in df.columns:
    df['month'] = df['date'].dt.month

# Create time series variables.
# time_point is a 1-based month index counted with calendar arithmetic
# (year/month difference from the first observed month). This replaces the
# days / 30.44 approximation, which depends on rounding landing on the right
# month.
start_date = df['date'].min()
df['time_point'] = (
    (df['date'].dt.year - start_date.year) * 12
    + (df['date'].dt.month - start_date.month)
    + 1
).astype(int)

# Intervention point: July 2022
intervention_date = pd.to_datetime('2022-07-01')
df['intervention'] = (df['date'] >= intervention_date).astype(int)

# Calculate post-intervention time point sequence.
# The intervention month is indexed with the same calendar arithmetic so it is
# an exact integer on the time_point scale; the previous unrounded value made
# post_intervention fractional instead of 1, 2, 3, ...
intervention_time_point = (
    (intervention_date.year - start_date.year) * 12
    + (intervention_date.month - start_date.month)
    + 1
)
# 0 before the intervention, then 1 in the intervention month, 2 the month
# after, and so on.
df['post_intervention'] = np.where(
    df['intervention'] == 1,
    df['time_point'] - intervention_time_point + 1,
    0
)

print("="*60)
print("Time variable processing results")
print("="*60)
print(f"Total time points: {df['time_point'].max()}")
print(f"Intervention time point: {intervention_time_point:.1f}")
print(f"Pre-intervention records: {(df['intervention']==0).sum()}")
print(f"Post-intervention records: {(df['intervention']==1).sum()}")

# Total diabetes prescription volume.
# The five item-count columns summed into the overall diabetes total.
diabetes_columns = [
    'Short-acting insulins_items',
    'Intermediate and long-acting insulins_items',
    'Sulfonylureas_items',
    'Biguanides_items',
    'Other antidiabetic drugs_items',
]

# Row-wise total across all five columns.
df['total_diabetes_items'] = df[diabetes_columns].sum(axis=1)

# Insulin total: short-acting plus intermediate/long-acting items.
short_acting_items = df['Short-acting insulins_items']
longer_acting_items = df['Intermediate and long-acting insulins_items']
df['total_insulin_items'] = short_acting_items + longer_acting_items

section_rule = "=" * 60
print(section_rule)
print("Prescription volume calculation results")
print(section_rule)
print("Drug category prescription statistics:")
for drug_column in diabetes_columns:
    items = df[drug_column]
    print(f"{drug_column}: Total={items.sum():.0f}, Mean={items.mean():.2f}, Max={items.max():.0f}")

all_items = df['total_diabetes_items']
insulin_items = df['total_insulin_items']
print(f"\nTotal diabetes prescriptions: Total={all_items.sum():.0f}, Mean={all_items.mean():.2f}")
print(f"Total insulin prescriptions: Total={insulin_items.sum():.0f}, Mean={insulin_items.mean():.2f}")

# Three core indicators, each a percentage that falls back to 0 wherever its
# denominator is not positive.
diabetes_total = df['total_diabetes_items']

# Indicator 1: Insulin Dependency Ratio (IDR)
# Insulin items as a percentage of all diabetes items.
df['IDR'] = np.where(
    diabetes_total > 0,
    (df['total_insulin_items'] / diabetes_total) * 100,
    0
)

# Indicator 2: Advanced Therapy Utilization (ATU)
# 'Other antidiabetic drugs' items as a percentage of the three non-insulin
# columns combined.
df['oral_medications'] = (
    df['Sulfonylureas_items']
    + df['Biguanides_items']
    + df['Other antidiabetic drugs_items']
)
non_insulin_total = df['oral_medications']
df['ATU'] = np.where(
    non_insulin_total > 0,
    (df['Other antidiabetic drugs_items'] / non_insulin_total) * 100,
    0
)

# Indicator 3: Hypoglycemia Risk Management Ratio (HMR)
# Hypoglycaemia-treatment items as a percentage of all diabetes items.
df['HMR'] = np.where(
    diabetes_total > 0,
    (df['Treatment of hypoglycaemia_items'] / diabetes_total) * 100,
    0
)

print("="*60)
print("Core indicator calculation results")
print("="*60)

# One summary per indicator; the headings carry their own leading blank line
# so the printed layout is unchanged.
indicator_headings = [
    ('IDR', "IDR (Insulin Dependency Ratio) statistics:"),
    ('ATU', "\nATU (Advanced Therapy Utilization) statistics:"),
    ('HMR', "\nHMR (Hypoglycemia Management Ratio) statistics:"),
]
for indicator_column, heading in indicator_headings:
    values = df[indicator_column]
    print(heading)
    print(f"  Mean: {values.mean():.2f}%")
    print(f"  Median: {values.median():.2f}%")
    print(f"  Standard deviation: {values.std():.2f}%")
    print(f"  Minimum: {values.min():.2f}%")
    print(f"  Maximum: {values.max():.2f}%")
    print(f"  Zero value records: {(values==0).sum()}")

# Calculate other useful indicators
# Short-acting insulin ratio (within total insulin); 0 where no insulin items.
df['short_acting_insulin_ratio'] = np.where(
    df['total_insulin_items'] > 0,
    (df['Short-acting insulins_items'] / df['total_insulin_items']) * 100,
    0
)

# Metformin ratio: Biguanides items as a percentage of all diabetes items;
# 0 where there are no diabetes items.
df['metformin_ratio'] = np.where(
    df['total_diabetes_items'] > 0,
    (df['Biguanides_items'] / df['total_diabetes_items']) * 100,
    0
)

# Per capita prescription volume (per thousand people).
# Every other ratio in this script guards its denominator; this one did not,
# so a zero population produced inf and made the printed mean inf as well.
# Non-positive populations are masked to NaN, leaving the rate undefined:
# a 0 here would wrongly claim "no prescribing", and NaN is skipped by mean().
population = df['LSOA_POPULATION']
valid_population = population.where(population > 0)
df['items_per_capita'] = df['total_diabetes_items'] / valid_population * 1000

print("="*60)
print("Auxiliary indicator calculation results")
print("="*60)
print(f"Short-acting insulin ratio: Mean={df['short_acting_insulin_ratio'].mean():.2f}%")
print(f"Metformin ratio: Mean={df['metformin_ratio'].mean():.2f}%")
print(f"Prescriptions per thousand people: Mean={df['items_per_capita'].mean():.2f}")

# Pre-post intervention comparison: mean and median of each core indicator on
# either side of the intervention flag, plus the post-minus-pre change.
print("="*60)
print("Pre-post intervention comparison analysis")
print("="*60)

# Split rows by the 0/1 intervention flag.
intervention_flag = df['intervention']
pre_intervention = df[intervention_flag == 0]
post_intervention = df[intervention_flag == 1]

# Same four statistics for every indicator, keyed by indicator name.
comparison_stats = {
    indicator_name: {
        'pre_mean': pre_intervention[indicator_name].mean(),
        'post_mean': post_intervention[indicator_name].mean(),
        'pre_median': pre_intervention[indicator_name].median(),
        'post_median': post_intervention[indicator_name].median(),
    }
    for indicator_name in ('IDR', 'ATU', 'HMR')
}

for indicator_name, summary in comparison_stats.items():
    mean_change = summary['post_mean'] - summary['pre_mean']
    median_change = summary['post_median'] - summary['pre_median']
    print(f"\n{indicator_name} indicator:")
    print(f"  Pre-intervention mean: {summary['pre_mean']:.3f}")
    print(f"  Post-intervention mean: {summary['post_mean']:.3f}")
    print(f"  Mean change: {mean_change:.3f}")
    print(f"  Pre-intervention median: {summary['pre_median']:.3f}")
    print(f"  Post-intervention median: {summary['post_median']:.3f}")
    print(f"  Median change: {median_change:.3f}")

# Data completeness check
print("="*60)
print("Data completeness check")
print("="*60)

# Check missing values: per-column null counts, listing only affected columns.
missing_counts = df.isnull().sum()
if missing_counts.sum() > 0:
    print("Missing values found:")
    print(missing_counts[missing_counts > 0])
else:
    print("No missing values")

# Check time continuity.
# Count DISTINCT time points per LSOA: a plain row count lets a duplicated
# month mask a missing one and report the series as complete.
lsoa_time_counts = df.groupby('LSOA_CODE')['time_point'].nunique()
# time_point is built elsewhere in this script as a 1-based month index, so its
# maximum is the number of months a complete series should have.
expected_time_points = df['time_point'].max()
incomplete_lsoas = lsoa_time_counts[lsoa_time_counts < expected_time_points]

# Repeated LSOA-month rows would be double-counted by later aggregation, so
# surface them here.
duplicate_rows = df.duplicated(subset=['LSOA_CODE', 'time_point']).sum()

print("\nTime series completeness:")
print(f"  Expected time points: {expected_time_points}")
print(f"  Number of LSOAs: {df['LSOA_CODE'].nunique()}")
if len(incomplete_lsoas) > 0:
    print(f"  Warning: LSOAs with incomplete time series: {len(incomplete_lsoas)}")
    print(f"  Minimum time points: {incomplete_lsoas.min()}")
else:
    print("  All LSOA time series complete")
if duplicate_rows > 0:
    print(f"  Warning: Duplicate LSOA/time point rows: {duplicate_rows}")

# Label each row with its study period, derived from the 0/1 intervention flag.
period_labels = {0: 'pre_intervention', 1: 'post_intervention'}
df['period'] = df['intervention'].map(period_labels)

# Final analysis frame: columns are listed by theme and concatenated in the
# order they should appear in the output file.
identifier_and_time_columns = [
    'LSOA_CODE', 'date', 'year', 'month', 'time_point',
    'intervention', 'post_intervention', 'period',
]
prescription_volume_columns = [
    'Short-acting insulins_items', 'Intermediate and long-acting insulins_items',
    'Sulfonylureas_items', 'Biguanides_items', 'Other antidiabetic drugs_items',
    'Treatment of hypoglycaemia_items', 'total_diabetes_items', 'total_insulin_items',
]
core_indicator_columns = ['IDR', 'ATU', 'HMR']
other_indicator_columns = ['short_acting_insulin_ratio', 'metformin_ratio', 'items_per_capita']
control_and_population_columns = ['LSOA_POPULATION', 'NUM_GPS_SERVING', 'LSOA_NAME']

its_columns = (
    identifier_and_time_columns
    + prescription_volume_columns
    + core_indicator_columns
    + other_indicator_columns
    + control_and_population_columns
)
# Copy so later changes to its_data cannot touch df.
its_data = df[its_columns].copy()

# Write the LSOA-level dataset without the row index.
its_data.to_csv('diabetes_its_analysis_data_lsoa.csv', index=False)

print("="*60)
print("Data organization completed")
print("="*60)
print(f"Final data dimensions: {its_data.shape}")
print(f"Saved file: diabetes_its_analysis_data_lsoa.csv")

# Monthly summary statistics (for R analysis preparation): one row per month,
# aggregating across LSOAs.
# The time/intervention descriptors are all grouping keys so they are carried
# through to the summary.
month_keys = ['date', 'year', 'month', 'time_point', 'intervention', 'post_intervention', 'period']

# Each core indicator gets the same four statistics; volume and population get
# their own.
summary_spec = {
    indicator_name: ['mean', 'median', 'std', 'count']
    for indicator_name in ('IDR', 'ATU', 'HMR')
}
summary_spec['total_diabetes_items'] = ['sum', 'mean', 'median']
summary_spec['LSOA_POPULATION'] = ['sum']

# NOTE: aggregating with lists of functions yields two-level column labels
# (variable, statistic), so the CSV below is written with a multi-row header.
monthly_summary = its_data.groupby(month_keys).agg(summary_spec).round(3)

print("\nMonthly summary statistics preview:")
print(monthly_summary.head())

# Save monthly summary (the grouping keys are kept as the index columns).
monthly_summary.to_csv('monthly_summary_for_its.csv')
print("\nMonthly summary data saved: monthly_summary_for_its.csv")

closing_rule = "="*60
print("\n" + closing_rule)
print("Data preparation completed!")
print("Next step: Use R for ITS analysis")
print(closing_rule)