In [75]:
import os
import pandas as pd
from glob import glob
from io import StringIO 

In [76]:
def parse_division_file(file_path):
    """
    Parse a parliamentary division vote file using pandas.
    Extract metadata and load the CSV portion directly.
    
    Parameters:
    file_path (str): Path to the division vote file
    
    Returns:
    pandas.DataFrame: DataFrame containing all votes with division metadata
    """
    # Read the entire file
    with open(file_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
    
    # Extract metadata
    metadata = {}
    for line in lines[:10]:  # Check the first 10 lines for metadata
        if 'Division Number:' in line:
            metadata['Division Number'] = line.split('Division Number:')[1].strip()
        elif 'Division Date:' in line:
            metadata['Division Date'] = line.split('Division Date:')[1].strip()
        elif 'Aye Count:' in line:
            metadata['Aye Count'] = int(line.split('Aye Count:')[1].strip())
        elif 'No Count:' in line:
            metadata['No Count'] = int(line.split('No Count:')[1].strip())
        elif 'Result:' in line:
            metadata['Result'] = line.split('Result:')[1].strip()
    
    # Topic is typically on line 4
    metadata['Division Topic'] = lines[3].strip()
    
    # Find the CSV header line
    header_index = 10
    
    # Extract CSV content
    csv_content = ''.join(lines[header_index:])

    df = pd.read_csv(StringIO(csv_content))
    
    # Add metadata as columns
    for key, value in metadata.items():
        df[key] = value
        
    # Add filename
    df['Source File'] = os.path.basename(file_path)

        
    return df

def process_all_files(directory_path, pattern='*.csv'):
    file_paths = glob(os.path.join(directory_path, pattern))
    print(f"Found {len(file_paths)} files")
    
    # Process each file and collect DataFrames
    all_dfs = []
    for i, file_path in enumerate(file_paths):
        try:
            df = parse_division_file(file_path)
            if not df.empty:
                all_dfs.append(df)
            
            # Show progress
            if (i+1) % 100 == 0 or i+1 == len(file_paths):
                print(f"Processed {i+1}/{len(file_paths)} files")
                
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
    
    # Combine all DataFrames
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print(f"Combined DataFrame has {len(combined_df)} rows")
    return combined_df

In [77]:
df = parse_division_file('parliament_divisions/Division1480.csv')

In [78]:
df

Unnamed: 0,Member,Party,Constituency,Vote,Proxy Member,Division Number,Division Date,Aye Count,No Count,Result,Division Topic,Source File
0,Diane Abbott,Labour,Hackney North and Stoke Newington,No Vote Recorded,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
1,Debbie Abrahams,Labour,Oldham East and Saddleworth,Aye,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
2,Nigel Adams,Conservative,Selby and Ainsty,No,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
3,Bim Afolami,Conservative,Hitchin and Harpenden,No,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
4,Adam Afriyie,Conservative,Windsor,No Vote Recorded,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
...,...,...,...,...,...,...,...,...,...,...,...,...
644,Jeremy Wright,Conservative,Kenilworth and Southam,No,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
645,Mohammad Yasin,Labour,Bedford,Aye,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
646,Jacob Young,Conservative,Redcar,No,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv
647,Nadhim Zahawi,Conservative,Stratford-on-Avon,No Vote Recorded,,174,7 February 2023 - 16:24,210,278,Question accordingly disagreed.,Seafarers' Wages Bill [Lords] Report Stage: Ne...,Division1480.csv


In [79]:
df = process_all_files('parliament_divisions')

Found 1886 files
Processed 100/1886 files
Processed 200/1886 files
Processed 300/1886 files
Processed 400/1886 files
Processed 500/1886 files
Processed 600/1886 files
Processed 700/1886 files
Processed 800/1886 files
Processed 900/1886 files
Processed 1000/1886 files
Processed 1100/1886 files
Processed 1200/1886 files
Processed 1300/1886 files
Processed 1400/1886 files
Processed 1500/1886 files
Processed 1600/1886 files
Processed 1700/1886 files
Processed 1800/1886 files
Processed 1886/1886 files
Combined DataFrame has 1224562 rows


In [80]:
df = df.drop(columns=['Diane Abbott','Labour','Hackney North and Stoke Newington','Aye','Unnamed: 4'])

In [81]:
def convert_date(date_str):
    try:
        # First try the full format with time
        return pd.to_datetime(date_str, format='%d %B %Y - %H:%M')
    except ValueError:
        try:
            # Try just the date format
            return pd.to_datetime(date_str, format='%d %B %Y')
        except ValueError:
            # If all else fails, use a flexible parser
            return pd.to_datetime(date_str, dayfirst=True, errors='coerce')

df['Division Date'] = df['Division Date'].apply(convert_date)

In [89]:
df["File Number"] = df["Source File"].str.extract(r'(\d+)').astype(int)

In [90]:
df.to_csv('parliament_divisions_combined.csv', index=False)