## Combines data from all years (1999-2021) into one big file

In [None]:
import pandas as pd
from pathlib import Path

In [None]:
# Canonical column order for the combined table. The order matters: it is the
# order in which labels are assigned to files that come without a header row.
columns = [
    # C_* columns
    "C_YEAR", "C_MNTH", "C_WDAY", "C_HOUR", "C_SEV", "C_VEHS",
    "C_CONF", "C_RCFG", "C_WTHR", "C_RSUR", "C_RALN", "C_TRAF",
    # V_* columns
    "V_ID", "V_TYPE", "V_YEAR",
    # P_* columns
    "P_ID", "P_SEX", "P_AGE", "P_PSN", "P_ISEV", "P_SAFE", "P_USER",
    # C_CASE sits last, after the P_* columns
    "C_CASE",
]

# Empty frame carrying only the schema above.
df = pd.DataFrame(columns=columns)

In [None]:

def read_file(file_path, columns=None):
    """Load one raw data file (CSV or Excel) into a DataFrame.

    Some raw files start with a header row and some do not. The first row is
    inspected: if any cell equals ``C_YEAR`` (ignoring case and surrounding
    whitespace) the file is treated as having a header.

    Args:
        file_path: Path to a ``.csv``, ``.xls`` or ``.xlsx`` file.
        columns: Column labels to assign when the file has no header row.
            Ignored for files that have a header. If ``None`` and the file
            has no header, pandas assigns its default integer labels.

    Returns:
        A DataFrame with the file contents. For files with a header, the
        labels are stripped and upper-cased so they match the form used by
        the header detection.

    Raises:
        ValueError: If the file extension is not ``.csv``, ``.xls`` or
            ``.xlsx``.
    """
    ext = Path(file_path).suffix.lower()

    if ext == '.csv':
        reader = pd.read_csv
    elif ext in ('.xls', '.xlsx'):
        reader = pd.read_excel
    else:
        raise ValueError(f"Unsupported file type: {file_path}")

    # Peek at the first row only, read as plain data, to decide whether it is
    # a header row or the first record.
    sample = reader(file_path, nrows=1, header=None)
    has_header = any(
        str(val).strip().upper() == "C_YEAR" for val in sample.iloc[0]
    )

    if not has_header:
        return reader(file_path, header=None, names=columns)

    frame = reader(file_path, header=0)
    # Detection above tolerates case/whitespace differences, so normalise the
    # labels the same way; otherwise a header such as "c_year" would be
    # accepted here but fail to line up with "C_YEAR" from other files.
    frame.columns = [str(label).strip().upper() for label in frame.columns]
    return frame

# Read every supported file under data/raw and stack them into one table.
#
# - `columns` is passed to read_file so that files without a header row get
#   the canonical labels instead of pandas' default integer labels (which
#   would otherwise show up as extra, misaligned columns after concat).
# - Frames are collected and concatenated once; concatenating inside the loop
#   recopies the accumulated data on every iteration.
# - Paths are sorted so the row order of the result is reproducible.
# NOTE(review): '.xls' is not collected here although read_file can parse it;
# left unchanged - confirm whether that is intentional.
frames = []
for f in sorted(Path("data/raw").iterdir()):
    print(f)
    if f.suffix.lower() in ['.csv', '.xlsx']:
        frames.append(read_file(f, columns=columns))

if frames:
    df_combined = pd.concat(frames, ignore_index=True)
    # Keep the canonical columns first and in schema order; any unexpected
    # columns found in the raw files are preserved after them.
    extra = [c for c in df_combined.columns if c not in columns]
    df_combined = df_combined.reindex(columns=columns + extra)
else:
    # No input files: fall back to an empty table with the expected schema.
    df_combined = pd.DataFrame(columns=columns)

In [None]:
# Persist the combined table as a single CSV inside the data directory,
# leaving out the DataFrame index.
data_dir = Path('data')
combined_csv_path = data_dir.joinpath('combined_data.csv')
df_combined.to_csv(combined_csv_path, index=False)

In [None]:
# Sanity check: show the (rows, columns) shape of the combined table.
# The original note expects (4677921, 23), marked as uncertain - TODO confirm.
df_combined.shape