<a href="https://colab.research.google.com/github/jieun0441/XMC/blob/main/FMC_Code11_expanding_data_1year.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Setting to use a GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [None]:
# Report the runtime's memory size and whether it counts as a high-RAM runtime
from psutil import virtual_memory

# `.total` is the machine's total memory in bytes; dividing by 1e9 gives
# decimal gigabytes.
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Runtimes with at least 20 GB are reported as high-RAM.
print(
  'You are using a high-RAM runtime!'
  if ram_gb >= 20
  else 'Not using a high-RAM runtime'
)

Your runtime has 185.7 gigabytes of available RAM

You are using a high-RAM runtime!


Key features of this code:

- Handles each month in a loop.
- For each month, reads all zipped CSVs, preprocesses and accumulates data in memory (grouped by purpose).
- After the month is loaded, adds OD indicator columns and removes duplicates.
- Saves only the final "purpose_X_with_OD_indicators.csv" file for each purpose in each month.

In [None]:
import os
import pandas as pd
import zipfile
from pathlib import Path


In [None]:
# === Settings ===
# Input: one sub-folder per month (M01..M12) holding one zip per day.
# Output: one sub-folder per month holding one CSV per travel purpose.
root_folder = Path('/content/drive/MyDrive/FMC_Data/M_B_dong_raw_2023')
output_root = Path('/content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents')
output_root.mkdir(exist_ok=True)

year = 2023  # loop-invariant, so defined once instead of on every month iteration
months = [f'M{str(m).zfill(2)}' for m in range(1, 13)]
purposes = range(1, 8)  # MOVE_PURPOSE values that are exported (1..7)
# Two-digit code prefixes flagged by the *_SMA indicators. '11' is the prefix
# used for the *_Seoul indicators; '23' and '31' are presumably the other
# regions of the Seoul Metropolitan Area -- TODO confirm against the code table.
sma_prefixes = ['11', '23', '31']

for month in months:
    folder_path = root_folder / month
    output_folder = output_root / month
    output_folder.mkdir(exist_ok=True)
    print(f'Processing {month} ...')

    # Every calendar day of the month: first day through month end
    month_num = int(month[1:])
    start_date = pd.Timestamp(year, month_num, 1)
    end_date = start_date + pd.offsets.MonthEnd(1)
    date_range = pd.date_range(start_date, end_date)

    # Daily frames are collected per purpose and concatenated once per month
    purpose_data = {purpose: [] for purpose in purposes}

    for date in date_range:
        date_str = date.strftime('%Y%m%d')
        zip_name = f'seoul_purpose_admdong3_{date_str}.zip'
        zip_path = folder_path / zip_name
        if not zip_path.is_file():
            # Missing days are reported and skipped (best effort)
            print(f"File not found: {zip_path}")
            continue

        with zipfile.ZipFile(zip_path) as z:
            # Assume only one CSV in the zip
            csv_name = z.namelist()[0]
            with z.open(csv_name) as f:
                # The first two columns are forced to str so they are not
                # parsed as numbers
                df = pd.read_csv(f, dtype={0: str, 1: str}, low_memory=False)

        # Clean data: drop rows whose origin or destination code is missing,
        # either as a real NaN or as the literal '\N' null marker.
        has_codes = (
            df['O_ADMDONG_CD'].notnull() & (df['O_ADMDONG_CD'] != '\\N') &
            df['D_ADMDONG_CD'].notnull() & (df['D_ADMDONG_CD'] != '\\N')
        )
        # .copy() makes the filtered frame independent, so the column
        # assignments below never write into a slice of the raw frame
        # (no SettingWithCopyWarning) and the columns get a real int dtype.
        df = df[has_codes].copy()

        # Nulls were removed above, so a plain integer cast is sufficient
        df['O_ADMDONG_CD'] = df['O_ADMDONG_CD'].astype(int)
        df['D_ADMDONG_CD'] = df['D_ADMDONG_CD'].astype(int)

        # Keep rows with non-zero codes on both ends whose IN_FORN_DIV_NM is
        # "내국인" (the domestic-resident category this notebook exports)
        keep = (
            (df['O_ADMDONG_CD'] != 0) &
            (df['D_ADMDONG_CD'] != 0) &
            (df['IN_FORN_DIV_NM'] == "내국인")
        )
        df = df[keep].copy()
        df['date'] = date

        for purpose in purposes:
            df_purpose = df[df['MOVE_PURPOSE'] == purpose]
            if not df_purpose.empty:
                purpose_data[purpose].append(df_purpose)

    # After collecting all data for the month, process and save by purpose
    for purpose in purposes:
        if not purpose_data[purpose]:
            continue

        df_purpose_month = pd.concat(purpose_data[purpose], ignore_index=True)

        # Ensure codes are strings with at least two digits so the two-character
        # prefix used below is always defined
        df_purpose_month['O_ADMDONG_CD'] = df_purpose_month['O_ADMDONG_CD'].astype(str).str.zfill(2)
        df_purpose_month['D_ADMDONG_CD'] = df_purpose_month['D_ADMDONG_CD'].astype(str).str.zfill(2)

        # Create 0/1 indicator variables from the two-digit code prefix
        o_prefix = df_purpose_month['O_ADMDONG_CD'].str[:2]
        d_prefix = df_purpose_month['D_ADMDONG_CD'].str[:2]
        df_purpose_month['O_Seoul'] = o_prefix.eq('11').astype(int)
        df_purpose_month['D_Seoul'] = d_prefix.eq('11').astype(int)
        df_purpose_month['O_SMA'] = o_prefix.isin(sma_prefixes).astype(int)
        df_purpose_month['D_SMA'] = d_prefix.isin(sma_prefixes).astype(int)

        # Remove duplicate rows (all columns, including the date, must match)
        df_purpose_month = df_purpose_month.drop_duplicates()

        new_file_path = output_folder / f'purpose_{purpose}_with_OD_indicators.csv'
        df_purpose_month.to_csv(new_file_path, index=False)
        print(f'Saved as {new_file_path}')

Processing M01 ...
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_1_with_OD_indicators.csv
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_2_with_OD_indicators.csv
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_3_with_OD_indicators.csv
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_4_with_OD_indicators.csv
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_5_with_OD_indicators.csv
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_6_with_OD_indicators.csv
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M01/purpose_7_with_OD_indicators.csv
Processing M02 ...
Saved as /content/drive/MyDrive/FMC_Data/M_B_dong_data_by_purpose_domestic_residents/M02/purpose_1_with_OD_i