In [1]:
# Setting to use a GPU
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

/bin/bash: line 1: nvidia-smi: command not found


In [2]:
# Report the machine's total memory and classify the runtime as high-RAM or not
from psutil import virtual_memory

# `total` is divided by 1e9, i.e. decimal gigabytes
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# 20 GB is the cut-off used here for a "high-RAM" runtime
is_high_ram = ram_gb >= 20
print('You are using a high-RAM runtime!' if is_high_ram else 'Not using a high-RAM runtime')

Your runtime has 185.7 gigabytes of available RAM

You are using a high-RAM runtime!


In [3]:
# Import necessary libraries
# Data handling
import pandas as pd
import numpy as np
# File discovery and path handling
import glob
import os
# Plotting
import matplotlib.pyplot as plt
import seaborn as sns
# Statistics
from scipy.stats import pearsonr
# Colab-only helper: mounts Google Drive into the runtime's file system
from google.colab import drive
# Later cells read and write under /content/drive/MyDrive, so this must run first
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Split the daily CSV files into one CSV per MOVE_PURPOSE value (1-7), keeping only
# rows with usable origin/destination codes and IN_FORN_DIV_NM == "내국인".
#
# NOTE: output files are opened in append mode, so different date ranges can be
# processed in separate runs. Re-running this cell for dates that were already
# processed appends the same rows again.

# Define folder path
folder_path = '/content/drive/MyDrive/FMC_Data/M-B_dong_202303'
output_folder = '/content/drive/MyDrive/FMC_Data/M-B_data_by_purpose_domestic_residents'
os.makedirs(output_folder, exist_ok=True)

date_range = pd.date_range('2023-03-25', '2023-03-31')

for date in date_range:
    date_str = date.strftime('%Y%m%d')
    file_name = f'seoul_purpose_admdong3_{date_str}.csv'
    file_path = os.path.join(folder_path, file_name)

    if os.path.isfile(file_path):
        # Columns at positions 0 and 1 are read as text
        # (presumably the two *_ADMDONG_CD columns - confirm against the raw files).
        df = pd.read_csv(file_path, dtype={0: str, 1: str}, low_memory=False)

        # Clean data: drop rows whose origin or destination code is null or the
        # literal placeholder string \N.
        has_valid_codes = (
            (df['O_ADMDONG_CD'] != '\\N') & (df['O_ADMDONG_CD'].notnull()) &
            (df['D_ADMDONG_CD'] != '\\N') & (df['D_ADMDONG_CD'].notnull())
        )
        # Explicit copy: the column assignments below then modify an independent
        # frame instead of a slice of the original (no SettingWithCopyWarning).
        df = df[has_valid_codes].copy()

        # Plain column assignment replaces the column, so the result is a real
        # integer dtype. Nulls were removed above, so no fillna is needed.
        df['O_ADMDONG_CD'] = df['O_ADMDONG_CD'].astype(int)
        df['D_ADMDONG_CD'] = df['D_ADMDONG_CD'].astype(int)
        df = df[(df['O_ADMDONG_CD'] != 0) & (df['D_ADMDONG_CD'] != 0)]

        df = df[df['IN_FORN_DIV_NM'] == "내국인"].copy()
        df['date'] = date

        for purpose in range(1, 8):
            df_purpose = df[df['MOVE_PURPOSE'] == purpose]
            if not df_purpose.empty:
                output_file = os.path.join(output_folder, f'purpose_{purpose}.csv')
                # Write header only if file does not exist
                write_header = not os.path.exists(output_file)
                df_purpose.to_csv(output_file, mode='a', index=False, header=write_header)
    else:
        print(f"File not found: {file_path}")

In [8]:
# Show the date string left over from the last iteration of the date loop
date_str

'20230331'

### 4개의 지역 indicator 변수 생성

In [None]:
#@title Origin, Destination 범주 더미 변수 생성
# Folder holding the per-purpose CSV files
folder = '/content/drive/MyDrive/FMC_Data/M-B_data_by_purpose_domestic_residents'

for i in range(1, 8):
    file_path = os.path.join(folder, f'purpose_{i}.csv')
    print(f'Processing {file_path}...')

    # Read the per-purpose table
    df = pd.read_csv(file_path)

    # Cast both code columns to text, left-padded with zeros to a minimum width of 2
    for code_col in ('O_ADMDONG_CD', 'D_ADMDONG_CD'):
        df[code_col] = df[code_col].astype(str).str.zfill(2)

    # All four indicators depend only on the first two characters of each code
    origin_prefix = df['O_ADMDONG_CD'].str[:2]
    dest_prefix = df['D_ADMDONG_CD'].str[:2]
    # Prefixes counted for the *_SMA flags
    # (presumably the Seoul Metropolitan Area region codes - TODO confirm)
    sma_prefixes = ['11', '23', '31']

    # 0/1 flags: prefix '11' -> *_Seoul; prefix in sma_prefixes -> *_SMA
    df['O_Seoul'] = (origin_prefix == '11').astype(int)
    df['D_Seoul'] = (dest_prefix == '11').astype(int)
    df['O_SMA'] = origin_prefix.isin(sma_prefixes).astype(int)
    df['D_SMA'] = dest_prefix.isin(sma_prefixes).astype(int)

    # Drop rows that are exact duplicates across every column
    df = df.drop_duplicates()

    # Write the result next to the input under a new name
    new_file_path = os.path.join(folder, f'purpose_{i}_with_OD_indicators.csv')
    df.to_csv(new_file_path, index=False)
    print(f'Saved as {new_file_path}')

In [10]:
import os
import pandas as pd

# Folder holding the per-purpose CSV files
folder = '/content/drive/MyDrive/FMC_Data/M-B_data_by_purpose_domestic_residents'

# Single-file run: only purpose_7.csv is handled here
i = 7
file_path = os.path.join(folder, f'purpose_{i}.csv')
print(f'Processing {file_path}...')

# Read the per-purpose table
df = pd.read_csv(file_path)

# Cast both code columns to text, left-padded with zeros to a minimum width of 2
for code_col in ('O_ADMDONG_CD', 'D_ADMDONG_CD'):
    df[code_col] = df[code_col].astype(str).str.zfill(2)

# All four indicators depend only on the first two characters of each code
origin_prefix = df['O_ADMDONG_CD'].str[:2]
dest_prefix = df['D_ADMDONG_CD'].str[:2]
# Prefixes counted for the *_SMA flags
# (presumably the Seoul Metropolitan Area region codes - TODO confirm)
sma_prefixes = ['11', '23', '31']

# 0/1 flags: prefix '11' -> *_Seoul; prefix in sma_prefixes -> *_SMA
df['O_Seoul'] = (origin_prefix == '11').astype(int)
df['D_Seoul'] = (dest_prefix == '11').astype(int)
df['O_SMA'] = origin_prefix.isin(sma_prefixes).astype(int)
df['D_SMA'] = dest_prefix.isin(sma_prefixes).astype(int)

# Drop rows that are exact duplicates across every column
df = df.drop_duplicates()

# Write the result next to the input under a new name
new_file_path = os.path.join(folder, f'purpose_{i}_with_OD_indicators.csv')
df.to_csv(new_file_path, index=False)
print(f'Saved as {new_file_path}')

Processing /content/drive/MyDrive/FMC_Data/M-B_data_by_purpose_domestic_residents/purpose_7.csv...
Saved as /content/drive/MyDrive/FMC_Data/M-B_data_by_purpose_domestic_residents/purpose_7_with_OD_indicators.csv


- 목적 2, 4, 5, 6, 7만 남김
- O, D 동 이름이 결측인 경우 제외
- Origin, Destination 범주 더미 변수 생성
- 서울 to 서울


In [12]:
# Preview the first rows of the frame, including the new indicator columns
df.head()

Unnamed: 0,O_ADMDONG_CD,D_ADMDONG_CD,ST_TIME_CD,FNS_TIME_CD,IN_FORN_DIV_NM,FORN_CITIZ_NM,MOVE_PURPOSE,MOVE_DIST,MOVE_TIME,CNT,ETL_YMD,date,O_Seoul,D_Seoul,O_SMA,D_SMA
0,11110515,11110515,0,0,내국인,한국,7,407.3,1.97,5.22,20230301,2023-03-01,1,1,1,1
1,11110515,11380590,0,0,내국인,한국,7,4474.9,17.473333,4.5,20230301,2023-03-01,1,1,1,1
2,11110515,11410615,0,0,내국인,한국,7,4069.7,7.878333,2.91,20230301,2023-03-01,1,1,1,1
3,11110530,11110515,0,0,내국인,한국,7,626.5,0.538333,2.45,20230301,2023-03-01,1,1,1,1
4,11110530,11110530,0,0,내국인,ㆍ값없음,7,434.65,0.563333,6.1,20230301,2023-03-01,1,1,1,1


In [None]:
# Distribution of counts for each purpose
# I want to see the portion of trips for each purpose
