In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from datetime import datetime

In [20]:
# Load the Seoul air-pollution 2019 CSV export into a DataFrame and preview it.
input_file = '서울대기오염_2019.xlsx - Sheet1.csv'
df = pd.read_csv(filepath_or_buffer=input_file)
df.head(n=5)

Unnamed: 0,날짜,측정소명,미세먼지,초미세먼지,오존,이산화질소\nNO2 (ppm),일산화탄소\nCO (ppm),아황산가스\nSO2(ppm)
0,전체,평균,42.0,25.0,0.025,0.028,0.5,0.004
1,2019-12-31,평균,26.0,15.0,0.022,0.016,0.4,0.003
2,2019-12-31,강남구,22.0,14.0,0.025,0.014,0.4,0.003
3,2019-12-31,강동구,27.0,19.0,0.019,0.02,0.4,0.003
4,2019-12-31,강북구,31.0,17.0,0.022,0.022,0.4,0.002


In [29]:
# Rename the columns to English names and keep only the analysis variables.
df.columns = ['Date', 'District', 'PM10', 'PM2.5', 'Ozone', 'NO2', 'CO', 'SO2']

required_col = ['Date', 'District', 'PM10', 'PM2.5']

# Drop the rows whose District is '평균' ("average") and pick the analysis
# columns in a single .loc call. The explicit .copy() makes required_df an
# independent frame, so adding or overwriting columns on it afterwards does not
# raise SettingWithCopyWarning and never depends on whether pandas returned a
# view of df.
required_df = df.loc[df['District'] != '평균', required_col].copy()
required_df.head()


Unnamed: 0,Date,District,PM10,PM2.5
2,2019-12-31,강남구,22.0,14.0
3,2019-12-31,강동구,27.0,19.0
4,2019-12-31,강북구,31.0,17.0
5,2019-12-31,강서구,29.0,16.0
6,2019-12-31,관악구,36.0,18.0


In [30]:
# Check missing values, then impute them.
# Per-column NaN counts, taken before any imputation.
total_nans = required_df.isna().sum()

# Fill gaps in the numeric columns with that column's median. Non-numeric
# columns are untouched because median(numeric_only=True) has no entry for
# them. The result is rebound to the name instead of using inplace=True, so
# the cell does not mutate a frame that was derived from df by filtering.
required_df = required_df.fillna(required_df.median(numeric_only=True))
required_df.tail()


Unnamed: 0,Date,District,PM10,PM2.5
9486,2019-01-01,용산구,30.0,24.0
9487,2019-01-01,은평구,33.0,22.0
9488,2019-01-01,종로구,34.0,21.0
9489,2019-01-01,중구,36.0,25.0
9490,2019-01-01,중랑구,34.0,25.0


In [31]:
# Type conversion: Date string -> datetime64, measurement columns -> float.

numeric_col = ['PM10', 'PM2.5']

# Series.astype returns a new object, so the converted values have to be
# stored back; calling it without assignment leaves the frame unchanged.
# assign(...).astype({...}) builds a new frame with both conversions applied.
# errors='coerce' turns any Date value that does not match %Y-%m-%d into NaT.
required_df = required_df.assign(
    Date=pd.to_datetime(required_df['Date'], format='%Y-%m-%d', errors='coerce'),
).astype({col: float for col in numeric_col})

required_df.head()


Unnamed: 0,Date,District,PM10,PM2.5
2,2019-12-31,강남구,22.0,14.0
3,2019-12-31,강동구,27.0,19.0
4,2019-12-31,강북구,31.0,17.0
5,2019-12-31,강서구,29.0,16.0
6,2019-12-31,관악구,36.0,18.0


In [None]:
# [2-1] Derive the Month and Day features from the parsed Date column.
required_df = required_df.assign(
    Month=required_df['Date'].dt.month,
    Day=required_df['Date'].dt.day,
)

# [2-2] Derive the Season feature (Spring/Summer/Autumn/Winter) from Month.
# Months are grouped by meteorological season: Mar-May spring, Jun-Aug summer,
# Sep-Nov autumn, Dec-Feb winter.
season_by_month = {
    3: "Spring", 4: "Spring", 5: "Spring",
    6: "Summer", 7: "Summer", 8: "Summer",
    9: "Autumn", 10: "Autumn", 11: "Autumn",
    12: "Winter", 1: "Winter", 2: "Winter",
}

# Series.map keeps the frame's own index, so every row receives the season of
# its own month. Writing through .loc with enumerate() positions would not:
# .loc is label-based and the index has gaps after the '평균' rows were
# filtered out, so labels and positions disagree and missing labels would be
# appended as new rows. A missing Month maps to a missing Season.
required_df = required_df.assign(Season=required_df['Month'].map(season_by_month))

# Sanity check: the distinct season labels that were produced.
required_df['Season'].unique()


array(['Winter', 'Autumn', 'Summer', 'Spring'], dtype=object)

In [44]:
# [3-1] Review the final analysis dataset.
# [3-2] Save it as 'card_output.csv' (upload to GitHub or share via Google Drive).

required_df.to_csv(path_or_buf="card_output.csv", index=False)

