In [2]:
import os
import pandas as pd
from datetime import datetime, timedelta

In [2]:
def list_files_in_directory(directory_path):
    """Return the names of the regular files directly inside a directory.

    Sub-directories are excluded and the scan is not recursive. The names
    are returned in sorted order because ``os.listdir`` yields entries in
    arbitrary order; sorting makes the result reproducible across runs and
    platforms.

    Args:
        directory_path: Path of the directory to scan.

    Returns:
        A sorted list of file names (not full paths). If the directory
        cannot be read, a message is printed and an empty list is returned.
    """
    try:
        # Every entry (files and sub-directories) in the directory
        entries = os.listdir(directory_path)

        # Keep only regular files, dropping any sub-directories
        return sorted(
            name
            for name in entries
            if os.path.isfile(os.path.join(directory_path, name))
        )

    except FileNotFoundError:
        print(f"The directory {directory_path} does not exist.")
        return []

    except PermissionError:
        print(f"Permission denied to access the directory {directory_path}.")
        return []

    # Deliberate best-effort: any other failure is reported, not raised
    except Exception as e:
        print(f"An error occurred: {e}")
        return []

In [126]:
def process_cn_data(directory_path, csv_name, encoding="mbcs", years=range(2015, 2025)):
    """Load one product CSV and fill in its January/February values.

    The file is read skipping its first two lines, using the first column
    as the index. Only the first two data rows are kept and the table is
    transposed, so each row label is a period such as ``"Feb 2015"``. The
    first column is named after the file (extension removed) and the second
    gets the suffix ``"_acc"``; per the original comments the second column
    holds the accumulated (year-to-date) figure.

    For each year in ``years``:

    * if the February value is missing, half of February's accumulated
      value (rounded to one decimal) is written to both January and
      February;
    * otherwise January is set to February's accumulated value minus
      February's own value.

    Years whose ``"Feb <year>"`` label is not present in the file are
    skipped instead of raising ``KeyError``.

    Args:
        directory_path: Directory that contains the CSV file.
        csv_name: File name of the CSV, including its extension.
        encoding: Text encoding passed to ``pandas.read_csv``. The default
            ``"mbcs"`` only exists on Windows; pass e.g. ``"utf-8"``
            elsewhere.
        years: Iterable of years whose January/February rows are filled.

    Returns:
        A single-column DataFrame indexed by period label; the column is
        named after the file and the accumulated column is dropped.
    """
    # Feature name: the file name without its extension
    col_name = os.path.splitext(csv_name)[0]
    acc_name = col_name + "_acc"

    # os.path.join keeps the path valid on every platform
    dataframe = pd.read_csv(
        os.path.join(directory_path, csv_name),
        sep=",",
        encoding=encoding,
        skiprows=2,
        index_col=0,
    ).iloc[:2, :].T
    dataframe.columns = [col_name, acc_name]

    for year in years:
        # Index labels of this year's February and January rows
        feb = f"Feb {year}"
        jan = f"Jan {year}"

        # The file may not cover every requested year
        if feb not in dataframe.index:
            continue

        feb_acc = dataframe.loc[feb, acc_name]
        if pd.isna(dataframe.loc[feb, col_name]):
            # No separate February figure: split the accumulated value
            # evenly between January and February
            new_value = round(feb_acc / 2, 1)
            dataframe.loc[feb, col_name] = new_value
            dataframe.loc[jan, col_name] = new_value
        else:
            # January is what remains of the accumulated value once
            # February's own figure is removed
            dataframe.loc[jan, col_name] = feb_acc - dataframe.loc[feb, col_name]

    # The accumulated column was only needed for the fill-in above
    return dataframe.drop(columns=acc_name)

In [60]:
# Directory path that is scanned for input files and passed to the loader
directory_path = r"C:\Users\ITSC\Desktop\ChrisProject\FinalProject\nbs_industry\output_of_major_industrial_products"

In [80]:
# Temporary DataFrame created only so that its index values can be reused
# NOTE(review): `directory` and `file_name` are not defined in any cell
# visible here — presumably left over from an earlier session; confirm
# which directory/file was intended before re-running on a fresh kernel.
df = pd.read_csv(directory + "\\" + file_name, sep = ",", encoding='mbcs', skiprows = 2, index_col = 0).iloc[:2, :].T

In [127]:
file_list = list_files_in_directory(directory_path)

# Empty frame that only carries the row labels of the temporary `df`;
# it goes first so the combined table starts from those labels
index_frame = pd.DataFrame(df.index).set_index(0)

# Process every file first, then join all columns with a single concat.
# Concatenating inside the loop would re-copy the growing table on each
# iteration.
frames = [process_cn_data(directory_path, name) for name in file_list]
concat_df = pd.concat([index_frame, *frames], axis=1)

In [128]:
# Drop every feature (column) that contains at least one missing value
concat_df = concat_df.dropna(axis="columns", how="any")

In [129]:
# Bare expression: shows the combined DataFrame as the notebook cell output
concat_df

Unnamed: 0,air_conditioners_1,air_conditioners_2,alternating_current_motors,aluminum_alloy,aluminum_oxide,aluminum_products,beer,bullet_trains,cars,caustic_soda,...,tobaccos,toughened_glass,traditional_chinese_medicine,trucks,welded_steel_pipe,wine,wire_rod,xerox_and_hectograph_printing_equipment,yarn,zinc
Apr 2024,241.0,3033.1,2869.5,140.1,686.3,584.0,276.4,25.0,86.1,351.9,...,1979.7,4736.2,17.5,30.1,542.1,0.8,1075.3,19.4,187.1,58.6
Mar 2024,301.7,3083.3,3007.0,144.9,677.6,603.0,306.9,48.0,92.2,368.4,...,2021.6,5065.9,19.1,36.9,567.3,1.2,1136.0,21.3,198.9,65.1
Feb 2024,213.8,1881.8,2236.0,111.0,669.8,488.2,283.3,100.0,64.0,343.4,...,3190.2,4402.3,15.8,24.1,326.3,1.0,1057.2,18.0,168.2,62.0
Jan 2024,213.8,1881.8,2236.0,111.0,669.8,488.2,283.3,100.0,64.0,343.4,...,3190.2,4402.3,15.8,24.1,326.3,1.0,1057.2,18.0,168.2,62.0
Dec 2023,269.0,2152.9,3229.6,139.0,684.4,594.6,214.8,144.0,111.3,368.9,...,813.0,5257.1,21.4,29.1,537.4,2.2,1083.3,19.9,201.9,66.1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
May 2015,186.9,1676.9,2427.7,53.8,465.4,444.2,483.9,143.0,91.7,257.5,...,1972.1,3452.3,26.9,23.0,568.5,7.1,1333.5,61.4,330.6,53.1
Apr 2015,214.2,1798.0,2378.6,53.4,477.9,428.0,399.9,128.0,96.6,251.0,...,1920.7,3500.8,29.9,26.5,585.2,7.4,1334.7,62.5,320.9,52.2
Mar 2015,195.5,1720.9,2413.1,50.5,480.3,415.7,395.7,153.0,115.0,258.4,...,1928.4,3359.8,28.1,30.9,558.7,8.3,1310.3,57.6,315.3,49.0
Feb 2015,124.0,900.3,1663.2,38.6,412.3,311.7,282.3,168.0,83.2,237.2,...,1333.4,2406.1,23.8,23.3,334.0,7.1,992.0,60.1,247.6,46.8


In [141]:
# Write the combined DataFrame to a CSV file in the current working directory
concat_df.to_csv("./중국산업생산량.csv")