In [1]:
import pandas as pd
import numpy as np

import re
from itertools import chain
from collections import Counter

from google.cloud import bigquery
from google.oauth2 import service_account

from datetime import datetime, timedelta

import requests
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Silence urllib3's InsecureRequestWarning
warnings.simplefilter(action='ignore', category=InsecureRequestWarning)

In [2]:
# Directory that holds the credential files; the paths below are built by
# plain string concatenation, so the trailing slash is required.
KEY_PATH = ".config/"
servicekey_path = f"{KEY_PATH}serviceKey.json"  # key for APIs other than BigQuery
bigquerykey_path = f"{KEY_PATH}mido-project-426906-31b49963ac97.json"

# Suppress every warning for the rest of the session.
warnings.filterwarnings(action="ignore")

In [3]:
# Factory for an authenticated BigQuery client
def create_bigquery_client(key_path):
    """Build a BigQuery client from a service-account key file.

    Args:
        key_path: Path to the service-account JSON key file.

    Returns:
        A ``bigquery.Client`` that uses the key's credentials and the
        project id recorded in those credentials.
    """
    creds = service_account.Credentials.from_service_account_file(key_path)
    return bigquery.Client(project=creds.project_id, credentials=creds)

In [4]:
def save_dataframe_to_bigquery(df, dataset_id, table_id, key_path):
    """Load a DataFrame into a BigQuery table, replacing its current contents.

    Args:
        df: DataFrame to upload.
        dataset_id: Target dataset id (inside the client's project).
        table_id: Target table id.
        key_path: Path to the service-account JSON key file.

    The call blocks until the load job has finished and then prints a
    confirmation message.
    """
    client = create_bigquery_client(key_path)

    # Build the table reference explicitly; Client.dataset() is deprecated.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    # WRITE_TRUNCATE: existing table contents are dropped before the insert.
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )

    load_job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    load_job.result()  # wait for the load job to complete

    print(f"Data inserted into table {table_id} successfully.")

In [5]:
def get_dataframe_from_bigquery(dataset_id, table_id, key_path):
    """Read a BigQuery table into a pandas DataFrame.

    Args:
        dataset_id: Dataset id (inside the client's project).
        table_id: Table id to read.
        key_path: Path to the service-account JSON key file.

    Returns:
        DataFrame built from the rows returned by ``client.list_rows``.
    """
    client = create_bigquery_client(key_path)

    # Build the table reference explicitly; Client.dataset() is deprecated.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    return client.list_rows(table_ref).to_dataframe()

In [6]:
# Read the clock once so `today` and `ytday` are derived from the same
# instant (two separate datetime.today() calls could straddle midnight).
_now = datetime.today()

# Today's date as 'YYYYMMDD'
today = _now.strftime('%Y%m%d')

# Start from yesterday; if that is a Saturday (weekday 5) or Sunday
# (weekday 6), roll back to the preceding Friday.
_prev_day = _now - timedelta(days=1)
_weekend_rollback_days = {5: 1, 6: 2}
_prev_day -= timedelta(days=_weekend_rollback_days.get(_prev_day.weekday(), 0))

# 'YYYYMMDD' string, same format as `today`
ytday = _prev_day.strftime('%Y%m%d')

#### 종합쇼핑몰 납품상세내역

In [172]:
all_shop_df = get_dataframe_from_bigquery(
    dataset_id='g2b',
    table_id='shop_detail_df_all',
    key_path=bigquerykey_path,
)

In [173]:
# Columns kept for the analysis, then newest delivery-request date first.
_shop_columns = [
    '납품요구접수일자', '수요기관명', '납품요구건명', '업체명',
    '단가', '단위', '수량', '금액',
    '수요기관코드', '수요기관구분', '수요기관지역명', '납품요구지청명',
]
all_shop_df_fin = (
    all_shop_df[_shop_columns]
    .sort_values(by=['납품요구접수일자'], ascending=False)
    .reset_index(drop=True)
)

In [174]:
# Strip special characters, digits and Latin letters; collapse whitespace runs
def clean_text(text):
    """Keep only Hangul characters and single spaces.

    Every character that is neither Hangul (jamo ㄱ-ㅎ / ㅏ-ㅣ, syllables
    가-힣) nor whitespace is removed, runs of whitespace are collapsed to one
    space and the result is stripped.

    Args:
        text: Raw text. Non-string values (e.g. None or NaN coming from a
            DataFrame column) are treated as empty text.

    Returns:
        The cleaned string; '' when `text` is not a string.
    """
    if not isinstance(text, str):
        return ''
    # Raw strings keep `\s` a regex escape instead of an invalid string escape.
    hangul_only = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
    return re.sub(r'\s+', ' ', hangul_only).strip()

In [175]:
# Project-name preprocessing: clean the text, drop school-type words, trim.
_shop_names = all_shop_df_fin['납품요구건명'].apply(clean_text)
for _school_word in ('초등학교', '중학교', '고등학교'):
    _shop_names = _shop_names.str.replace(_school_word, '')
all_shop_df_fin['납품요구건명_re'] = _shop_names.str.strip()

In [158]:
# Project-name keyword filtering
def _flatten_tokens(token_lists):
    """Flatten per-row token lists into one list, skipping non-list rows and empty tokens."""
    return [
        token
        for tokens in token_lists
        if isinstance(tokens, list)
        for token in tokens
        if token
    ]


def _rare_keyword_pattern(keyword_counts):
    """Join the keywords whose count is at or below the mean count with '|'."""
    rare = keyword_counts[keyword_counts <= keyword_counts.mean()]
    return '|'.join(rare.keys())


# Tokenise once. Splitting on whitespace (no explicit separator) never yields
# empty tokens, unlike split(' '), which produces '' wherever removing a
# school word left a double space. An empty token inside the '|'-joined
# pattern would be an empty alternative that matches every row.
_name_tokens = all_shop_df_fin['납품요구건명_re'].str.split()

mapping_keywd_all = _flatten_tokens(_name_tokens)  ## every keyword
mapping_keywd_all_filter_cnt = pd.Series(Counter(mapping_keywd_all))

mapping_keywd3 = _flatten_tokens(_name_tokens.str[:3])  ## first 3 keywords per name
mapping_keywd3_filter_cnt = pd.Series(Counter(mapping_keywd3))

mapping_keywd2 = _flatten_tokens(_name_tokens.str[:2])  ## first 2 keywords per name
mapping_keywd2_filter_cnt = pd.Series(Counter(mapping_keywd2))

mapping_keywd1 = _flatten_tokens(_name_tokens.str[:1])  ## first keyword per name
mapping_keywd1_filter_cnt = pd.Series(Counter(mapping_keywd1))

# Regex alternations of the keywords that occur no more often than the mean.
mapping_keywd_all_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd_all_filter_cnt)
mapping_keywd3_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd3_filter_cnt)
mapping_keywd2_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd2_filter_cnt)
mapping_keywd1_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd1_filter_cnt)

In [159]:
# Demand-institution region filter: regex alternation of the second
# space-separated token of each region name.
_district_tokens = all_shop_df_fin['수요기관지역명'].str.split(' ').str[1].dropna()

# Deduplicate (the original emitted one alternative per row), drop empty
# tokens (an empty alternative matches everything) and escape each name so it
# is matched literally by str.contains.
dist_nm = '|'.join(re.escape(name) for name in _district_tokens.unique() if name)

In [198]:
# Frequency table over every keyword, shown with the most frequent first
_keyword_counts = Counter(mapping_keywd_all)
mapping_keywd_df = pd.DataFrame(
    {
        '키워드': list(_keyword_counts.keys()),
        '빈도수': list(_keyword_counts.values()),
    }
)
mapping_keywd_df.sort_values(by='빈도수', ascending=False)

Unnamed: 0,키워드,빈도수
10,인조잔디,336
57,관급자재,217
19,구입,171
16,관급자재인조잔디,163
12,구매,100
...,...,...
597,작은섬공원,1
598,통,1
599,송정배수지공원,1
601,인제,1


In [160]:
# Grade the shopping-mall keywords by how often they occur
def _keywords_with_count_at_least(min_count):
    """Return the set of keywords whose frequency is >= `min_count`."""
    return set(mapping_keywd_all_filter_cnt[mapping_keywd_all_filter_cnt >= min_count].keys())


# (label, inclusive lower bound, exclusive upper bound or None for open-ended)
_grade_bands = (
    ('1등급 키워드 : ', 50, None),
    ('2등급 키워드 : ', 40, 50),
    ('3등급 키워드 : ', 30, 40),
    ('4등급 키워드 : ', 20, 30),
    ('5등급 키워드 : ', 10, 20),
    ('기타 키워드 : ', 1, 10),
)
for _label, _lower, _upper in _grade_bands:
    _band = _keywords_with_count_at_least(_lower)
    if _upper is not None:
        _band = _band - _keywords_with_count_at_least(_upper)
    print(_label, _band)

1등급 키워드 :  {'인조잔디', '관급자재', '게이트볼장', '관급자재인조잔디', '조성공사', '테니스장', '공사', '구매', '체육시설', '운동장', '및', '구입'}
2등급 키워드 :  set()
3등급 키워드 :  {'조성사업', '설치', '축구장', '정비공사', '관급인조잔디', '교체공사'}
4등급 키워드 :  {'구입인조잔디', '파크골프장', '개선공사', '족구장', '조성', '풋살장'}
5등급 키워드 :  {'정비사업', '조성공사인조잔디', '설치공사', '체육공원', '다목적', '교체공사인조잔디', '교체', '년', '환경개선', '등', '보수공사', '부대', '종', '다목적구장', '내', '관급', '환경개선공사', '정비공사인조잔디', '외', '연병장'}
기타 키워드 :  {'안성', '관급자재조달구매풍납족구장', '시민개방형체육시설조성사업인조잔디', '주변정비공사', '한내근린공원', '정비시행', '복구공사', '검단동', '노들나루공원', '포항제철', '활성화사업보조경기장', '율곡고', '물품책장', '생활체육시설', '부여종합운동장', '목포하당초', '죽도', '사회인야구장', '복지센터', '물놀이장', '도산리', '인조잔디배수판', '양주시', '생태공원', '확장공사에', '율면', '개체공사관급인조잔디', '환경개선인조잔디', '지천생태공원', '구매중평어린이공원인조잔디', '보수공사대보수', '발안바이오과학고', '정천면', '확장', '국립김천숲속야영장', '운동장환경개선', '부항면', '인조잔디그라운드골프장', '죽율체육공원', '남부사업소', '호', '조달요청삼사풋살장', '구매양오리', '잔여분', '뱃터공원', '동일공고', '운동부', '구입등구정', '사염공원', '건의이천', '노후인조잔디', '관리도로', '설치용', '미래관', '운동기구', '기장현대차드림볼파크', '원주삼육', '줄포자동차공업', '신안공설운동장', '입장', '관급자재토목인조잔디', '용호

#### 지자체 세부사업별 예산서

In [164]:
# Load today's and the previous business day's budget snapshots from BigQuery
budget_df_today = get_dataframe_from_bigquery('budget', f'budget_df_0{today}', bigquerykey_path)
budget_df_ytday = get_dataframe_from_bigquery('budget', f'budget_df_0{ytday}', bigquerykey_path)

In [165]:
# Project-name preprocessing, applied identically to both snapshots:
# clean the text, drop school-type words, trim.
for _budget_df in (budget_df_today, budget_df_ytday):
    _detail_names = _budget_df['세부사업명'].apply(clean_text)
    for _school_word in ('초등학교', '중학교', '고등학교'):
        _detail_names = _detail_names.str.replace(_school_word, '')
    _budget_df['세부사업명_re'] = _detail_names.str.strip()

In [166]:
# Keep only the local governments whose name matches one of the districts in
# `dist_nm`. na=False treats a missing 자치단체명 as "no match"; without it
# str.contains returns NaN for that row and the boolean mask raises.
filtered_budget_df_today = budget_df_today[
    budget_df_today['자치단체명'].str.contains(dist_nm, na=False)
].reset_index(drop=True)
filtered_budget_df_ytday = budget_df_ytday[
    budget_df_ytday['자치단체명'].str.contains(dist_nm, na=False)
].reset_index(drop=True)

In [167]:
# Filter on the pattern built from all keywords
filtered_all_budget_df_today = filtered_budget_df_today.loc[
    lambda frame: frame['세부사업명_re'].str.contains(mapping_keywd_all_filter_cnt_nm)
]
filtered_all_budget_df_ytday = filtered_budget_df_ytday.loc[
    lambda frame: frame['세부사업명_re'].str.contains(mapping_keywd_all_filter_cnt_nm)
]

In [168]:
# Filter on the pattern built from the first three keywords of each name
filtered3_budget_df_today = filtered_budget_df_today.loc[
    lambda frame: frame['세부사업명_re'].str.contains(mapping_keywd3_filter_cnt_nm)
]
filtered3_budget_df_ytday = filtered_budget_df_ytday.loc[
    lambda frame: frame['세부사업명_re'].str.contains(mapping_keywd3_filter_cnt_nm)
]

In [169]:
# Filter on the pattern built from the first two keywords of each name
filtered2_budget_df_today = filtered_budget_df_today.loc[
    lambda frame: frame['세부사업명_re'].str.contains(mapping_keywd2_filter_cnt_nm)
]
filtered2_budget_df_ytday = filtered_budget_df_ytday.loc[
    lambda frame: frame['세부사업명_re'].str.contains(mapping_keywd2_filter_cnt_nm)
]

In [170]:
# Filter on the pattern built from the first keyword of each name.
# NOTE(review): unlike the other keyword filters this one is negated (rows
# that do NOT match are kept) - confirm that this is intended.
filtered1_budget_df_today = filtered_budget_df_today.loc[
    lambda frame: ~frame['세부사업명_re'].str.contains(mapping_keywd1_filter_cnt_nm)
]
filtered1_budget_df_ytday = filtered_budget_df_ytday.loc[
    lambda frame: ~frame['세부사업명_re'].str.contains(mapping_keywd1_filter_cnt_nm)
]

In [65]:
def _rows_with_shared_codes(frame, reference):
    """Return the rows of `frame` whose 세부사업코드 also occurs in `reference`."""
    return frame[frame['세부사업코드'].isin(reference['세부사업코드'])]


# Narrow step by step: start from the all-keyword matches and keep, in turn,
# the rows of the 3-, 2- and 1-keyword filters whose project code survived
# the previous step.
filtered_budget_df_today_fin = filtered_all_budget_df_today
for _stage_df in (filtered3_budget_df_today, filtered2_budget_df_today, filtered1_budget_df_today):
    filtered_budget_df_today_fin = _rows_with_shared_codes(_stage_df, filtered_budget_df_today_fin)

filtered_budget_df_ytday_fin = filtered_all_budget_df_ytday
for _stage_df in (filtered3_budget_df_ytday, filtered2_budget_df_ytday, filtered1_budget_df_ytday):
    filtered_budget_df_ytday_fin = _rows_with_shared_codes(_stage_df, filtered_budget_df_ytday_fin)

In [70]:
# Show today's budget rows whose project code was NOT kept by the filters
_selected_codes = filtered_budget_df_today_fin['세부사업코드']
budget_df_today.loc[~budget_df_today['세부사업코드'].isin(_selected_codes)]

Unnamed: 0,회계연도,부서코드,지역코드,지역명,자치단체코드,자치단체명,회계구분명,세부사업코드,세부사업명,집행일자,예산현액,지출액,편성액,분야명,부문명,행정자치단체코드,세부사업명_re
0,2024,1089018,1100000,서울,1100000,서울본청,일반회계,61100002020301FA,非OECD국가 학교 지원 육성,20240701,97300000,56889000,97300000,산업ㆍ중소기업및에너지,산업ㆍ중소기업일반,6110000,국가 학교 지원 육성
1,2024,1485010,1100000,서울,1100000,서울본청,소방특별회계,611000020223029E,소방학교 사이버콘텐츠 개발 추진,20240701,130900000,88410000,130900000,공공질서및안전,소방,6110000,소방학교 사이버콘텐츠 개발 추진
2,2024,1079050,1100000,서울,1100000,서울본청,소방특별회계,61100002024300EC,소방학교 실화재 훈련장 건립,20240701,470500000,211500000,470500000,공공질서및안전,소방,6110000,소방학교 실화재 훈련장 건립
3,2024,1016100,1100000,서울,1100000,서울본청,학교용지부담금특별회계,6110000202030354,학교용지 매입비 부담,20240701,1987525000,0,1987525000,교육,유아및초중등교육,6110000,학교용지 매입비 부담
4,2024,1350020,1100000,서울,1100000,서울본청,일반회계,6110000200830744,한강공원 시민이용 홍보,20240701,1227083500,342506676,1217090000,환경,환경보호일반,6110000,한강공원 시민이용 홍보
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3787,2024,3005350,4900000,제주,4900000,제주본청,일반회계,6500000202430051,장기미집행 도시계획시설(평대공원) 조성사업,20240701,180000000,75600000,180000000,국토및지역개발,지역및도시,6500000,장기미집행 도시계획시설평대공원 조성사업
3788,2024,3005350,4900000,제주,4900000,제주본청,일반회계,6500000202430052,장기미집행 도시계획시설(두모공원) 조성사업,20240701,60000000,36101100,60000000,국토및지역개발,지역및도시,6500000,장기미집행 도시계획시설두모공원 조성사업
3789,2024,3005350,4900000,제주,4900000,제주본청,일반회계,6500000202430053,장기미집행 도시계획시설(고산공원) 조성사업,20240701,60000000,29500000,60000000,국토및지역개발,지역및도시,6500000,장기미집행 도시계획시설고산공원 조성사업
3790,2024,4005240,4900000,제주,4900000,제주본청,일반회계,6500000202430080,장기미집행 도시계획시설(새섬공원) 조성사업,20240701,200000000,182100670,200000000,국토및지역개발,지역및도시,6500000,장기미집행 도시계획시설새섬공원 조성사업


In [None]:
# Load both budget snapshots from BigQuery again (fresh frames straight from
# the tables).
budget_df_today = get_dataframe_from_bigquery('budget', f'budget_df_0{today}', bigquerykey_path)
budget_df_ytday = get_dataframe_from_bigquery('budget', f'budget_df_0{ytday}', bigquerykey_path)

In [None]:
# Ended projects: codes present in the previous snapshot but absent today
_ended_codes = set(budget_df_ytday['세부사업코드']).difference(budget_df_today['세부사업코드'])
budget_df_delete = budget_df_ytday.loc[
    budget_df_ytday['세부사업코드'].isin(list(_ended_codes))
].reset_index(drop=True)

In [None]:
# Newly added projects: codes present today but absent from the previous snapshot
_is_known_code = budget_df_today['세부사업코드'].isin(budget_df_ytday['세부사업코드'])
budget_df_new = budget_df_today.loc[~_is_known_code].reset_index(drop=True)

In [None]:
# Upload both diff tables to the 'budget' dataset in BigQuery
for _diff_df, _diff_table in (
    (budget_df_delete, 'budget_df_delete'),
    (budget_df_new, 'budget_df_new'),
):
    save_dataframe_to_bigquery(_diff_df, 'budget', _diff_table, bigquerykey_path)

Data inserted into table budget_df_delete successfully.
Data inserted into table budget_df_new successfully.
