In [1]:
import pandas as pd
import numpy as np

import re
from itertools import chain
from collections import Counter

from google.cloud import bigquery
from google.oauth2 import service_account

from datetime import datetime, timedelta

import requests
import warnings
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# InsecureRequestWarning 경고 무시
warnings.simplefilter('ignore', InsecureRequestWarning)

In [2]:
# Directory that holds the service-account key files
KEY_PATH = ".config/"

# Key used for APIs other than BigQuery
servicekey_path = f"{KEY_PATH}serviceKey.json"
# Key used for BigQuery
bigquerykey_path = f"{KEY_PATH}mido-project-426906-31b49963ac97.json"

# Suppress every warning category
warnings.filterwarnings("ignore")

In [3]:
# Factory for BigQuery clients
def create_bigquery_client(key_path):
    """Return a BigQuery client authenticated with a service-account key file.

    Args:
        key_path: Path to the service-account JSON key file.

    Returns:
        A ``bigquery.Client`` bound to the project id stored in the key file.
    """
    creds = service_account.Credentials.from_service_account_file(key_path)
    return bigquery.Client(project=creds.project_id, credentials=creds)

In [4]:
def save_dataframe_to_bigquery(df, dataset_id, table_id, key_path):
    """Load a DataFrame into a BigQuery table, replacing its current contents.

    Args:
        df: DataFrame to upload.
        dataset_id: Id of the destination dataset (in the client's project).
        table_id: Id of the destination table.
        key_path: Path to the service-account JSON key file.

    Prints a confirmation message once the load job has finished.
    """
    client = create_bigquery_client(key_path)

    # Build the table reference explicitly; `Client.dataset()` is deprecated.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    # WRITE_TRUNCATE: existing rows are removed before the new rows are written.
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE
    )

    job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
    job.result()  # block until the load job completes

    print(f"Data inserted into table {table_id} successfully.")

In [5]:
def get_dataframe_from_bigquery(dataset_id, table_id, key_path):
    """Read every row of a BigQuery table into a DataFrame.

    Args:
        dataset_id: Id of the dataset (in the client's project).
        table_id: Id of the table to read.
        key_path: Path to the service-account JSON key file.

    Returns:
        A DataFrame with the rows returned by ``Client.list_rows``.
    """
    client = create_bigquery_client(key_path)

    # Build the table reference explicitly; `Client.dataset()` is deprecated.
    table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

    return client.list_rows(table_ref).to_dataframe()

In [6]:
# Read the clock once so that `today` and `ytday` are derived from the same
# instant (two separate reads could straddle midnight).
_now = datetime.today()

# Today's date as 'YYYYMMDD'
today = _now.strftime('%Y%m%d')

# Previous day; when it falls on a weekend, roll back to that week's Friday
_prev_day = _now - timedelta(days=1)
_days_past_friday = _prev_day.weekday() - 4  # Saturday -> 1, Sunday -> 2
if _days_past_friday > 0:
    _prev_day -= timedelta(days=_days_past_friday)

# Previous business day as 'YYYYMMDD'
ytday = _prev_day.strftime('%Y%m%d')

#### 종합쇼핑몰 납품상세내역

In [7]:
# Load the shopping-mall delivery detail table from BigQuery
all_shop_df = get_dataframe_from_bigquery(
    dataset_id='g2b',
    table_id='shop_detail_df_all',
    key_path=bigquerykey_path,
)

In [8]:
# Keep only the columns used below and order rows by receipt date, newest first
_shop_columns = [
    '납품요구접수일자', '수요기관명', '납품요구건명', '업체명',
    '단가', '단위', '수량', '금액',
    '수요기관코드', '수요기관구분', '수요기관지역명', '납품요구지청명',
]
all_shop_df_fin = (
    all_shop_df[_shop_columns]
    .sort_values(['납품요구접수일자'], ascending=False)
    .reset_index(drop=True)
)

In [9]:
# Strip special characters, digits and Latin letters, then normalise whitespace
def clean_text(text):
    """Keep only Hangul and whitespace in ``text`` and collapse whitespace runs.

    Args:
        text: Input string. ``None`` and float NaN are treated as missing.

    Returns:
        The cleaned string with every whitespace run reduced to a single space
        and leading/trailing whitespace removed; ``''`` for a missing value.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Raw strings: '\s' in a normal string literal is an invalid escape sequence.
    text = re.sub(r'[^ㄱ-ㅎㅏ-ㅣ가-힣\s]', '', text)
    return re.sub(r'\s+', ' ', text).strip()

In [10]:
# Preprocess the delivery-request titles: clean the text, then drop school-type words
_shop_title = all_shop_df_fin['납품요구건명'].apply(clean_text)
for _school_word in ('초등학교', '중학교', '고등학교'):
    _shop_title = _shop_title.str.replace(_school_word, '', regex=False)

# Removing a standalone school word leaves two adjacent spaces, which would
# yield empty tokens when the title is later split on ' '. Collapse them again.
all_shop_df_fin['납품요구건명_re'] = (
    _shop_title.str.replace(r'\s+', ' ', regex=True).str.strip()
)

In [11]:
# Regex alternation of the second space-separated token of each region name.
# Each distinct name appears once (the pattern previously repeated one
# alternative per row), empty tokens are skipped because an empty alternative
# matches every string, and names are escaped so they are matched literally.
_region_second_token = all_shop_df_fin['수요기관지역명'].str.split(' ').str[1]
dist_nm = '|'.join(
    re.escape(_name) for _name in _region_second_token.dropna().unique() if _name
)

In [12]:
# Keyword statistics over the cleaned delivery-request titles
def _title_keywords(titles, first_n=None):
    """Flatten the space-separated tokens of every title into one list.

    Empty tokens (produced by consecutive spaces) are dropped; in a '|'
    pattern an empty alternative matches every string. When ``first_n`` is
    given, only the first ``first_n`` non-empty tokens of each title are kept.
    """
    return [
        token
        for title in titles
        for token in [part for part in title.split(' ') if part][:first_n]
    ]


def _rare_keyword_pattern(counts):
    """Join the keywords whose count is at or below the mean count with '|'."""
    return '|'.join(counts[counts <= counts.mean()].keys())


_cleaned_titles = all_shop_df_fin['납품요구건명_re']

mapping_keywd_all = _title_keywords(_cleaned_titles)  # every keyword
mapping_keywd_all_filter_cnt = pd.Series(Counter(mapping_keywd_all))

mapping_keywd3 = _title_keywords(_cleaned_titles, first_n=3)  # first 3 keywords per title
mapping_keywd3_filter_cnt = pd.Series(Counter(mapping_keywd3))

mapping_keywd2 = _title_keywords(_cleaned_titles, first_n=2)  # first 2 keywords per title
mapping_keywd2_filter_cnt = pd.Series(Counter(mapping_keywd2))

mapping_keywd1 = _title_keywords(_cleaned_titles, first_n=1)  # first keyword per title
mapping_keywd1_filter_cnt = pd.Series(Counter(mapping_keywd1))

mapping_keywd_all_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd_all_filter_cnt)
mapping_keywd3_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd3_filter_cnt)
mapping_keywd2_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd2_filter_cnt)
mapping_keywd1_filter_cnt_nm = _rare_keyword_pattern(mapping_keywd1_filter_cnt)

In [13]:
# Frequency table of all keywords
mapping_keywd_df = pd.DataFrame(Counter(mapping_keywd_all).items(), columns=['키워드', '빈도수'])
# Keep keywords of two or more characters. The previous `len != 1` test also
# kept the empty string, and an empty alternative in a '|' pattern built from
# this table matches every string.
mapping_keywd_df = mapping_keywd_df[mapping_keywd_df['키워드'].str.len() > 1].reset_index(drop=True)
mapping_keywd_df.sort_values('빈도수', ascending=False)

Unnamed: 0,키워드,빈도수
9,인조잔디,336
56,관급자재,217
18,구입,171
15,관급자재인조잔디,163
11,구매,100
...,...,...
581,건립공사인조잔디,1
582,구입이천시설봉공원히딩크드림필드축구장개보수공사,1
583,효암,1
586,작은섬공원,1


In [72]:
# Print the shopping-mall keywords grouped into frequency tiers
# (each tier is the half-open count range [low, high), most frequent first)
_freq = mapping_keywd_df['빈도수']
_unbounded = float('inf')
for _label, _low, _high in (
    ('1등급 키워드 : ', 50, _unbounded),
    ('2등급 키워드 : ', 40, 50),
    ('3등급 키워드 : ', 30, 40),
    ('4등급 키워드 : ', 20, 30),
    ('5등급 키워드 : ', 10, 20),
    ('기타 키워드 : ', -_unbounded, 10),
):
    _tier = mapping_keywd_df[(_freq < _high) & (_freq >= _low)]
    print(_label, list(_tier.sort_values('빈도수', ascending=False)['키워드']))

1등급 키워드 :  ['인조잔디', '관급자재', '구입', '관급자재인조잔디', '구매', '운동장', '테니스장', '조성공사', '게이트볼장', '공사', '체육시설']
2등급 키워드 :  []
3등급 키워드 :  ['정비공사', '조성사업', '교체공사', '설치', '관급인조잔디', '축구장']
4등급 키워드 :  ['풋살장', '개선공사', '조성', '족구장', '구입인조잔디', '파크골프장']
5등급 키워드 :  ['환경개선공사', '다목적구장', '정비사업', '조성공사인조잔디', '보수공사', '관급', '교체공사인조잔디', '교체', '설치공사', '환경개선', '연병장', '다목적', '부대', '정비공사인조잔디', '체육공원']
기타 키워드 :  ['개보수공사', '시설', '개보수', '조달', '시설개선사업', '신축', '생활체육공원', '개선', '조달구매', '교체사업', '생활체육시설', '정비', '노후시설', '어린이공원', '시설개선공사', '시설개선', '추가', '개소', '따른', '사업', '관련', '관급자재조경인조잔디', '실내게이트볼장', '김영훈', '요청', '리모델링', '증설공사', '체육', '트랙조성', '공공하수처리시설', '당진', '위한', '인조잔디운동장', '설치공사인조잔디', '여단', '비가림시설', '공사인조잔디', '그라운드골프장', '일원', '개선사업', '관급자재인조잔디구입', '학교운동장', '보수', '조성사업인조잔디', '배드민턴장', '물품', '구매인조잔디', '본부', '시행', '하부', '조달구입', '근린공원', '야구장', '구매설치', '교체관급인조잔디', '운동공간', '재조성', '환경개선인조잔디', '트랙보수', '사단', '잔디', '수해복구공사', '본원', '시설물', '옥외', '스포츠파크', '수지아르피아', '확장', '소계체육공원', '토목', '보수인조잔디', '설치인조잔디', '울타리', '사령부', '광복공원', '두호', '회전교차로

In [73]:
# One column per frequency tier, keywords listed from most to least frequent
_kw_freq = mapping_keywd_df['빈도수']
_tier_masks = {
    '1등급': _kw_freq >= 50,
    '2등급': (_kw_freq < 50) & (_kw_freq >= 40),
    '3등급': (_kw_freq < 40) & (_kw_freq >= 30),
    '4등급': (_kw_freq < 30) & (_kw_freq >= 20),
    '5등급': (_kw_freq < 20) & (_kw_freq >= 10),
    '기타': _kw_freq < 10,
}
degree_kwd = pd.DataFrame(
    [
        list(mapping_keywd_df[_mask].sort_values('빈도수', ascending=False)['키워드'])
        for _mask in _tier_masks.values()
    ]
).T
degree_kwd.columns = list(_tier_masks)

In [15]:
# '|'-joined pattern of the keywords that occur fewer than 10 times
_low_freq_rows = mapping_keywd_df['빈도수'] < 10
freq_kwd = '|'.join(list(mapping_keywd_df.loc[_low_freq_rows, '키워드']))

#### 지자체 세부사업별 예산서

In [101]:
# Load the budget snapshots for `today` and `ytday` from BigQuery
budget_df_today = get_dataframe_from_bigquery('budget', f'budget_df_0{today}', bigquerykey_path)
budget_df_ytday = get_dataframe_from_bigquery('budget', f'budget_df_0{ytday}', bigquerykey_path)

In [122]:
# Ended projects: rows of the previous snapshot whose project code or project
# name no longer appears in today's snapshot
_ended_by_code = ~budget_df_ytday['세부사업코드'].isin(budget_df_today['세부사업코드'])
_ended_by_name = ~budget_df_ytday['세부사업명'].isin(budget_df_today['세부사업명'])

budget_df_delete_temp1 = budget_df_ytday[_ended_by_code].reset_index(drop=True)
budget_df_delete_temp2 = budget_df_ytday[_ended_by_name].reset_index(drop=True)

# Combine the two criteria with OR on the source frame. Concatenating the two
# subsets listed every row that met both criteria twice.
budget_df_delete = budget_df_ytday[_ended_by_code | _ended_by_name].reset_index(drop=True)
budget_df_delete

Unnamed: 0,회계연도,지역코드,지역명,자치단체코드,자치단체명,회계구분명,세부사업코드,세부사업명,집행일자,예산현액,지출액,편성액,분야명,부문명,행정자치단체코드,세부사업명_re


In [123]:
# Newly added projects: rows of today's snapshot whose project code or project
# name does not appear in the previous snapshot
_new_by_code = ~budget_df_today['세부사업코드'].isin(budget_df_ytday['세부사업코드'])
_new_by_name = ~budget_df_today['세부사업명'].isin(budget_df_ytday['세부사업명'])

budget_df_new_temp1 = budget_df_today[_new_by_code].reset_index(drop=True)
budget_df_new_temp2 = budget_df_today[_new_by_name].reset_index(drop=True)

# Combine the two criteria with OR on the source frame. Concatenating the two
# subsets listed every row that met both criteria (e.g. a project with a new
# code and a new name) twice.
budget_df_new = budget_df_today[_new_by_code | _new_by_name].reset_index(drop=True)
budget_df_new

Unnamed: 0,회계연도,지역코드,지역명,자치단체코드,자치단체명,회계구분명,세부사업코드,세부사업명,집행일자,예산현액,지출액,편성액,분야명,부문명,행정자치단체코드,세부사업명_re


In [105]:
# Preprocess the project names of today's snapshot: clean the text, drop
# school-type words, trim the ends. (The same preprocessing for
# budget_df_ytday was commented out and is not performed.)
_project_name = budget_df_today['세부사업명'].apply(clean_text)
for _school_word in ('초등학교', '중학교', '고등학교'):
    _project_name = _project_name.str.replace(_school_word, '', regex=False)
budget_df_today['세부사업명_re'] = _project_name.str.strip()

In [106]:
# Keep the rows whose local-government name matches one of the district names
# (the equivalent filter for budget_df_ytday was commented out and is not performed)
_in_target_district = budget_df_today['자치단체명'].str.contains(dist_nm)
filtered_budget_df_today = budget_df_today.loc[_in_target_district].reset_index(drop=True)

In [107]:
# Successively narrow the rows with each keyword pattern, in the order:
# all keywords, first 3, first 2, first 1.
# (The equivalent filtering for budget_df_ytday was commented out and is not performed.)
for _keyword_pattern in (
    mapping_keywd_all_filter_cnt_nm,
    mapping_keywd3_filter_cnt_nm,
    mapping_keywd2_filter_cnt_nm,
    mapping_keywd1_filter_cnt_nm,
):
    _name_matches = filtered_budget_df_today['세부사업명_re'].str.contains(_keyword_pattern)
    filtered_budget_df_today = filtered_budget_df_today[_name_matches]

In [108]:
# Project codes selected by the shopping-mall criteria
# (the equivalent lists for budget_df_ytday were commented out and are not built)

# Codes that survived the split-keyword filtering
_kept_by_split = budget_df_today['세부사업코드'].isin(filtered_budget_df_today['세부사업코드'])
bid_cd_split_today = list(budget_df_today.loc[_kept_by_split, '세부사업코드'])

# Codes whose raw project name matches the low-frequency keyword pattern
_kept_by_freq = budget_df_today['세부사업명'].str.contains(freq_kwd)
bid_cd_freq_today = list(budget_df_today.loc[_kept_by_freq, '세부사업코드'])

In [109]:
# Drop the projects matched by the shopping-mall criteria, then add back those
# of them whose expenditure is 0.
# (The equivalent step for budget_df_ytday was commented out and is not performed.)
_matched_codes = set(bid_cd_split_today + bid_cd_freq_today)
_is_matched = budget_df_today['세부사업코드'].isin(_matched_codes)

budget_df_today_temp1 = budget_df_today[~_is_matched]

budget_df_today_temp2 = budget_df_today[_is_matched]
_no_spend = budget_df_today_temp2['지출액'] == 0
budget_df_today_temp2 = budget_df_today_temp2[_no_spend].reset_index(drop=True)

budget_df_today_fin = pd.concat(
    [budget_df_today_temp1, budget_df_today_temp2], axis=0
).reset_index(drop=True)

In [110]:
# Put the projects whose name contains '인조잔디' first, followed by the rest
# (the equivalent step for budget_df_ytday was commented out and is not performed)
_mentions_turf = budget_df_today_fin['세부사업명'].str.contains('인조잔디')
budget_df_today_final = pd.concat(
    [budget_df_today_fin[_mentions_turf], budget_df_today_fin[~_mentions_turf]],
    axis=0,
).reset_index(drop=True)

In [128]:
# Final datasets: the same column selection for all three frames
_final_columns = [
    '회계연도', '집행일자', '지역명', '자치단체명', '회계구분명', '세부사업명',
    '예산현액', '지출액', '편성액', '분야명', '부문명',
]
budget_df_today_final = budget_df_today_final[_final_columns]
budget_df_delete = budget_df_delete[_final_columns]
budget_df_new = budget_df_new[_final_columns]

In [129]:
# Write the three result frames to the 'budget' dataset in BigQuery
for _table_id, _frame in (
    ('budget_df_listup', budget_df_today_final),
    ('budget_df_delete', budget_df_delete),
    ('budget_df_new', budget_df_new),
):
    save_dataframe_to_bigquery(_frame, 'budget', _table_id, bigquerykey_path)

Data inserted into table budget_df_listup successfully.
