# [국가, 품목, 년도]별로 수출금액, GDP 병합 및 선호도 파생변수 생성

In [37]:
# Standard library.
import os
import pickle
# Third-party: numerics, dataframes, notebook display helper, progress bars.
import numpy as np
import pandas as pd
from IPython.display import display
from tqdm import tqdm
# Register tqdm's pandas integration (adds progress_apply and friends).
tqdm.pandas()

import matplotlib.pyplot as plt
import seaborn as sns
# Select the AppleGothic font family for plots
# (presumably so Korean labels render on macOS - confirm on other platforms).
plt.rc('font', family='AppleGothic')
# Render minus signs with the ASCII hyphen instead of the Unicode minus glyph.
plt.rc('axes', unicode_minus=False)

import warnings
# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas chained-assignment warnings.
warnings.filterwarnings('ignore')

In [38]:
# Load the raw GDP table (CSV) and the preprocessed trade-performance
# DataFrame (pickle).
GDP = pd.read_csv('../data/GDP_ori.csv')

# NOTE: unpickling executes code embedded in the file; only load pickles
# produced by this project.
with open('../processed/performance_2012-2023.pkl', mode='rb') as pkl_file:
    performance = pickle.load(pkl_file)

In [39]:
# Show (rows, columns) of both loaded tables side by side.
GDP.shape, performance.shape

((266, 67), (137441, 9))

In [40]:
# Preview the first rows of each input table.
for preview_frame in (GDP, performance):
    display(preview_frame.head())

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3202235000.0,3368970000.0,2610039000.0,3126019000.0,
1,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,21291520000.0,21809440000.0,23708060000.0,28211280000.0,26119940000.0,29683480000.0,...,983000000000.0,1000000000000.0,923000000000.0,890000000000.0,1030000000000.0,1020000000000.0,1010000000000.0,934000000000.0,1090000000000.0,
2,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,...,20564490000.0,20550580000.0,19998160000.0,18019560000.0,18896350000.0,18418850000.0,18904490000.0,20143440000.0,14786860000.0,
3,Africa Western and Central,AFW,GDP (current US$),NY.GDP.MKTP.CD,10404140000.0,11127890000.0,11943190000.0,12676330000.0,13838370000.0,14862230000.0,...,832000000000.0,892000000000.0,767000000000.0,691000000000.0,684000000000.0,766000000000.0,795000000000.0,785000000000.0,840000000000.0,
4,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,133000000000.0,137000000000.0,87219300000.0,49840490000.0,68972770000.0,77792940000.0,69309110000.0,53619070000.0,67404290000.0,


Unnamed: 0,기간,품목명,품목코드,국가명,수출중량,수입중량,수출금액,수입금액,무역수지
1,2012,살아 있는 동물,1.0,가나,0.0,0.0,0.0,3.0,-3.0
3,2012,육과 식용 설육(屑肉),2.0,가나,1.5,0.0,30.0,0.0,30.0
6,2012,어류ㆍ갑각류ㆍ연체동물과 그 밖의 수생(水生) 무척추동물,3.0,가나,2449.7,1775.9,2334.0,6202.0,-3868.0
10,2012,다른 류로 분류되지 않은 동물성 생산품,5.0,가나,0.0,0.0,0.0,9.0,-9.0
16,2012,커피ㆍ차ㆍ마테(maté)ㆍ향신료,9.0,가나,0.1,0.0,11.0,0.0,11.0


### GDP 2012~2022 데이터 추출

In [41]:
# Preview the GDP table and list its columns (metadata columns followed by
# one column per year).
display(GDP.head())
print(GDP.columns)

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,Aruba,ABW,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,2727933000.0,2791061000.0,2963128000.0,2983799000.0,3092179000.0,3202235000.0,3368970000.0,2610039000.0,3126019000.0,
1,Africa Eastern and Southern,AFE,GDP (current US$),NY.GDP.MKTP.CD,21291520000.0,21809440000.0,23708060000.0,28211280000.0,26119940000.0,29683480000.0,...,983000000000.0,1000000000000.0,923000000000.0,890000000000.0,1030000000000.0,1020000000000.0,1010000000000.0,934000000000.0,1090000000000.0,
2,Afghanistan,AFG,GDP (current US$),NY.GDP.MKTP.CD,537777800.0,548888900.0,546666700.0,751111200.0,800000000.0,1006667000.0,...,20564490000.0,20550580000.0,19998160000.0,18019560000.0,18896350000.0,18418850000.0,18904490000.0,20143440000.0,14786860000.0,
3,Africa Western and Central,AFW,GDP (current US$),NY.GDP.MKTP.CD,10404140000.0,11127890000.0,11943190000.0,12676330000.0,13838370000.0,14862230000.0,...,832000000000.0,892000000000.0,767000000000.0,691000000000.0,684000000000.0,766000000000.0,795000000000.0,785000000000.0,840000000000.0,
4,Angola,AGO,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,133000000000.0,137000000000.0,87219300000.0,49840490000.0,68972770000.0,77792940000.0,69309110000.0,53619070000.0,67404290000.0,


Index(['Country Name', 'Country Code', 'Indicator Name', 'Indicator Code',
       '1960', '1961', '1962', '1963', '1964', '1965', '1966', '1967', '1968',
       '1969', '1970', '1971', '1972', '1973', '1974', '1975', '1976', '1977',
       '1978', '1979', '1980', '1981', '1982', '1983', '1984', '1985', '1986',
       '1987', '1988', '1989', '1990', '1991', '1992', '1993', '1994', '1995',
       '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004',
       '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013',
       '2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021', '2022'],
      dtype='object')


In [42]:
# Reshape the wide GDP table (one column per year) into long form: one row
# per (country, year) with a single 'GDP' column.
# NOTE: despite the `GDP_12_21` name, the range covers 2012 through 2022
# inclusive.
year_list = list(range(2012, 2023))

# Build one long-form frame per year and concatenate once. Only the current
# year's column is selected, and rename/assign return new frames, so nothing
# is written into a slice of GDP.
GDP_12_21 = pd.concat([
    GDP[['Country Name', str(year)]]
    .rename(columns={'Country Name': 'country', str(year): 'GDP'})
    .assign(year=year)[['country', 'year', 'GDP']]
    for year in year_list
])

print('국가 수:', GDP_12_21.country.nunique())
display(GDP_12_21)

국가 수: 266


Unnamed: 0,country,year,GDP
0,Aruba,2012,2.615084e+09
1,Africa Eastern and Southern,2012,9.720000e+11
2,Afghanistan,2012,2.020357e+10
3,Africa Western and Central,2012,7.360000e+11
4,Angola,2012,1.250000e+11
...,...,...,...
261,Kosovo,2022,
262,Yemen,2022,
263,South Africa,2022,
264,Zambia,2022,


In [43]:
# Count missing values per column.
GDP_12_21.isna().sum()

country      0
year         0
GDP        368
dtype: int64

In [44]:
# Show the rows whose GDP is missing.
display(GDP_12_21[GDP_12_21.GDP.isna()])

Unnamed: 0,country,year,GDP
38,Channel Islands,2012,
69,Eritrea,2012,
84,Gibraltar,2012,
110,Not classified,2012,
147,St. Martin (French part),2012,
...,...,...,...
261,Kosovo,2022,
262,Yemen,2022,
263,South Africa,2022,
264,Zambia,2022,


In [45]:
display(GDP[GDP['Country Name']=="Not classified"]) # no GDP values in the raw GDP table either

Unnamed: 0,Country Name,Country Code,Indicator Name,Indicator Code,1960,1961,1962,1963,1964,1965,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
110,Not classified,INX,GDP (current US$),NY.GDP.MKTP.CD,,,,,,,...,,,,,,,,,,


In [46]:
# Remove the 'Not classified' pseudo-country, then re-check missing values,
# row count and number of distinct countries.
is_classified = GDP_12_21['country'] != "Not classified"
GDP_12_21 = GDP_12_21[is_classified]

display(GDP_12_21.isna().sum())
print('데이터 수:', len(GDP_12_21))
print('국가 수:', GDP_12_21.country.nunique())
display(GDP_12_21.head())

country      0
year         0
GDP        357
dtype: int64

데이터 수: 2915
국가 수: 265


Unnamed: 0,country,year,GDP
0,Aruba,2012,2615084000.0
1,Africa Eastern and Southern,2012,972000000000.0
2,Afghanistan,2012,20203570000.0
3,Africa Western and Central,2012,736000000000.0
4,Angola,2012,125000000000.0


### GDP_12_21, performance 데이터프레임에 국가명 국문, 영문 혼용 => 영문으로 통합하기

In [47]:
# Number of distinct countries in each table (Korean names vs. English names).
performance.국가명.nunique(), GDP_12_21.country.nunique() 

(253, 265)

In [48]:
# Korean -> English country-name lookup table; the saved index column is
# discarded.
country_name = pd.read_excel('../data/country_new.xlsx')
country_name = country_name.drop(columns='Unnamed: 0')
country_name

Unnamed: 0,country_kr,country_en
0,세이쉘,Seychelles
1,세르비아,Serbia
2,세인트 루시아,Saint Lucia
3,세인트 키츠 네비스,Saint Kitts and Nevis
4,세인트 빈센트 그레나딘,Saint Vincent and the Grenadines
...,...,...
305,뉴칼레도니아,New Caledonia
306,북마리아나제도,Northern Mariana Islands
307,파푸아뉴기니,Papua New Guinea
308,솔로몬제도,Solomon Islands


In [49]:
# Mapping from Korean country name to English country name
# (if a Korean name repeats, the last occurrence wins).
cntry_kr_en = dict(zip(country_name['country_kr'], country_name['country_en']))

In [50]:
# Translate the Korean country names to English; names absent from the
# mapping become NaN.
english_names = performance['국가명'].map(cntry_kr_en)
performance['country'] = english_names
performance.head()

Unnamed: 0,기간,품목명,품목코드,국가명,수출중량,수입중량,수출금액,수입금액,무역수지,country
1,2012,살아 있는 동물,1.0,가나,0.0,0.0,0.0,3.0,-3.0,Ghana
3,2012,육과 식용 설육(屑肉),2.0,가나,1.5,0.0,30.0,0.0,30.0,Ghana
6,2012,어류ㆍ갑각류ㆍ연체동물과 그 밖의 수생(水生) 무척추동물,3.0,가나,2449.7,1775.9,2334.0,6202.0,-3868.0,Ghana
10,2012,다른 류로 분류되지 않은 동물성 생산품,5.0,가나,0.0,0.0,0.0,9.0,-9.0,Ghana
16,2012,커피ㆍ차ㆍ마테(maté)ㆍ향신료,9.0,가나,0.1,0.0,11.0,0.0,11.0,Ghana


In [51]:
# Verify that every Korean country name was mapped (no NaN in 'country').
performance.isna().sum()

기간         0
품목명        0
품목코드       0
국가명        0
수출중량       0
수입중량       0
수출금액       0
수입금액       0
무역수지       0
country    0
dtype: int64

### performance와 GDP_12_21 병합 및 선호도 계산
1. 원본 GDP에 없는 GDP 채우기 (GDP 찾아도 안나오고 수출액도 0인 경우 삭제)

2. 금액 단위 통일 필요
- performance '수출금액' 단위 : (USD 1,000)
- GDP_12_21 'GDP' 단위 : USD 

3. 선호도 = (특정 품목을 해당 국가에 수출한 금액) / (해당 국가의 GDP)

In [52]:
# Peek at both tables before merging them.
for preview_frame in (performance, GDP_12_21):
    display(preview_frame.head(2))

Unnamed: 0,기간,품목명,품목코드,국가명,수출중량,수입중량,수출금액,수입금액,무역수지,country
1,2012,살아 있는 동물,1.0,가나,0.0,0.0,0.0,3.0,-3.0,Ghana
3,2012,육과 식용 설육(屑肉),2.0,가나,1.5,0.0,30.0,0.0,30.0,Ghana


Unnamed: 0,country,year,GDP
0,Aruba,2012,2615084000.0
1,Africa Eastern and Southern,2012,972000000000.0


In [53]:
# Keep only the export amount (imports are dropped) together with the
# year / country / item-code keys.
performance.rename(columns={'기간':'year'}, inplace=True)
export_columns = ['year','country','품목코드','수출금액']
export = performance.loc[:, export_columns].copy()
export.head(3)

Unnamed: 0,year,country,품목코드,수출금액
1,2012,Ghana,1.0,0.0
3,2012,Ghana,2.0,30.0
6,2012,Ghana,3.0,2334.0


In [54]:
# Compare the row count before and after removing duplicate rows.
export.shape, export.drop_duplicates().shape

((137441, 4), (134823, 4))

In [55]:
# De-duplicate the export rows, then attach GDP by (country, year).
# A left join keeps every export row even when no GDP is available.
export = export.drop_duplicates()
export_GDP = export.merge(GDP_12_21, on=['country','year'], how='left')

print(export.shape, export_GDP.shape)
display(export_GDP.head())

(134823, 4) (134823, 5)


Unnamed: 0,year,country,품목코드,수출금액,GDP
0,2012,Ghana,1.0,0.0,41270950000.0
1,2012,Ghana,2.0,30.0,41270950000.0
2,2012,Ghana,3.0,2334.0,41270950000.0
3,2012,Ghana,5.0,0.0,41270950000.0
4,2012,Ghana,9.0,11.0,41270950000.0


In [56]:
# Confirm the merge did not introduce duplicate rows.
export_GDP.shape, export_GDP.drop_duplicates().shape

((134823, 5), (134823, 5))

In [57]:
# Count missing values per column after the merge.
export_GDP.isna().sum()

year           0
country        0
품목코드           0
수출금액           0
GDP        18522
dtype: int64

In [58]:
# Load the manually collected GDP values used to fill gaps in the merged
# table.
fill_GDP = pd.read_excel('../data/gdp채우기.xlsx')
print(fill_GDP.shape)
display(fill_GDP.head())

# NOTE(review): in the preview, Somalia 2012 is recorded as '1.306 billion'
# in 'GDP 크롤링' but as 1306000000000.0 in 'GDP' (1000x larger) - verify the
# spreadsheet before trusting the derived preference values.

# Overwrite GDP for each (country, year) pair listed in fill_GDP. Iterating
# over the column values directly avoids positional lookups that depend on
# fill_GDP having a default 0..n-1 index.
for fill_country, fill_year, fill_value in zip(
    fill_GDP['country'], fill_GDP['year'], fill_GDP['GDP']
):
    row = (export_GDP['country'] == fill_country) & (export_GDP['year'] == fill_year)
    export_GDP.loc[row, 'GDP'] = fill_value
export_GDP

(44, 4)


Unnamed: 0,year,country,GDP 크롤링,GDP
0,2012,Somalia,1.306 billion,1306000000000.0
1,2012,Guiana,40.63억,4063000000.0
2,2012,Taiwan,"495,610 million",495610000000.0
3,2013,Guiana,41.68억,4168000000.0
4,2013,Taiwan,"512,943 million",512943000000.0


Unnamed: 0,year,country,품목코드,수출금액,GDP
0,2012,Ghana,1.0,0.0,4.127095e+10
1,2012,Ghana,2.0,30.0,4.127095e+10
2,2012,Ghana,3.0,2334.0,4.127095e+10
3,2012,Ghana,5.0,0.0,4.127095e+10
4,2012,Ghana,9.0,11.0,4.127095e+10
...,...,...,...,...,...
134818,2022,Hong Kong,94.0,10403.0,
134819,2022,Hong Kong,95.0,25709.0,
134820,2022,Hong Kong,96.0,9432.0,
134821,2022,Hong Kong,97.0,42265.0,


In [59]:
# Count missing values again after filling GDP from the supplementary file.
export_GDP.isna().sum()

year           0
country        0
품목코드           0
수출금액           0
GDP        16576
dtype: int64

In [60]:
# Countries/territories for which no GDP figure could be found (their export
# amounts are also often 0). Every row of these countries is removed,
# regardless of year.
drop_country = [
    'Saint Helena', 'Saint Martin', 'British Virgin Islands',
    'British Indian Ocean Territory', 'Anguilla',
    'Heard Island and McDonald Islands', 'Dutch Antilles',
    'South Georgia and the South Sandwich Islands',
    'Norfolk Island', 'Antarctica', 'Vatican City', 'Gunji',
    'International Monetary Fund (IMF)',
    'Falkland Islands', 'Gibraltar', 'Kos Island', 'Mayotte', 'Tokelau',
    'Christmas Island', 'Other countries',
    'Martinique', 'Cook Islands', 'Svalbard and Jan Mayen', 'Western Sahara',
    'Åland Islands',
    'Wallis and Futuna', 'Pitcairn Islands', 'Niue', 'Jersey',
    'Minor Outlying Islands of the United States', 'Montserrat',
    'State of Palestine', 'Eritrea', 'Saint Barthélemy', 'Bouvet Island',
    'Saint Martin (French part)', 'Isle of Man',
    'Bonaire, Sint Eustatius and Saba', 'French Southern Territories',
    'Northern Mariana Islands', 'Syria', 'Saint Pierre and Miquelon',
    'Overseas territories', 'Virgin Islands', 'Guadeloupe', 'Reunion', 'Zaire',
]

print('삭제 전:', export_GDP.shape)
# .copy() makes the result an independent frame, so the column assignments
# performed in later cells do not write into a slice of the old one.
export_GDP = export_GDP[~export_GDP['country'].isin(drop_country)].copy()
print('삭제 후:', export_GDP.shape)

삭제 전: (134823, 5)
삭제 후: (129308, 5)


In [61]:
# Count missing values after removing the countries without GDP.
export_GDP.isna().sum()

year           0
country        0
품목코드           0
수출금액           0
GDP        12093
dtype: int64

In [62]:
# Check which years still have missing GDP.
export_GDP[export_GDP['GDP'].isna()]['year'].unique()  # GDP for 2022 is inevitably missing

array([2022])

In [63]:
# Express GDP in units of USD 1,000 so it matches the export amount column.
# NOTE: re-running this cell divides by 1000 again.
gdp_in_thousands = export_GDP['GDP'].div(1000)
export_GDP['GDP'] = gdp_in_thousands
display(export_GDP.head(2))

Unnamed: 0,year,country,품목코드,수출금액,GDP
0,2012,Ghana,1.0,0.0,41270950.0
1,2012,Ghana,2.0,30.0,41270950.0


In [64]:
# Preference = (export amount of an item to a country) / (that country's GDP).
# Rows without GDP yield NaN.
export_amount = export_GDP['수출금액']
export_GDP['preference'] = export_amount.div(export_GDP['GDP'])
export_GDP.head()

Unnamed: 0,year,country,품목코드,수출금액,GDP,preference
0,2012,Ghana,1.0,0.0,41270950.0,0.0
1,2012,Ghana,2.0,30.0,41270950.0,7.269035e-07
2,2012,Ghana,3.0,2334.0,41270950.0,5.655309e-05
3,2012,Ghana,5.0,0.0,41270950.0,0.0
4,2012,Ghana,9.0,11.0,41270950.0,2.665313e-07


In [65]:
# Summary statistics of the preference ratio.
export_GDP.preference.describe() 

count    1.172150e+05
mean     2.601226e-03
std      2.780276e-01
min      0.000000e+00
25%      3.726099e-08
50%      1.452810e-06
75%      1.772646e-05
max      4.346866e+01
Name: preference, dtype: float64

In [66]:
# Summary statistics of every numeric column.
export_GDP.describe()

Unnamed: 0,year,품목코드,수출금액,GDP,preference
count,129308.0,129308.0,129308.0,117215.0,117215.0
mean,2017.084094,50.854719,48292.28,633376000.0,0.002601226
std,3.154192,27.332365,744850.2,2171048000.0,0.2780276
min,2012.0,1.0,0.0,36811.66,0.0
25%,2014.0,28.0,2.0,18418850.0,3.726099e-08
50%,2017.0,51.0,104.0,77625490.0,1.45281e-06
75%,2020.0,73.0,1986.0,374000000.0,1.772646e-05
max,2022.0,99.0,72445000.0,23300000000.0,43.46866


In [67]:
export_GDP.shape, export_GDP.drop_duplicates().shape  # no duplicate rows

((129308, 6), (129308, 6))

In [68]:
# Order rows by year, country and item code, then renumber the index from 0.
sort_keys = ['year','country','품목코드']
export_GDP = export_GDP.sort_values(by=sort_keys)
export_GDP = export_GDP.reset_index(drop=True)

In [69]:
# Persist the final table as a pickle, using the highest available protocol.
with open('../processed/export_GDP.pkl', mode='wb') as out_file:
    pickle.dump(export_GDP, out_file, protocol=pickle.HIGHEST_PROTOCOL)