# 학업탄력성 영향요인 연구
@author: sjh

- 전체 데이터 로드 및 가공

## 1. Load

In [1]:
import os
import copy
import json
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from sys import platform

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import matplotlib.font_manager as fm
import seaborn as sns

# Apply the seaborn style FIRST. The style dictionary seaborn installs contains
# 'font.family' / 'font.sans-serif', so calling set_style after choosing a font
# silently resets the font and Korean labels render as empty boxes.
sns.set_style("darkgrid")

# Draw the minus sign as an ASCII hyphen instead of the Unicode minus glyph,
# which many Korean fonts lack (prevents broken minus signs on the axes).
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["figure.autolayout"] = True

# Names of every font matplotlib has registered.
font_list = [font.name for font in fm.fontManager.ttflist]

# Use the first installed Korean-capable font; fall back to the notebook's
# original choice ('Malgun Gothic') when none of the candidates is registered.
korean_font = next(
    (name for name in ('Malgun Gothic', 'AppleGothic', 'NanumGothic') if name in font_list),
    'Malgun Gothic',
)
plt.rcParams['font.family'] = korean_font


# Every relative path used below (data/, rs/) is resolved against the
# directory the notebook was started from.
BASE_DIR = os.getcwd()
print('>> Current OS: ', platform)
print('>> Current WD: ', BASE_DIR)

>> Current OS:  win32
>> Current WD:  c:\Users\jhun1\Dev\Research\MixedRF


## 2. Preprocessing
- 앞선 처리 데이터 불러오기
- Load.py로 처리함

In [4]:
def _load_cleaned(nation, sheets=('stu', 'sch', 'tch')):
    """Read the cleaned workbook of one nation and return its sheets as a list.

    The workbook ``data/cleanedData(<nation>).xlsx`` is opened once and all
    requested sheets are parsed in that single pass (the previous version
    re-opened and re-parsed the same file for every sheet).

    Args:
        nation: Nation code used in the file name, e.g. ``'SK'`` or ``'US'``.
        sheets: Sheet names to load; the returned list follows this order.

    Returns:
        list of DataFrames, one per entry of ``sheets``.
    """
    workbook = os.path.join(BASE_DIR, 'data', f'cleanedData({nation}).xlsx')
    # A list for sheet_name makes read_excel return {sheet name: DataFrame}.
    frames = pd.read_excel(workbook, sheet_name=list(sheets))
    return [frames[name] for name in sheets]


# {nation: [student frame, school frame, teacher frame]}
loadedData = {nation: _load_cleaned(nation) for nation in ('SK', 'US')}

In [7]:
class Preprocessing:
    """Turn the cleaned student/school frames into a labelled analysis table.

    The stages are meant to be called in this order; each one reads the
    attribute written by the previous stage:

        noDataColumn        -> self.valid_data
        Dummy               -> self._1_dummy
        Join                -> self._2_joined
        DropStudent         -> self._3_dropNa (+ self.rs_1_column* summaries)
        ESCS                -> self._4_ESCS   ({'full': ..., 'sliced': ...})
        shouldBeCalculated  -> self._5_shouldBeCal
        AdjustMinor         -> self.finalRS
        Save                -> Excel file under <BASE_DIR>/rs

    ``LoadedData`` is expected to look like
    ``{'SK': [student_df, school_df, teacher_df], 'US': [...]}``.
    Several stages address the nations ``'SK'`` and ``'US'`` by name.
    """

    # Default location and sheet of the codebook workbook.
    _CODEBOOK_DIR = os.path.join(
        r'C:\Users\jhun1\Dropbox\[2]Project\[혼합효과 랜덤포레스트_2022]',
        'drive-download-20220816T053902Z-001',
    )
    _CODEBOOK_SHEET = '변수선택(1213)'

    # Values of the codebook's 'file name' column that mark derived variables.
    # The misspelled form is the one the original filters used; the correctly
    # spelled form is accepted too so the guard and the filters always agree.
    _SHOULD_BE_CAL_LABELS = ('should be caculated', 'should be calculated')

    def __init__(self, LoadedData, codeBook, dummyCodeBook, PV_var,
                 codeBookDir=None, sheetName=None):
        """Store the inputs and load the two codebooks.

        Args:
            LoadedData: ``{nation: [student_df, school_df, teacher_df]}``.
            codeBook: Either an already loaded codebook ``DataFrame`` or the
                file name of the codebook workbook. The visible code reads the
                columns 'NAME', 'categories' and 'file name' from it.
            dummyCodeBook: Either a dict or the path of a UTF-8 JSON file. It
                is used as ``dummyCB['stu'][variable][raw value] -> code``.
            PV_var: Plausible-value number (1-10); validated in ``ESCS``.
            codeBookDir: Directory holding the workbook. Defaults to the
                original hard-coded project folder.
            sheetName: Sheet to read. Defaults to the original sheet.
        """
        self.data = LoadedData

        if isinstance(codeBook, pd.DataFrame):
            self.cb = codeBook
        else:
            directory = self._CODEBOOK_DIR if codeBookDir is None else codeBookDir
            sheet = self._CODEBOOK_SHEET if sheetName is None else sheetName
            self.cb = pd.read_excel(os.path.join(directory, codeBook), sheet_name=sheet)

        self.PV_var = PV_var

        if isinstance(dummyCodeBook, dict):
            self.dummyCB = dummyCodeBook
        else:
            with open(dummyCodeBook, encoding='utf-8') as json_file:
                self.dummyCB = json.load(json_file)

        # Named sets of plausible-value columns (not used by the stages below).
        mathPVs = [f'PV{i}MATH' for i in range(1, 11)]
        readPVs = [f'PV{i}READ' for i in range(1, 11)]
        self.testBook = {
            'read/math/sci_1': ['PV1MATH', 'PV1READ', 'PV1SCIE'],
            'read/math_1': ['PV1MATH', 'PV1READ'],
            'read/math_10': mathPVs + readPVs,
            'read_1': ['PV1READ'],
            'read_10': list(readPVs),
        }
        self.nation_real_name = {'SK': '대한민국', 'US': '미국'}

        # One container per pipeline stage.
        self.valid_data = {}
        self._1_dummy = {}
        self._2_joined = {}
        self._3_dropNa = {}
        self._4_ESCS = {'full': {}, 'sliced': {}}  # the data set splits in two here
        self._5_shouldBeCal = {}
        self.finalRS = {}

        # Column-wise summaries produced by DropStudent.
        self.rs_1_columnFull = pd.DataFrame()
        self.rs_1_columnSK = pd.DataFrame()
        self.rs_1_columnUS = pd.DataFrame()

    def noDataColumn(self, dropColumns=('PERSPECT',)):
        """Flag unusable columns and build ``self.valid_data``.

        A column of the student or school frame is flagged when more than 80%
        of its values are NA or when it contains the literal value 'missing'.
        The teacher frame is not inspected.

        If nothing is flagged, ``valid_data`` is the loaded data itself.
        Otherwise ``dropColumns`` is removed from every student frame.

        NOTE(review): as in the original code, the flagged columns are only
        reported; the columns actually removed are ``dropColumns``. Confirm
        this is still the intended selection.

        Args:
            dropColumns: Student columns removed when any column is flagged.

        Returns:
            ``{nation: [flagged column, ...]}``.

        Raises:
            KeyError: if a ``dropColumns`` entry is missing from a student frame.
        """
        toDrop = {}
        for nationName, nationalData in self.data.items():
            flagged = []
            for label, inputDf in zip('stu sch tch'.split(), nationalData):
                if label == 'tch':
                    continue
                mostlyNaLimit = inputDf.shape[0] * 0.8
                for column in inputDf.columns:
                    series = inputDf[column]
                    # isin() is safe for every dtype, unlike `'missing' in values`.
                    if series.isna().sum() > mostlyNaLimit or series.isin(['missing']).any():
                        flagged.append(column)
            toDrop[nationName] = flagged

        if not any(toDrop.values()):
            self.valid_data = self.data
            return toDrop

        removed = list(dropColumns)
        # Build a fresh dict: valid_data may alias self.data from an earlier call.
        self.valid_data = {
            nation: [
                frame.drop(columns=removed) if position == 0 else frame
                for position, frame in enumerate(groups)
            ]
            for nation, groups in self.data.items()
        }
        return toDrop

    def Dummy(self, doDummy):
        """Recode categorical student variables and fill ``self._1_dummy``.

        With ``doDummy`` false the validated data is deep-copied unchanged.
        With ``doDummy`` true, every codebook variable that is not an
        identifier, not a 'resilient status' variable and not a derived
        variable, whose 'file name' contains 'STU' and which has an entry in
        ``dummyCB['stu']``, is mapped through that entry. Values without a
        mapping become NaN. School and teacher frames are left untouched.

        Args:
            doDummy: True/False switch.

        Raises:
            TypeError: if ``doDummy`` is neither true nor false.
        """
        if doDummy not in (True, False):
            raise TypeError('>> Error: check option Type')

        if not doDummy:
            self._1_dummy = copy.deepcopy(self.valid_data)
            return

        excluded = (
            self.cb['categories'].isin(['identifier', 'resilient status'])
            | self.cb['file name'].isin(self._SHOULD_BE_CAL_LABELS)
        )
        toDummy = self.cb[~excluded]  # codebook rows that may be recoded
        studentBook = self.dummyCB['stu']

        for nationalName, inputNational in self.valid_data.items():
            outputNational = copy.deepcopy(inputNational)
            student = outputNational[0]

            for _, row in toDummy.iterrows():
                variable = row['NAME']
                fileName = row['file name']
                # A non-string 'file name' marks a variable excluded from the analysis.
                if not isinstance(fileName, str) or 'STU' not in fileName:
                    continue
                # Skip variables with no mapping, or removed from the data earlier.
                if variable not in studentBook or variable not in student.columns:
                    continue
                mapping = studentBook[variable]
                student[variable] = [mapping.get(value, np.nan) for value in student[variable]]

            self._1_dummy[nationalName] = outputNational

    def Join(self):
        """Attach each student's school variables and fill ``self._2_joined``.

        The school frame loses 'CNTRYID' and 'CNT', is indexed by 'CNTSCHID',
        and its remaining columns are copied onto every student row with the
        same 'CNTSCHID'. Any number of school columns is supported. The
        stage-1 frames are not modified, so the method can be called again.
        The lookup is vectorised, so no progress bar is shown.

        Raises:
            KeyError: if a student's 'CNTSCHID' is absent from the school frame.
            ValueError: if a school id occurs more than once.
        """
        for nationalName, inputNational in self._1_dummy.items():
            # reset_index returns a new frame; the stage-1 data stays untouched.
            outputDf = inputNational[0].reset_index(drop=True)
            before = outputDf.shape

            school = inputNational[1].drop(columns=['CNTRYID', 'CNT'], errors='ignore')
            if school.index.name != 'CNTSCHID':
                school = school.set_index('CNTSCHID', drop=True)

            if school.shape[1] > 0:
                if not school.index.is_unique:
                    raise ValueError(f'>> Error: duplicated CNTSCHID in school data ({nationalName})')

                # One school row per student, in student order.
                matched = school.loc[outputDf['CNTSCHID'].to_numpy()]
                for column in school.columns:
                    outputDf[column] = matched[column].to_numpy()

                after = outputDf.shape
                print('>>>> Bef: ', before, '....', 'Aft: ', after)

            self._2_joined[nationalName] = outputDf

    def DropStudent(self, isVisualize=True, maxNaRatio=30):
        """Summarise missingness and drop students with too many NAs.

        Writes ``describe()``-based summaries with an added 'NA_ratio' column
        to ``rs_1_columnFull`` / ``rs_1_columnSK`` / ``rs_1_columnUS`` and
        stores in ``self._3_dropNa`` the students whose share of missing
        variables is at most ``maxNaRatio`` percent.

        Args:
            isVisualize: When true, plot the histograms of the row-wise NA
                ratios and save them as ``<BASE_DIR>/data/NA_ratio.jpg``.
            maxNaRatio: Highest tolerated NA percentage per student. The
                ratio is rounded to a whole percent before the comparison.
        """
        def column_wise(inputData):
            """Describe the columns of one frame, or of SK and US stacked."""
            if isinstance(inputData, dict):
                merged = pd.concat([inputData['SK'], inputData['US']])
            elif isinstance(inputData, pd.DataFrame):
                merged = inputData
            else:
                raise TypeError('>> Error: Check your input D type')

            describeDF = merged.describe().T
            describeDF['NA_ratio'] = (100 - describeDF['count'] / merged.shape[0] * 100).round(2)

            # Put NA_ratio right after 'count'; the other statistics keep their order.
            newColumnOrder = [describeDF.columns[0], 'NA_ratio'] + list(describeDF.columns[1:-1])
            return describeDF[newColumnOrder]

        def naRatio(frame):
            """Percentage of missing variables per row, rounded to whole percent."""
            return (frame.isnull().sum(axis=1) / frame.shape[1] * 100).round(0).tolist()

        def row_wise(inputData):
            """Compute the per-student NA ratios for 'full', 'SK' and 'US'."""
            merged = pd.concat([inputData['SK'], inputData['US']])
            # Unlike column_wise, each nation is also measured against its own columns.
            for_histogram = {
                'full': naRatio(merged),
                'SK': naRatio(inputData['SK']),
                'US': naRatio(inputData['US']),
            }

            if isVisualize:
                panels = [
                    ('full', '\n전체 데이터\n'),
                    ('SK', '\nSouth Korea\n'),
                    ('US', '\nUnited States\n'),
                ]
                plt.figure(figsize=(17, 6))
                for position, (label, title) in enumerate(panels, start=1):
                    plt.subplot(1, 3, position)
                    plt.hist(for_histogram[label])
                    plt.title(title)
                    plt.xlabel('\n전체 변수 대비 결측비율(%)\n')
                    plt.ylabel('빈도')

                plt.savefig(os.path.join(BASE_DIR, 'data', 'NA_ratio.jpg'))
                plt.show()

            return for_histogram

        def dropOver(inputData, rowWiseResult):
            """Return a copy of ``inputData`` without the rows above the limit."""
            if not isinstance(rowWiseResult, list):
                raise TypeError('>> Error: rowWiseResult must be a list')
            if len(rowWiseResult) != inputData.shape[0]:
                raise ValueError('>> Error: one NA ratio per row is required')

            exceeds = np.asarray(rowWiseResult, dtype=float) > maxNaRatio
            return inputData.loc[~exceeds].copy()

        self.rs_1_columnFull = column_wise(self._2_joined)
        self.rs_1_columnSK = column_wise(self._2_joined['SK'])
        self.rs_1_columnUS = column_wise(self._2_joined['US'])

        rowWiseNA = row_wise(self._2_joined)
        self._3_dropNa['SK'] = dropOver(self._2_joined['SK'], rowWiseResult=rowWiseNA['SK'])
        self._3_dropNa['US'] = dropOver(self._2_joined['US'], rowWiseResult=rowWiseNA['US'])

    def ESCS(self, threshold, isVisualize=True):
        """Split the data by ESCS and label resilient students.

        Per nation, 'AcademicScore' is set to the column ``PV<PV_var>READ``
        and the ESCS cut-off to the 25% quantile of 'ESCS'. Two data sets are
        stored in ``self._4_ESCS``:

        * 'full'   - every student; 'resilient' is 1 when the score is above
          ``threshold`` AND 'ESCS' is below the cut-off.
        * 'sliced' - only students whose 'ESCS' is below the cut-off;
          'resilient' is 1 when the score is above ``threshold``.

        'AcademicScore' is also added in place to the frames in
        ``self._3_dropNa``.

        Args:
            threshold: Reading-score cut-off (int or float). The notebook
                notes Lv4: 553, Lv3: 480, Lv2: 407.
            isVisualize: When true, plot and save the score/ESCS histograms
                under ``<BASE_DIR>/rs``.

        Returns:
            ``{nation: [resilient count, resilient percentage]}`` for the
            'sliced' data set.

        Raises:
            TypeError: if ``PV_var`` is not an integer or ``threshold`` is
                not a number.
            ValueError: if ``PV_var`` is outside 1-10.
        """
        print('\n>>>> 4. Slicing data by ESCS')

        def thresholdCalculator(inputData, PV_var, threshold):
            """Add 'AcademicScore' and return the cut-offs used for labelling."""
            if isinstance(PV_var, bool) or not isinstance(PV_var, (int, np.integer)):
                raise TypeError(f'>> Error__PV_var: expected an integer, got {PV_var!r}')
            if not 0 < PV_var < 11:
                raise ValueError(f'>> Error__PV_var: expected 1-10, got {PV_var!r}')
            if isinstance(threshold, bool) or not isinstance(
                    threshold, (int, float, np.integer, np.floating)):
                raise TypeError(f'Insert validate type args : {threshold!r}')

            targetColumn = ['PV' + str(PV_var) + 'READ']
            threshold_dict = {}
            for nationalName, inputNational in inputData.items():
                inputNational['AcademicScore'] = inputNational.loc[:, targetColumn].mean(axis=1)
                threshold_dict[nationalName] = {
                    'academic_score': threshold,
                    'escs_score': inputNational['ESCS'].quantile(0.25),
                }

            # A column was added, so callers must use the returned data.
            return threshold_dict, inputData

        def escsSlice(inputDict, escsThreshold):
            """Keep only the students strictly below the ESCS cut-off."""
            if not isinstance(inputDict, dict) or not isinstance(escsThreshold, dict):
                raise TypeError('>> Error: must input Dict')
            return {
                nationalName: inputNational[
                    inputNational['ESCS'] < escsThreshold[nationalName]['escs_score']
                ].copy()
                for nationalName, inputNational in inputDict.items()
            }

        def quantileCalculator(inputData, option, AcademicThreshold):
            """Add the 0/1 'resilient' column and count the resilient students.

            ``option`` is 'full' (score and ESCS criteria) or 'sliced' (score
            criterion only; the ESCS filter was already applied).
            """
            if not isinstance(AcademicThreshold, dict):
                raise TypeError('>> Error: AcademicThreshold must be a dict')
            if option not in ('full', 'sliced'):
                raise ValueError(f">> Error: option must be 'full' or 'sliced', got {option!r}")

            output = {}
            count_ratio = {}
            for nationalName, inputNational in inputData.items():
                cutoffs = AcademicThreshold[nationalName]
                isResilient = inputNational['AcademicScore'] > cutoffs['academic_score']
                if option == 'full':
                    isResilient = isResilient & (inputNational['ESCS'] < cutoffs['escs_score'])

                inputNational['resilient'] = isResilient.astype('int64')

                total = inputNational.shape[0]
                resilientCount = int(isResilient.sum())
                # An empty slice has no resilient students; avoid dividing by zero.
                resilientRatio = round(resilientCount / total * 100, 2) if total else 0.0

                output[nationalName] = inputNational
                count_ratio[nationalName] = [resilientCount, resilientRatio]

            return output, count_ratio

        def visualize(inputData, option, figName, AcademicThreshold):
            """Plot score and ESCS histograms per nation with the cut-off lines."""
            plt.figure(figsize=(17, 9))
            for IDX, (nationalName, inputNational) in enumerate(inputData.items()):
                cutoffs = AcademicThreshold[nationalName]

                plt.subplot(2, 2, 2 * IDX + 1)
                plt.hist(inputNational['AcademicScore'])
                plt.title(f'\n학업성취{self.nation_real_name[nationalName]}\n')
                plt.xlabel('\n점수\n')
                plt.axvline(cutoffs['academic_score'], color='r', linewidth=1, linestyle='--')

                plt.subplot(2, 2, 2 * IDX + 2)
                plt.hist(inputNational['ESCS'])
                plt.title(f'\n사회문화경제{self.nation_real_name[nationalName]}\n')
                plt.xlabel('\n점수\n')
                # The sliced data already sits entirely below the ESCS cut-off.
                if option == 'full':
                    plt.axvline(cutoffs['escs_score'], color='r', linewidth=1, linestyle='--')

            figureDir = os.path.join(BASE_DIR, 'rs')
            os.makedirs(figureDir, exist_ok=True)
            plt.savefig(os.path.join(figureDir, f'{figName}_{option}.jpg'))
            plt.show()

        # 1. cut-off values (change the arguments here to alter the score coding)
        AcademicThreshold, newData_dict = thresholdCalculator(
            self._3_dropNa, PV_var=self.PV_var, threshold=threshold)

        # 2. split: 'full' keeps every student, 'sliced' only the low-ESCS ones
        self._4_ESCS['full'] = copy.deepcopy(newData_dict)
        self._4_ESCS['sliced'] = escsSlice(newData_dict, escsThreshold=AcademicThreshold)

        # 3. label the resilient students
        self._4_ESCS['full'], _ = quantileCalculator(
            inputData=self._4_ESCS['full'], option='full',
            AcademicThreshold=AcademicThreshold)
        self._4_ESCS['sliced'], resilientCount_Ratio = quantileCalculator(
            inputData=self._4_ESCS['sliced'], option='sliced',
            AcademicThreshold=AcademicThreshold)

        # 4. optional plots
        if isVisualize:
            visualize(self._4_ESCS['full'], option='full', figName='읽10',
                      AcademicThreshold=AcademicThreshold)
            visualize(self._4_ESCS['sliced'], option='sliced', figName='읽10(target paper)',
                      AcademicThreshold=AcademicThreshold)

        return resilientCount_Ratio

    def shouldBeCalculated(self):
        """Add the derived school-level variables and fill ``self._5_shouldBeCal``.

        Codebook rows whose 'file name' is one of ``_SHOULD_BE_CAL_LABELS``
        name the variables to derive:

        * 'AVG_S_TEST' - per-school mean over all 'resilient status'
          variables except 'ESCS';
        * 'AVG_S_ESCS' - per-school mean of 'ESCS'.

        Means ignore NaN and are computed separately inside the 'full' and
        the 'sliced' data set. When the codebook lists no derived variable,
        the stage-4 data is deep-copied unchanged.

        NOTE(review): the original guard compared against
        'should be calculated' while the filters used 'should be caculated',
        so this branch could never agree with them. Both spellings are now
        accepted everywhere.

        Raises:
            ValueError: if the codebook lists an unknown derived variable.
        """
        def schoolMean(inputDf, whichVar):
            """Return ``{CNTSCHID: NaN-ignoring mean over the given columns}``."""
            if not isinstance(whichVar, list):
                raise TypeError('>> Error: whichVar must be a list')
            return {
                sch_id: np.nanmean(group[whichVar].to_numpy(dtype=float))
                for sch_id, group in inputDf.groupby('CNTSCHID', sort=False)
            }

        def matching(inputData, codeBook):
            """Return a deep copy of ``inputData`` with the derived columns added."""
            output = copy.deepcopy(inputData)

            isDerived = codeBook['file name'].isin(self._SHOULD_BE_CAL_LABELS)
            targets = list(codeBook.loc[isDerived, 'NAME'])

            statusNames = codeBook.loc[codeBook['categories'] == 'resilient status', 'NAME']
            sourceColumns = {
                'AVG_S_TEST': [name for name in statusNames if name != 'ESCS'],
                'AVG_S_ESCS': ['ESCS'],
            }
            unknown = [name for name in targets if name not in sourceColumns]
            if unknown:
                raise ValueError(f'Error: check self.cb, unknown derived variables {unknown}')

            for frame in output.values():
                for variable in targets:
                    mean_dict = schoolMean(frame, sourceColumns[variable])
                    # Give every student the mean of his or her own school.
                    frame[variable] = frame['CNTSCHID'].map(mean_dict).to_numpy()

            return output

        if self.cb['file name'].isin(self._SHOULD_BE_CAL_LABELS).any():
            self._5_shouldBeCal['full'] = matching(self._4_ESCS['full'], codeBook=self.cb)
            self._5_shouldBeCal['sliced'] = matching(self._4_ESCS['sliced'], codeBook=self.cb)
        else:
            self._5_shouldBeCal['full'] = copy.deepcopy(self._4_ESCS['full'])
            self._5_shouldBeCal['sliced'] = copy.deepcopy(self._4_ESCS['sliced'])

    def AdjustMinor(self):
        """Stack both nations, drop helper columns and fill ``self.finalRS``."""

        def Merge(inputData):
            """Stack SK on top of US and remove the score/helper columns.

            Drops 'CNTRYID', 'AcademicScore' and every column whose name
            contains 'PV'.
            """
            output = pd.concat([inputData['SK'], inputData['US']], axis=0)
            dropAcademic = ['CNTRYID', 'AcademicScore']
            dropAcademic += [column for column in output.columns if 'PV' in column]
            return output.drop(columns=dropAcademic)

        def columnOrder(inputData, important_columns=('resilient',)):
            """Move the id columns and the key variables to the front.

            Makes the sheet easier to use in SPSS; the row index is
            renumbered from 0.
            """
            leading = ['CNT', 'CNTSCHID', 'CNTSTUID'] + list(important_columns)
            trailing = [column for column in inputData.columns if column not in leading]
            return inputData[leading + trailing].reset_index(drop=True)

        for option in ('full', 'sliced'):
            self.finalRS[option] = columnOrder(Merge(self._5_shouldBeCal[option]))

    def Save(self):
        """Write the 'sliced' table to ``<BASE_DIR>/rs/preprocessing<PV_var>.xlsx``.

        Only the 'sliced' data set is saved; the output directory is created
        when it does not exist yet.
        """
        outputDir = os.path.join(BASE_DIR, 'rs')
        os.makedirs(outputDir, exist_ok=True)
        with pd.ExcelWriter(os.path.join(outputDir, f'preprocessing{self.PV_var}.xlsx')) as writer:
            self.finalRS['sliced'].to_excel(writer, sheet_name='sliced', index=False)


# processor = Preprocessing(LoadedData=loadedData, codeBook=Loader.cb, dummyCodeBook='dummyCoding.json', PV_var=1)
# processor.noDataColumn()
# processor.Dummy(doDummy=False) # 굳이 dummy할 필요가 없음, rf에서 categorical / numerical 인식해야함
# processor.Join()
# processor.DropStudent(isVisualize=False)
# processor.ESCS(480, isVisualize=False) # Lv4: 553, Lv3: 480, Lv2: 407
# processor.shouldBeCalculated()
# processor.AdjustMinor()
# processor.Save()

In [8]:
# One row per plausible value (1-10): number and percentage of resilient
# students in the sliced data of each nation.
plot_sk, plot_us = (
    pd.DataFrame(index=range(1, 11), columns=['count', 'ratio']) for _ in range(2)
)

for idx in tqdm(range(1, 11)):
    print('>>>> ', idx)

    # Run the whole pipeline once for plausible value `idx`.
    processor = Preprocessing(
        LoadedData=loadedData,
        codeBook='PISA2018_CODEBOOK (변수선택-공유).xlsx',
        dummyCodeBook='dummyCoding.json',
        PV_var=idx,
    )
    processor.noDataColumn()
    # No dummy coding: the random forest should see categorical and numerical
    # variables as they are.
    processor.Dummy(doDummy=False)
    processor.Join()
    processor.DropStudent(isVisualize=False)
    # Reading-score cut-offs: Lv4: 553, Lv3: 480, Lv2: 407
    resilientCount_Ratio = processor.ESCS(480, isVisualize=False)
    print('> ', resilientCount_Ratio)
    processor.shouldBeCalculated()
    processor.AdjustMinor()
    processor.Save()

    # Each entry is [count, ratio]; store the scalars one at a time so the
    # count stays an integer.
    plot_sk.at[idx, 'count'] = resilientCount_Ratio['SK'][0]
    plot_sk.at[idx, 'ratio'] = resilientCount_Ratio['SK'][1]
    plot_us.at[idx, 'count'] = resilientCount_Ratio['US'][0]
    plot_us.at[idx, 'ratio'] = resilientCount_Ratio['US'][1]


display(plot_sk)

  0%|          | 0/10 [00:00<?, ?it/s]

>>>>  1

>>>> 4. Slicing data by ESCS
>  {'SK': [876, 52.96], 'US': [483, 41.14]}


 10%|█         | 1/10 [00:04<00:41,  4.59s/it]

>>>>  2

>>>> 4. Slicing data by ESCS
>  {'SK': [857, 51.81], 'US': [475, 40.46]}


 20%|██        | 2/10 [00:09<00:36,  4.57s/it]

>>>>  3

>>>> 4. Slicing data by ESCS
>  {'SK': [863, 52.18], 'US': [473, 40.29]}


 30%|███       | 3/10 [00:13<00:31,  4.51s/it]

>>>>  4

>>>> 4. Slicing data by ESCS
>  {'SK': [855, 51.69], 'US': [487, 41.48]}


 40%|████      | 4/10 [00:18<00:26,  4.49s/it]

>>>>  5

>>>> 4. Slicing data by ESCS
>  {'SK': [852, 51.51], 'US': [474, 40.37]}


 50%|█████     | 5/10 [00:22<00:22,  4.51s/it]

>>>>  6

>>>> 4. Slicing data by ESCS
>  {'SK': [838, 50.67], 'US': [484, 41.23]}


 60%|██████    | 6/10 [00:27<00:17,  4.48s/it]

>>>>  7

>>>> 4. Slicing data by ESCS
>  {'SK': [858, 51.87], 'US': [477, 40.63]}


 70%|███████   | 7/10 [00:31<00:13,  4.45s/it]

>>>>  8

>>>> 4. Slicing data by ESCS
>  {'SK': [847, 51.21], 'US': [463, 39.44]}


 80%|████████  | 8/10 [00:35<00:08,  4.42s/it]

>>>>  9

>>>> 4. Slicing data by ESCS
>  {'SK': [859, 51.93], 'US': [485, 41.31]}


 90%|█████████ | 9/10 [00:40<00:04,  4.40s/it]

>>>>  10

>>>> 4. Slicing data by ESCS
>  {'SK': [866, 52.36], 'US': [475, 40.46]}


100%|██████████| 10/10 [00:44<00:00,  4.45s/it]


Unnamed: 0,count,ratio
1,876,52.96
2,857,51.81
3,863,52.18
4,855,51.69
5,852,51.51
6,838,50.67
7,858,51.87
8,847,51.21
9,859,51.93
10,866,52.36


In [None]:
# Show the resilient-student count and ratio collected for the US sample,
# one row per plausible value.
display(plot_us)

Unnamed: 0,count,ratio
1,483,41.14
2,475,40.46
3,473,40.29
4,487,41.48
5,474,40.37
6,484,41.23
7,477,40.63
8,463,39.44
9,485,41.31
10,475,40.46


TODO: plot the per-plausible-value counts and ratios in `plot_sk` and `plot_us`
as bar charts with `plt.bar`.