In [2]:
import os
import copy
import json
import numpy as np
import pandas as pd
from datetime import datetime
from tqdm import tqdm
from sys import platform

import warnings
# NOTE(review): this silences every warning for the whole session, including
# pandas/matplotlib deprecation notices — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Do not use the Unicode minus sign (prevents broken minus glyphs with CJK fonts)
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
plt.rcParams['axes.unicode_minus'] = False
plt.rcParams["figure.autolayout"] = True

# Names of every font matplotlib has registered on this machine
import matplotlib.font_manager as fm
font_list = [font.name for font in fm.fontManager.ttflist]

# Use the first Korean-capable font that is actually installed. 'Malgun Gothic'
# ships with Windows only, so other platforms need an alternative; if none of
# the candidates is registered, keep the original setting unchanged.
_KOREAN_FONT_CANDIDATES = ('Malgun Gothic', 'AppleGothic', 'NanumGothic')
plt.rcParams['font.family'] = next(
    (name for name in _KOREAN_FONT_CANDIDATES if name in font_list),
    'Malgun Gothic',
)


# Working directory of the kernel; the raw data is read from and the cleaned
# data written to its `data` sub-folder.
BASE_DIR = os.getcwd()
print('>> Current OS: ', platform)
print('>> Current WD: ', BASE_DIR)

>> Current OS:  win32
>> Current WD:  c:\Users\jhun1\Proj\Research\MixedRF


In [43]:
class Load:
    """Read the raw survey workbook and the variable codebook into memory.

    The raw data comes from ``data/rawData(HK).xlsx`` under the module-level
    ``BASE_DIR`` (sheets ``stu``, ``sch`` and ``tch``). The codebook is read
    from a fixed sub-folder of the OS-specific Dropbox project directory
    stored in ``self.BASE_DIR``.

    Attributes set by ``__init__``:
        BASE_DIR: Dropbox project directory chosen from ``sys.platform``.
        rawStu, rawSCH, rawTCH: DataFrames of the three raw sheets.
        dataLS: ``[rawStu, rawSCH, rawTCH]`` in that order.
        cb: codebook DataFrame, read with the first spreadsheet row skipped.
    """

    def __init__(self, codeBook):
        """Load the raw sheets and the codebook.

        Args:
            codeBook: file name of the codebook workbook inside the codebook
                folder.
        """
        # Project directory that holds the codebook folder; it differs per machine.
        # NOTE(review): absolute, user-specific paths — consider making these configurable.
        if 'darwin' in platform:
            self.BASE_DIR = '/Users/huni/Dropbox/[3]Project/[혼합효과 랜덤포레스트_2022]'
        else:
            self.BASE_DIR = r'C:\Users\jhun1\Dropbox\[3]Project\[혼합효과 랜덤포레스트_2022]'

        codebook_Folder = 'drive-download-20220816T053902Z-001'

        print('>>>>> Init: load raw data')
        # `BASE_DIR` here is the module-level working directory, not self.BASE_DIR.
        raw_path = os.path.join(BASE_DIR, 'data', 'rawData(HK).xlsx')
        # Passing a list of sheet names parses the workbook once and returns a
        # dict keyed by sheet name, instead of re-opening the file per sheet.
        sheets = pd.read_excel(raw_path, sheet_name=['stu', 'sch', 'tch'])
        self.rawStu = sheets['stu']
        self.rawSCH = sheets['sch']
        self.rawTCH = sheets['tch']
        self.dataLS = [self.rawStu, self.rawSCH, self.rawTCH]

        # descriptive summary of what was loaded
        print('>> Stu data set', self.rawStu.shape)
        print('>> Sch data set', self.rawSCH.shape)
        print('>> Tch data set', self.rawTCH.shape)

        self.cb = pd.read_excel(
            os.path.join(self.BASE_DIR, codebook_Folder, codeBook),
            skiprows=[0],  # drop the first row of the sheet
        )




# Instantiate the loader: this reads the three raw-data sheets and the codebook
# workbook named here (file I/O happens inside Load.__init__).
Loader = Load(codeBook='TargetPaper_CODEBOOK.xlsx')


>>>>> Init: load raw data
>> Stu data set (6037, 1119)
>> Sch data set (152, 197)
>> Tch data set (3754, 351)


In [52]:
class Preprocessing(Load):
    """Select codebook variables from the raw data and filter rows by ESCS.

    Typical use: construct with the already-loaded frames and codebook, call
    ``defaultCleaner()`` to populate ``self.cleaned``, then call ``slice()``.
    """

    def __init__(self, dataLS, cb):
        """Store the inputs without touching the file system.

        ``Load.__init__`` is intentionally not called, so nothing is re-read
        from disk.

        Args:
            dataLS: sequence of DataFrames, paired in order with the labels
                'Stu', 'Sch', 'Tch'.
            cb: codebook DataFrame with a ``variable_name`` column.
        """
        self.dataLS = dataLS
        self.cb = cb

    def defaultCleaner(self):
        """Keep only codebook variables (plus ``ESCS``) in every data set.

        Stores the cleaned student frame in ``self.cleaned`` and returns None.

        Raises:
            AssertionError: if the number of matched columns across all data
                sets differs from the number of identified codebook variables.
        """
        print('\n\n>>>> Cleaning: select variable')

        def drop_Unidentified_variable(codebook):
            """Return the codebook variable names, excluding '?' placeholders."""
            variable_name = list(codebook['variable_name'].values)
            print('>>>> 1. left only identified variable', len(variable_name))
            new = [variable for variable in variable_name if variable != '?']
            print('> cleaned :', len(new))
            return new

        def cleaningVariable(dataLS, using_variable_list):
            """Drop every column that is neither a codebook variable nor 'ESCS'.

            Returns a dict mapping 'Stu' / 'Sch' / 'Tch' to the reduced frame.
            """
            print('\n>>>> 2. drop useless variable')

            wanted = set(using_variable_list)  # constant-time membership checks
            new_data_ls = {'Stu': [], 'Sch': [], 'Tch': []}

            # Total number of kept codebook columns, summed over all data sets.
            count = 0
            for data_set, label in zip(dataLS, new_data_ls.keys()):
                count += sum(1 for col in data_set.columns if col in wanted)
                # 'ESCS' is always kept, and is not counted as a codebook match.
                toDrop = [
                    col for col in data_set.columns
                    if col not in wanted and col != 'ESCS'
                ]

                newDF = data_set.drop(toDrop, axis=1)
                new_data_ls[label] = newDF
                print(f'> {label} data only left.. : ', len(newDF.columns))

            # Every identified codebook variable must have been found exactly once.
            assert count == len(using_variable_list), (
                f'*error: {count} ... {len(using_variable_list)}'
            )

            return new_data_ls

        var_ls = drop_Unidentified_variable(self.cb)  # remove unidentified variables
        output = cleaningVariable(dataLS=self.dataLS, using_variable_list=var_ls)
        self.cleaned = output['Stu']
        assert isinstance(self.cleaned, pd.DataFrame)

        return None

    def academic(self):
        """Placeholder; not implemented.

        TODO: compute academic achievement and derive the resilient flag from it.
        """
        pass

    def slice(self):
        """Return the rows of ``self.cleaned`` with ESCS below its 25th percentile.

        Rows whose ESCS is missing are excluded, because the ``<`` comparison
        is False for NaN. ``self.cleaned`` is left unmodified.

        Raises:
            AttributeError: if ``defaultCleaner()`` has not been run yet.
        """
        if not hasattr(self, 'cleaned'):
            raise AttributeError(
                "'cleaned' is not set; call defaultCleaner() before slice()"
            )

        escs = self.cleaned['ESCS']
        threshold = escs.quantile(0.25)
        before = self.cleaned.shape[0]

        # A boolean mask selects rows positionally, so duplicated index labels
        # cannot remove rows that should be kept.
        output = self.cleaned.loc[escs < threshold].copy()
        after = output.shape[0]
        print('>> before: ', before, '>> after: ', after)
        return output


    
# Build the preprocessor from the frames and codebook already held by `Loader`.
preprocessor = Preprocessing(dataLS = Loader.dataLS, cb = Loader.cb)
# Keep only the identified codebook variables (plus ESCS); the cleaned student
# frame is stored on the instance as `preprocessor.cleaned`.
preprocessor.defaultCleaner()
# Student rows whose ESCS is below the 25th percentile.
df = preprocessor.slice()



>>>> Cleaning: select variable
>>>> 1. left only identified variable 30
> cleaned : 28

>>>> 2. drop useless variable
> Stu data only left.. :  29
> Sch data only left.. :  0
> Tch data only left.. :  0
>> before:  6037 >> after:  1459


In [54]:
# Write the ESCS-filtered student frame to the `data` folder under the working directory.
df.to_excel(os.path.join(BASE_DIR, 'data', 'cleanedData(HK).xlsx'))

In [None]:
# Missing values were imputed with the MCMC method in SPSS 26.0.
# https://www.statisticshowto.com/missing-values-spss/


# TODO: the target paper reportedly defines the top 25% of reading achievement as "resilient"; need to confirm whether it used a single item or all of the related items.