In [5]:
import os
import warnings

import numpy as np
import pandas as pd

In [6]:
import matplotlib.pyplot as plt
from matplotlib import font_manager

# 기본 정보 출력 클래스
class Information:
    def __init__(self):
        self.SMALL_SIZE = 7

    def _set_kor_font(self):
        # 한글 폰트 설정
        self.FONT_PATH = "./NanumSquareB.ttf"
        self.FONT_NAME = font_manager.FontProperties(fname=self.FONT_PATH).get_name()
        plt.rc('font', family=self.FONT_NAME)

    # 데이터의 기본정보를 출력하는 함수
    def print_basic_info(self, data):
        """
        :param data: (DataFrame) data
        :return: None
        """
        # 한글 폰트 설정
        self._set_kor_font()

        print("Data shape : ", data.shape, end='\n\n')
        print(data.info(), end='\n\n')
        print("Data Null Sum Percent \n", round(data.isnull().sum()/data.shape[0] * 100, 2), end='\n\n')

    # 수치형 데이터의 평균, 중위값 등 기본적 통계 지표를 출력하는 함수
    def print_statistics_ind(self, data, col_name, qlt=False):
        """
        :param data: (DataFrame) data
        :param col_name: (str) Column name for which you want to see indicators
        :param qlt: (bool) If column is qualitative variable, this parameter is true
        :return: None
        """
        # 한글 폰트 설정
        self._set_kor_font()

        # 질적변수인 경우
        if qlt:
            print("Data category: ", data[col_name].unique())
            print("Data category: ", data[col_name].value_counts(sort=False).values)

            x_feature_ratio = data[col_name].value_counts(sort=False)
            x_feature_index = x_feature_ratio.index

            # x값 시각화
            plt.plot(aspect='auto')
            plt.pie(x_feature_ratio, labels=x_feature_index, autopct='%1.1f%%')
            plt.title(str(col_name) + '\'s ratio in total')

            plt.show()

        # 양적 변수인 경우
        else:
            print("Data Max : ", data[col_name].max())
            print("Data Min : ", data[col_name].min())
            print("Data Mean : ", data[col_name].mean())
            print("Data Median : ", data[col_name].median())
            print("Data Top 05% : ", np.percentile(data[col_name].values, 95))
            print("Data Top 25% : ", np.percentile(data[col_name].values, 75))
            print("Data Top 75% : ", np.percentile(data[col_name].values, 25))
            print("Data Top 95% : ", np.percentile(data[col_name].values, 5))
            print("Data Variance : {0: .3f}".format(data[col_name].var()))
            print("Data Standard deviation: {0: .3f}".format(data[col_name].std()))
            print()

            plt.plot()
            plt.boxplot(data[col_name])

            plt.show()

In [7]:
FILE_PATH = "./"

# 학습 데이터, 테스트 데이터, 제출 데이터 셈플 load
train_data = pd.read_csv(FILE_PATH + 'train.csv')
test_data = pd.read_csv(FILE_PATH + 'test.csv')
sub_data = pd.read_csv(FILE_PATH + 'sample_submission.csv')
age_gener_info_data = pd.read_csv(FILE_PATH + 'age_gender_info.csv')

In [9]:
train_data.head(5)

Unnamed: 0,단지코드,총세대수,임대건물구분,지역,공급유형,전용면적,전용면적별세대수,공가수,자격유형,임대보증금,임대료,도보 10분거리 내 지하철역 수(환승노선 수 반영),도보 10분거리 내 버스정류장 수,단지내주차면수,등록차량수
0,C2483,900,아파트,경상북도,국민임대,39.72,134,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
1,C2483,900,아파트,경상북도,국민임대,39.72,15,38.0,A,15667000,103680,0.0,3.0,1425.0,1015.0
2,C2483,900,아파트,경상북도,국민임대,51.93,385,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
3,C2483,900,아파트,경상북도,국민임대,51.93,15,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
4,C2483,900,아파트,경상북도,국민임대,51.93,41,38.0,A,27304000,184330,0.0,3.0,1425.0,1015.0
