### 데이터 불러오기

In [1]:
import pandas as pd
import numpy as np
import os

def load_hotel_reserve():
  customer_tb = pd.read_csv('./data/customer.csv')
  hotel_tb = pd.read_csv('./data/hotel.csv')
  reserve_tb = pd.read_csv('./data/reserve.csv')
  return customer_tb, hotel_tb, reserve_tb


def load_holiday_mst():
  holiday_tb = pd.read_csv('./data/holiday_mst.csv',
                           index_col=False)
  return holiday_tb


def load_production():
  production_tb = pd.read_csv('./data/production.csv')
  return production_tb


def load_production_missing_num():
  production_tb = pd.read_csv('./data/production_missing_num.csv')
  return production_tb


def load_production_missing_category():
  production_tb = pd.read_csv('./awesomebook-master/data/production_missing_category.csv')
  return production_tb


def load_monthly_index():
  monthly_index_tb = pd.read_csv('./data/monthly_index.csv')
  return monthly_index_tb


def load_meros_txt():
  with open('./data/txt/meros.txt', 'r') as f:
    meros = f.read()
    f.close()
  return meros


In [64]:
customer_tb, hotel_tb, reserve_tb = load_hotel_reserve()

### 수치데이터 전처리

#### 수치데이터 형식으로 변환

일반 형변환 

In [3]:
# 데이터 형 확인
print(type(40000/3))      # <class 'float'>

# 정수형 변환
print(int(40000 / 3))     # 13333 (소수점 버림)

# 실수형 변환
print(float(40000 / 3))   # 13333.333...

<class 'float'>
13333
13333.333333333334


데이터 프레임에서의 형 변환

In [5]:
df = pd.DataFrame({'value': [40000 / 3]})

# 데이터 타입 확인
print(df.dtypes)

value    float64
dtype: object


In [None]:
# 다양한 정수로 변환
print(df['value'].astype('int8'))    # -128 ~ 127
print(df['value'].astype('int16'))   # -32,768 ~ 32,767
print(df['value'].astype('int32'))   # 약 -21억 ~ 21억
print(df['value'].astype('int64'))   # 매우 큰 범위

0    21
Name: value, dtype: int8
0    13333
Name: value, dtype: int16
0    13333
Name: value, dtype: int32
0    13333
Name: value, dtype: int64


In [11]:
# 다양한 실수로 변환 
print(df['value'].astype('float16'))   # 반정밀도
print(df['value'].astype('float32'))   # 단정밀도
print(df['value'].astype('float64'))   # 배정밀도 (기본값)
# print(df['value'].astype('float128'))  # 확장 정밀도 
# window에서 float128은 지원하지 않는다고 함 

0    13336.0
Name: value, dtype: float16
0    13333.333008
Name: value, dtype: float32
0    13333.333333
Name: value, dtype: float64


  has_large_values = (abs_vals > 1e6).any()


In [12]:
# 파이썬 기본 타입으로 지정
print(df['value'].astype(int))    # 파이썬 기본 int
print(df['value'].astype(float))  # 파이썬 기본 float

0    13333
Name: value, dtype: int64
0    13333.333333
Name: value, dtype: float64


#### 로그 변환 등 지수 기반 스케일링

In [18]:
reserve_tb['total_price']

0        97200
1        20600
2        33600
3       194400
4        68100
         ...  
4025     16000
4026     41800
4027     74800
4028    540000
4029     44100
Name: total_price, Length: 4030, dtype: int64

In [17]:
import numpy as np

# total_price를 1000으로 나누고 1을 더해 대수화
reserve_tb['total_price_log'] = reserve_tb['total_price'] \
    .apply(lambda x: np.log(x / 1000 + 1))
reserve_tb['total_price_log']

0       4.587006
1       3.072693
2       3.543854
3       5.275049
4       4.235555
          ...   
4025    2.833213
4026    3.756538
4027    4.328098
4028    6.293419
4029    3.808882
Name: total_price_log, Length: 4030, dtype: float64

#### 순위 또는 분위수를 활용한 스케일링 (수치형의 범주화)

In [21]:
customer_tb['age']

0      41
1      38
2      49
3      43
4      31
       ..
995    44
996    35
997    32
998    48
999    39
Name: age, Length: 1000, dtype: int64

In [20]:
# age를 10단위로 범주화
customer_tb['age_rank'] = np.floor(customer_tb['age'] / 10) * 10
customer_tb['age_rank']

0      40.0
1      30.0
2      40.0
3      40.0
4      30.0
       ... 
995    40.0
996    30.0
997    30.0
998    40.0
999    30.0
Name: age_rank, Length: 1000, dtype: float64

#### 정규화 (Normalization)

In [69]:
# StandardScaler
from sklearn.preprocessing import StandardScaler

# 소수점을 다루기 위해 실수형 변환
reserve_tb['people_num'] = reserve_tb['people_num'].astype(float)

# 정규화 객체 생성
ss = StandardScaler()

# 정규화 변환 (fit_transform: 학습과 변환을 동시에)
result = ss.fit_transform(reserve_tb[['people_num', 'total_price']])

# 결과를 새로운 컬럼에 저장
reserve_tb['people_num_normalized'] = [x[0] for x in result]
reserve_tb['total_price_normalized'] = [x[1] for x in result]

In [44]:
reserve_tb.head()

Unnamed: 0,reserve_id,hotel_id,customer_id,reserve_datetime,checkin_date,checkin_time,checkout_date,people_num,total_price,people_num_normalized,total_price_normalized
0,r1,h_75,c_1,2016-03-06 13:09:42,2016-03-26,10:00:00,2016-03-29,4.0,97200,1.300709,-0.053194
1,r2,h_219,c_1,2016-07-16 23:39:55,2016-07-20,11:30:00,2016-07-21,2.0,20600,-0.483753,-0.747822
2,r3,h_179,c_1,2016-09-24 10:03:17,2016-10-19,09:00:00,2016-10-22,2.0,33600,-0.483753,-0.629935
3,r4,h_214,c_1,2017-03-08 03:20:10,2017-03-29,11:00:00,2017-03-30,4.0,194400,1.300709,0.82824
4,r5,h_16,c_1,2017-09-05 19:50:37,2017-09-22,10:30:00,2017-09-23,3.0,68100,0.408478,-0.31708


#### 이상값 처리 (Outlier Treatment)

In [65]:
# 이상값 처리 전 데이터 복사 (비교용)
reserve_tb_before = reserve_tb.copy()

In [66]:
# 평균 ± 3*표준편차 범위 내의 데이터만 유지
reserve_tb = reserve_tb[
    (abs(reserve_tb['total_price'] - np.mean(reserve_tb['total_price'])) / np.std(reserve_tb['total_price'])) <=3
].reset_index()

In [67]:
reserve_tb_before['total_price'].count()

np.int64(4030)

In [68]:
reserve_tb['total_price'].count() # 삭제됨

np.int64(3932)

#### 주성분 기반 차원 축소 (PCA)

In [70]:
# 제조 데이터 로드
production_tb = load_production()

from sklearn.decomposition import PCA

# 2개 주성분으로 PCA 객체 생성
pca = PCA(n_components=2)

# PCA 학습 및 변환
pca_values = pca.fit_transform(production_tb[['length', 'thickness']])

# 누적 기여율 확인
print('누적 기여율: {0}'.format(sum(pca.explained_variance_ratio_)))

# 각 차원의 기여율 확인
print('각 차원의 기여율: {0}'.format(pca.explained_variance_ratio_))

# 이미 학습된 PCA로 변환만 수행
pca_newvalues = pca.transform(production_tb[['length', 'thickness']])

누적 기여율: 1.0
각 차원의 기여율: [0.97897794 0.02102206]


#### 결측값 처리 기법

누락된 행 제거

In [99]:
# 제조 레코드 로드 (결측값 포함)
production_miss_num = load_production_missing_num()
production_miss_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   type       1000 non-null   object 
 1   length     1000 non-null   float64
 2   thickness  892 non-null    float64
 3   fault_flg  1000 non-null   bool   
dtypes: bool(1), float64(2), object(1)
memory usage: 24.5+ KB


In [100]:
production_miss_num.isna().sum()

type           0
length         0
thickness    108
fault_flg      0
dtype: int64

In [101]:
# None을 NaN(Not a Number)으로 교체
production_miss_num.replace('None', np.nan, inplace=True)
production_miss_num.isna().sum()

type           0
length         0
thickness    108
fault_flg      0
dtype: int64

In [None]:
# thickness 컬럼에서 NaN이 있는 레코드 제거
production_miss_num.dropna(subset=['thickness'], inplace=True)
production_miss_num.info()  # 892개로 108개 행이 제거됨 

<class 'pandas.core.frame.DataFrame'>
Index: 892 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   type       892 non-null    object 
 1   length     892 non-null    float64
 2   thickness  892 non-null    float64
 3   fault_flg  892 non-null    bool   
dtypes: bool(1), float64(2), object(1)
memory usage: 28.7+ KB


고정값 대체

In [103]:
# 제조 레코드 로드
production_miss_num = load_production_missing_num()
production_miss_num.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   type       1000 non-null   object 
 1   length     1000 non-null   float64
 2   thickness  892 non-null    float64
 3   fault_flg  1000 non-null   bool   
dtypes: bool(1), float64(2), object(1)
memory usage: 24.5+ KB


In [104]:
production_miss_num.isna().sum()

type           0
length         0
thickness    108
fault_flg      0
dtype: int64

In [105]:
# None을 NaN으로 교체
production_miss_num.replace('None', np.nan, inplace=True)

# thickness의 NaN 값을 1로 채움
production_miss_num['thickness'].fillna(1, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  production_miss_num['thickness'].fillna(1, inplace=True)


In [106]:
production_miss_num.isna().sum()

type         0
length       0
thickness    0
fault_flg    0
dtype: int64

In [None]:
(production_miss_num['thickness']==1).sum() # 결측치 108개가 1로 채워짐

np.int64(108)

평균값 대체

In [None]:
# 제조 레코드 로드
production_miss_num = load_production_missing_num()
production_miss_num.info()

In [109]:
# None을 NaN으로 교체
production_miss_num.replace('None', np.nan, inplace=True)

# 실수값으로 변경 (평균 계산을 위해)
production_miss_num['thickness'] = production_miss_num['thickness'].astype(float)

# 평균 구하기
thickness_mean = production_miss_num['thickness'].mean()
thickness_mean

np.float64(17.47558434455648)

In [110]:
# NaN을 평균값으로 교체
production_miss_num['thickness'].fillna(thickness_mean, inplace=True)
production_miss_num.isna().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  production_miss_num['thickness'].fillna(thickness_mean, inplace=True)


type         0
length       0
thickness    0
fault_flg    0
dtype: int64