# 문제 정의
- 항공권 티켓 가격을 예측하시오.
  - 제공된 데이터 목록: flight_train.csv, flight_test.csv
  - 예측할 컬럼: price

- 학습용 데이터(train)를 이용해 티켓 가격을 예측하는 모델을 만든 후 이를 평가용 데이터(test)에 적용해 얻은 예측값을 다음과 같은 형식의 CSV 파일로 생성하시오.

제출 파일은 다음 1개의 컬럼을 포함해야 한다.
- pred: 예측값(가격)
- 제출 파일명: 'result.csv'

제출한 모델의 성능은 RMSE 평가지표에 따라 채점한다.

In [50]:
# Upload the CSV files into the Colab runtime through the browser file picker.
# NOTE(review): the recorded output shows the files being saved as
# 'flight_train (1).csv' / 'flight_test (1).csv', which suggests copies with the
# plain names already existed; the later read_csv calls use the plain names --
# confirm they point at the intended files.
from google.colab import files
uploads = files.upload()

Saving flight_train.csv to flight_train (1).csv
Saving flight_test.csv to flight_test (1).csv


In [51]:
# Load the training and evaluation sets from the working directory.
import pandas as pd

train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('flight_train.csv', 'flight_test.csv')
)

In [52]:
# Exploratory data analysis: print shapes, dtypes, summary statistics and
# missing-value counts for the train and test frames.
print('===== 데이터 사이즈 =====')
print('train:', train.shape, 'test:', test.shape)

print('\n===== 자료형 =====')
# DataFrame.info() prints its own report and returns None, so wrapping it in
# print() also emits a trailing "None" line (visible in the recorded output).
print(train.info())

# Summary of the object (string) columns, train first, then test.
print('\n===== 기술통계량(object) =====')
for frame in (train, test):
    print(frame.describe(include='O'))

# Summary of the numeric columns, train first, then test.
print('\n===== 기술통계량(float/int) =====')
for frame in (train, test):
    print(frame.describe())

# Per-column count of missing values for each frame.
for banner, frame in (
    ('\n ===== 결측치(train) =====', train),
    ('\n===== 결측치(test) =====', test),
):
    print(banner)
    print(frame.isna().sum())

===== 데이터 사이즈 =====
train: (10505, 11) test: (4502, 10)

===== 자료형 =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10505 entries, 0 to 10504
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           10505 non-null  object 
 1   flight            10505 non-null  object 
 2   source_city       10505 non-null  object 
 3   departure_time    10505 non-null  object 
 4   stops             10505 non-null  object 
 5   arrival_time      10505 non-null  object 
 6   destination_city  10505 non-null  object 
 7   class             10505 non-null  object 
 8   duration          10505 non-null  float64
 9   days_left         10505 non-null  int64  
 10  price             10505 non-null  int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 902.9+ KB
None

===== 기술통계량(object) =====
        airline  flight source_city departure_time  stops arrival_time  \
count     10505   10505       10505  

In [53]:
# Preprocessing.
# Remove the 'flight' column from both frames (the original note says its
# categories differ, presumably between train and test).
train, test = (frame.drop(columns='flight') for frame in (train, test))

In [54]:
# Separate the target: keep 'price' as `target` and remove it from `train`
# in place, then display the target's shape.
target = train['price']
del train['price']
target.shape

(10505,)

In [55]:
# Label-encode every object-dtype column.
# A single encoder instance is re-fitted per column on the train values and
# the same fitted mapping is then applied to the matching test column.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
cols = train.select_dtypes(include='object').columns
for col in cols:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Both info() calls print their report; the cell's displayed value is (None, None).
train.info(), test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10505 entries, 0 to 10504
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           10505 non-null  int64  
 1   source_city       10505 non-null  int64  
 2   departure_time    10505 non-null  int64  
 3   stops             10505 non-null  int64  
 4   arrival_time      10505 non-null  int64  
 5   destination_city  10505 non-null  int64  
 6   class             10505 non-null  int64  
 7   duration          10505 non-null  float64
 8   days_left         10505 non-null  int64  
dtypes: float64(1), int64(8)
memory usage: 738.8 KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4502 entries, 0 to 4501
Data columns (total 9 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   airline           4502 non-null   int64  
 1   source_city       4502 non-null   int64  
 2   departure_time    4502 n

(None, None)

In [56]:
# Hold out 20% of the training rows for validation (fixed seed for repeatability).
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

# Display the shapes of the four resulting pieces.
tuple(part.shape for part in (X_train, X_val, y_train, y_val))

((8404, 9), (2101, 9), (8404,), (2101,))

In [57]:
# Baseline model: fit a random forest with default settings on the training
# split and predict the validation split.
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)

In [58]:
# Score the validation predictions with RMSE and display the value.
from sklearn.metrics import root_mean_squared_error

rmse = root_mean_squared_error(y_true=y_val, y_pred=pred)
rmse

4431.290931404485

In [59]:
# # lightgbm
# import lightgbm as lgb
# lgbmr = lgb.LGBMRegressor(random_state=0, verbose=-1)
# lgbmr.fit(X_train, y_train)
# pred = lgbmr.predict(X_val)

In [60]:
# # 평가지표 확인
# rmse = root_mean_squared_error(y_val,pred)
# rmse

# 성능 개선

In [61]:
# Reset: reload both frames from disk so the next experiment starts from the
# raw columns again (including 'flight' and 'price').
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('flight_train.csv', 'flight_test.csv')
)

In [62]:
# Use part of 'flight': take the piece after the first '-' and store it as
# the integer feature 'f2' in both frames.
for frame in (train, test):
    flight_parts = frame['flight'].str.split('-')
    frame['f2'] = flight_parts.str[1].astype('int')

In [63]:
# Remove the raw 'flight' column from both frames, mutating them in place.
for frame in (train, test):
    frame.drop(columns='flight', inplace=True)

In [64]:
# Repeat the train/evaluate cycle on the frames that now carry 'f2'.

# Pull the target out of the feature frame.
target = train.pop('price')

# Label-encode the object columns, reusing the `le` encoder created earlier:
# fit on train, apply the same mapping to test.
cols = train.select_dtypes(include='object').columns
for col in cols:
    le.fit(train[col])
    train[col] = le.transform(train[col])
    test[col] = le.transform(test[col])

# Same 80/20 split and seed as the baseline run.
X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)

# Validation RMSE; displayed as the cell's value.
rmse = root_mean_squared_error(y_true=y_val, y_pred=pred)
rmse

3785.632095912173

In [65]:
# Helper for comparing scalers: hands out copies of the current frames.
def get_data():
    """Return ``(train_copy, test_copy)``: copies of the module-level ``train`` and ``test``."""
    return train.copy(), test.copy()

In [66]:
# # 스케일링(MinMaxScaler)
# train_copy, test_copy = get_data()
# cols = ['duration', 'days_left']
# from sklearn.preprocessing import MinMaxScaler
# scaler = MinMaxScaler()
# train_copy[cols] = scaler.fit_transform(train_copy[cols])
# test_copy[cols] = scaler.transform(test_copy[cols])

# X_train, X_val, y_train, y_val = train_test_split(
#     train_copy,
#     target,
#     test_size = 0.2,
#     random_state = 0
# )

# rf = RandomForestRegressor(random_state=0)
# rf.fit(X_train, y_train)
# pred = rf.predict(X_val)

# rmse = root_mean_squared_error(y_val, pred)
# rmse

3782.020252728955

In [67]:
# # 스케일링(RobustScaler)
# train_copy, test_copy = get_data()
# cols = ['duration', 'days_left']
# from sklearn.preprocessing import RobustScaler
# scaler = RobustScaler()
# train_copy[cols] = scaler.fit_transform(train_copy[cols])
# test_copy[cols] = scaler.transform(test_copy[cols])

# X_train, X_val, y_train, y_val = train_test_split(
#     train_copy,
#     target,
#     test_size = 0.2,
#     random_state = 0
# )

# rf = RandomForestRegressor(random_state=0)
# rf.fit(X_train, y_train)
# pred = rf.predict(X_val)

# rmse = root_mean_squared_error(y_val, pred)
# rmse

3784.979024627084

In [70]:
# Scaling experiment (StandardScaler).
# Unlike the disabled scaler cells, this one does not go through get_data():
# it overwrites the two numeric columns of `train` and `test` directly.
from sklearn.preprocessing import StandardScaler

cols = ['duration', 'days_left']

# Learn mean/variance from the train columns, then apply to both frames.
scaler = StandardScaler().fit(train[cols])
train[cols] = scaler.transform(train[cols])
test[cols] = scaler.transform(test[cols])

X_train, X_val, y_train, y_val = train_test_split(
    train, target, test_size=0.2, random_state=0
)

rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)

# Validation RMSE; displayed as the cell's value.
rmse = root_mean_squared_error(y_true=y_val, y_pred=pred)
rmse

3778.904216923134

In [78]:
# Hyperparameter tuning: try each n_estimators candidate, score it on the
# validation split with RMSE, and remember the best one.
# A depth sweep over [3, 5, 7, 9] is left disabled; only n_estimators is searched.
estimators = [200, 300]
best_estimators = None
# Start from +inf rather than a fixed number so the first candidate always
# becomes the incumbent, whatever the scale of the target is.
best_RMSE = float('inf')

for estimator in estimators:
    rf = RandomForestRegressor(random_state=0, n_estimators=estimator)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_val)
    rmse = root_mean_squared_error(y_val, pred)
    print('rmse:', rmse, 'n_estimators:', estimator)
    # Strict '<' keeps the earlier candidate on ties.
    if rmse < best_RMSE:
        best_RMSE = rmse
        best_estimators = estimator

print('best_RMSE:', best_RMSE, 'n_estimators:', best_estimators)

rmse: 3750.04304842664 n_estimators: 200
rmse: 3750.709396768093 n_estimators: 300
best_RMSE: 3750.04304842664 n_estimators: 200


In [79]:
# Final model: random forest with 200 trees, predictions for the test frame
# written to 'result.csv' as a single 'pred' column.
# NOTE(review): the model is fitted on X_train (the 80% split) and the tree
# count is hard-coded rather than taken from best_estimators -- confirm both
# are intended.
model = RandomForestRegressor(n_estimators=200, random_state=0).fit(X_train, y_train)
pred = model.predict(test)

submit = pd.Series(pred, name='pred').to_frame()
submit.to_csv('result.csv', index=False)

# Read the file back to check what was written.
pd.read_csv('result.csv')

Unnamed: 0,pred
0,60786.250
1,5128.470
2,12616.680
3,5955.830
4,4936.365
...,...
4497,13760.620
4498,4176.085
4499,24954.885
4500,17751.395
