# 문제 정의
- 노트북 정보로 가격을 예측하시오.
 - 제공된 데이터 목록: laptop_train.csv, laptop_test.csv
 - 예측할 컬럼: price

- 학습용 데이터(train)를 이용해 노트북 가격을 예측하는 모델을 만든 후 이를 평가용 데이터(test)에 적용해 얻은 예측값을 다음과 같은 형식의 CSV 파일로 생성하시오.

제출 파일은 다음 1개의 컬럼을 포함해야 한다.
 - pred: 예측값(가격)
 - 제출 파일명: 'result.csv'
제출한 모델의 성능은 R2(결정 계수) 평가지표에 따라 채점한다.

In [2]:
# Upload the data files into the runtime.
# NOTE(review): google.colab is presumably only importable inside Google Colab;
# outside Colab, place the CSV files next to the notebook instead.
from google.colab import files
uploads = files.upload()

Saving laptop_train.csv to laptop_train.csv
Saving laptop_test.csv to laptop_test.csv


In [4]:
# Load the training and evaluation sets into DataFrames
import pandas as pd
train, test = (pd.read_csv(path) for path in ('laptop_train.csv', 'laptop_test.csv'))
# Preview the first rows of the training set
train.head()

Unnamed: 0,Brand,Model,Series,Processor,Processor_Gen,RAM,Hard_Disk_Capacity,OS,Rating,Price
0,ASUS,VivoBook,15,i3,10th,8.0,512 GB SSD,Windows 11 Home,4.3,37940
1,DELL,Inspiron,,i3,11th,8.0,1 TB HDD,Windows 11 Home,3.7,39040
2,ASUS,VivoBook,15,i7,10th,16.0,512 GB SSD,Windows 11 Home,4.1,57940
3,DELL,,,i3,10th,8.0,1 TB HDD,Windows 10,3.2,41340
4,Lenovo,IdeaPad,Slim,i3,11th,8.0,512 GB SSD,Windows 10 Home,4.4,45440


In [6]:
# Exploratory data analysis: sizes, dtypes, summary statistics and
# missing-value counts for both the training and the evaluation set.
print('===== DATA SIZE =====')
print(train.shape, test.shape)

print('\n===== DATA TYPE =====')
# info() writes its report to stdout itself and returns None, so it is called
# directly; wrapping it in print() would append a stray "None" line.
train.info()

print('\n===== object(train) =====')
print(train.describe(include='O'))

print('\n===== object(test) =====')
print(test.describe(include='O'))

print('\n===== int/float(train) =====')
print(train.describe())

print('\n===== int/float(test) =====')
print(test.describe())

print('\n===== Missing Value(train) =====')
print(train.isnull().sum())

# Header uses '\n' like the others (the original had the invalid escape '\=').
print('\n===== Missing Value(test) =====')
print(test.isnull().sum())

===== DATA SIZE =====
(91, 10) (39, 9)

===== DATA TYPE =====
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91 entries, 0 to 90
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Brand               91 non-null     object 
 1   Model               82 non-null     object 
 2   Series              55 non-null     object 
 3   Processor           86 non-null     object 
 4   Processor_Gen       86 non-null     object 
 5   RAM                 85 non-null     float64
 6   Hard_Disk_Capacity  85 non-null     object 
 7   OS                  85 non-null     object 
 8   Rating              91 non-null     float64
 9   Price               91 non-null     int64  
dtypes: float64(2), int64(1), object(7)
memory usage: 7.2+ KB
None

===== object(train) =====
       Brand     Model Series Processor Processor_Gen Hard_Disk_Capacity  \
count     91        82     55        86            86                 85   
u

In [7]:
# Separate the prediction target: pop() removes the 'Price' column from train
# in place and returns it, so train keeps only the feature columns.
target = train.pop('Price')

In [10]:
# Text (object) columns get a placeholder label for missing entries;
# the numeric columns get the sentinel value -1.
cols_o = train.select_dtypes('object').columns
cols_n = ['RAM', 'Rating']
for frame in (train, test):
    frame[cols_o] = frame[cols_o].fillna('Missing_Value')
    frame[cols_n] = frame[cols_n].fillna(-1)

# Remaining missing-value counts per column, for both sets
train.isnull().sum(), test.isnull().sum()

(Brand                 0
 Model                 0
 Series                0
 Processor             0
 Processor_Gen         0
 RAM                   0
 Hard_Disk_Capacity    0
 OS                    0
 Rating                0
 dtype: int64,
 Brand                 0
 Model                 0
 Series                0
 Processor             0
 Processor_Gen         0
 RAM                   0
 Hard_Disk_Capacity    0
 OS                    0
 Rating                0
 dtype: int64)

In [11]:
# One-hot encoding: encode train and test together so both end up with the
# same dummy columns, then split them back apart by row position.
df = pd.concat([train, test])
df_dummies = pd.get_dummies(df)
train, test = df_dummies[:len(train)], df_dummies[len(train):]

In [13]:
# Hold out 20% of the training rows as a validation set (fixed seed)
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(train, target, test_size=0.2, random_state=0)
# Shapes of the four resulting pieces
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((72, 119), (19, 119), (72,), (19,))

In [14]:
# Baseline model: random forest fitted on the training split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)

# Evaluate the validation predictions with R2
r2 = r2_score(y_val, pred)
r2

0.7540119506559414

In [15]:
# # Model training (LightGBM) -- alternative model, currently disabled
# import lightgbm as lgb
# lgbmr = lgb.LGBMRegressor(random_state=0, verbose = -1)
# lgbmr.fit(X_train, y_train)
# pred = lgbmr.predict(X_val)

# # Model evaluation (R2)

# r2 = r2_score(y_val, pred)
# r2

0.36199992800967373

# 성능 개선

In [18]:
# Reload the raw CSV files, replacing the already-encoded train/test frames
train, test = pd.read_csv('laptop_train.csv'), pd.read_csv('laptop_test.csv')

In [20]:
# Remove the 'Series' column, which has a high share of missing values,
# from both sets, then show the resulting shapes.
for frame in (train, test):
    frame.drop(columns='Series', inplace=True)
train.shape, test.shape

((91, 9), (39, 8))

In [21]:
# Extract the target: pop() removes the 'Price' column from train in place
# and returns it.
target = train.pop('Price')

In [22]:
# Continue on copies so the freshly loaded frames are left unchanged
train_copy = train.copy()
test_copy = test.copy()

In [24]:
# One-hot encoding: encode both copies together so they share the same dummy
# columns, then split them back apart by row position.
df = pd.concat([train_copy, test_copy])
df_dummies = pd.get_dummies(df)
train_copy, test_copy = df_dummies[:len(train_copy)], df_dummies[len(train_copy):]

In [25]:
# Model training and evaluation on the re-encoded features
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score

# Hold out 20% of the rows as a validation set (fixed seed)
X_train, X_val, y_train, y_val = train_test_split(train_copy, target, test_size=0.2, random_state=0)

# Fit the random forest and predict the validation rows
rf = RandomForestRegressor(random_state=0).fit(X_train, y_train)
pred = rf.predict(X_val)

# Evaluate with R2
r2 = r2_score(y_val, pred)
r2

0.7841709724749455

In [26]:
# Adopt the encoded copies as the working train/test frames
train, test = train_copy, test_copy

In [34]:
# Hyperparameter tuning: try each candidate max_depth and keep the one with
# the highest R2 on the validation split.
# n_estimators (e.g. 200-500) could be searched the same way.
depths = [5, 7, 8]
best_depth = None
# Start from -inf rather than 0: R2 can be negative, and a baseline of 0 would
# leave best_depth as None whenever no candidate scored above zero.
best_r2 = float('-inf')

for depth in depths:
    rf = RandomForestRegressor(random_state=0, max_depth=depth)
    rf.fit(X_train, y_train)
    pred = rf.predict(X_val)
    r2 = r2_score(y_val, pred)
    print('r2:', r2, 'depth:', depth)
    if r2 > best_r2:
        best_r2 = r2
        best_depth = depth
print('best r2:', best_r2, 'best_depth:', best_depth)

r2: 0.7902952301456462 depth: 5
r2: 0.7960263706473234 depth: 7
r2: 0.7986429862418141 depth: 8
best r2: 0.7986429862418141 best_depth: 8


In [37]:
# Final model: train with the depth selected by the search and predict the
# evaluation set.
# best_depth is set by the tuning loop (8 in the recorded run); using it
# instead of a hard-coded 8 keeps the final model in sync with the search.
model = RandomForestRegressor(random_state=0, max_depth=best_depth)
model.fit(X_train, y_train)
pred = model.predict(test)
pred

array([40101.17692867, 40149.02883941, 59575.3619601 , 40068.23251726,
       43953.41246032, 55830.96173336, 63091.86694911, 60019.57210638,
       67119.34367117, 32854.46122294, 51548.15544444, 98431.12398268,
       43490.68452487, 60307.8576501 , 79980.44675824, 50426.97395083,
       55922.2771949 , 52477.0336501 , 39294.61536763, 39294.61536763,
       40216.72861709, 37233.18368463, 62142.50028899, 41049.67965885,
       41017.38974259, 54521.17993215, 67950.04265618, 44050.19844444,
       34355.22821439, 39294.61536763, 40329.33418393, 71264.21401355,
       71406.76948951, 49997.28494708, 46679.57849274, 55981.70077312,
       98431.12398268, 62073.225086  , 40134.76778068])

In [39]:
# Submission: store the predictions in a single 'pred' column, write them to
# result.csv without the index, and read the file back as a check.
submit = pd.DataFrame(pred, columns=['pred'])
submit.to_csv('result.csv', index=False)
pd.read_csv('result.csv')

Unnamed: 0,pred
0,40101.176929
1,40149.028839
2,59575.36196
3,40068.232517
4,43953.41246
5,55830.961733
6,63091.866949
7,60019.572106
8,67119.343671
9,32854.461223
