# 수치예측 - 중고차 판매 가격

중고차 판매 데이터 활용
- Dubizzle used car sales data 활용
- 출처 : [Dubizzle used car sales data](https://www.kaggle.com/datasets/alihassankp/dubizzle-used-car-sale-data)
- 총 20개 컬럼 중 Target 컬럼은 `price_in_aed`

---

# Import Libraries & Load data

In [None]:
# Visual Python: Data Analysis > Import
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

In [None]:
# Visual Python: Data Analysis > File
# Load the Dubizzle used-car sales CSV into a DataFrame. The path is relative,
# so it resolves against the notebook's working directory.
df = pd.read_csv('./data/Dubizzle_used_car_sales.csv')
# Bare expression as the last line so the notebook renders the frame.
df

In [None]:
# Visual Python: Library > columns
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Target column: name of the price column the model is trained to predict.
target_col = 'price_in_aed'

# EDA & Data Preprocessing

#### Q. 데이터의 상위 5개 행을 출력하세요.

In [None]:
# Visual Python: Data Analysis > Data Info
# Preview the first rows (head() returns 5 rows when called without arguments).
df.head()

#### Q. 각 컬럼별 데이터 타입과 데이터 개수를 확인하세요.

In [None]:
# Visual Python: Data Analysis > Data Info
# Print each column's dtype and non-null count.
df.info()

#### Q. `title`, `date_posted` 컬럼을 삭제하세요.

In [None]:
# Visual Python: Data Analysis > Frame
# Remove the `title` and `date_posted` columns with a single in-place call.
df.drop(columns=['title', 'date_posted'], inplace=True)
df

#### Q. 각 컬럼의 결측치 수를 확인하세요.

In [None]:
# Visual Python: Data Analysis > Data Info
# Per-column count of missing and present values, shown side by side.
pd.DataFrame({
    'Null Count': df.isna().sum(),
    'Non-Null Count': df.notna().sum(),
})

#### Q. 결측치가 있는 행을 제거하세요.

In [None]:
# Visual Python: Data Analysis > Frame
# Drop, in place, every row that has a missing value in any of these three
# columns (row-wise / "any" are dropna's defaults, so they are not spelled out).
df.dropna(subset=['no_of_cylinders', 'year', 'motors_trim'], inplace=True)
df

#### Q. `no_of_cylinders` 컬럼의 'None', 'Unknown' 행을 삭제하세요.

In [None]:
# Visual Python: Data Analysis > Subset
# Keep only the rows whose cylinder value is not one of the placeholder strings.
df = df[~df['no_of_cylinders'].isin(['None', 'Unknown'])]
df

#### Q. `no_of_cylinders` 컬럼의 타입을 int로 바꾸세요.

In [None]:
# Visual Python: Data Analysis > Frame
# Cast the cylinder counts to 64-bit integers. The non-numeric 'None'/'Unknown'
# rows were filtered out in the preceding cell, so the remaining values are
# expected to parse as integers. astype() returns a new frame, hence the rebind.
df = df.astype({'no_of_cylinders': 'int64'})
df

#### Q. 수치형 컬럼들의 통계값을 출력하세요.
- 수치형 컬럼: `price_in_aed`, `kilometers`, `no_of_cylinders`, `year`

In [None]:
# Visual Python: Data Analysis > Data Info
# Summary statistics restricted to the four numeric columns.
df[[
    'price_in_aed',
    'kilometers',
    'no_of_cylinders',
    'year',
]].describe()

#### Q. 수치형 컬럼 간 상관계수를 확인하세요.
- 수치형 컬럼: `price_in_aed`, `kilometers`, `no_of_cylinders`, `year`

In [None]:
# Visual Python: Data Analysis > Data Info
# Pairwise correlation matrix of the four numeric columns.
df[[
    'price_in_aed',
    'kilometers',
    'no_of_cylinders',
    'year',
]].corr(numeric_only=True)

#### Q. 모든 범주형 컬럼을 라벨 인코딩한 후 원본 컬럼을 삭제하세요.

In [None]:
# Visual Python: Data Analysis > Frame
# Categorical columns to label-encode. Keeping them in one list guarantees that
# the columns that get encoded and the columns that get dropped stay in sync.
categorical_cols = [
    'body_condition', 'mechanical_condition', 'seller_type', 'body_type',
    'transmission_type', 'regional_specs', 'horsepower', 'fuel_type',
    'steering_side', 'color', 'emirate', 'motors_trim', 'company', 'model',
]

# Add an integer-coded `<name>_label` column for every categorical column.
# NOTE: per the pandas docs, pd.Categorical gives missing values the code -1.
for col in categorical_cols:
    df[f'{col}_label'] = pd.Categorical(df[col]).codes

# Drop the raw columns now that their encoded counterparts exist.
df.drop(columns=categorical_cols, inplace=True)
df

# RandomForestRegressor 모델로 가격 예측하기

#### Q. `df`를 이용해 데이터셋을 train, test로 분리해주세요.

In [None]:
# Visual Python: Machine Learning > Data Split
from sklearn.model_selection import train_test_split

# Model inputs: the numeric columns plus every label-encoded categorical column.
feature_cols = [
    'kilometers', 'no_of_cylinders', 'year',
    'body_condition_label', 'mechanical_condition_label', 'seller_type_label',
    'body_type_label', 'transmission_type_label', 'regional_specs_label',
    'horsepower_label', 'fuel_type_label', 'steering_side_label',
    'color_label', 'emirate_label', 'motors_trim_label',
    'company_label', 'model_label',
]

# Use `target_col` (defined near the top of the notebook) instead of repeating
# the column name, and pin the seed so the same rows land in the train and test
# sets on every run. The split ratio is left at scikit-learn's default.
X_train, X_test, y_train, y_test = train_test_split(
    df[feature_cols], df[target_col], random_state=42
)

#### Q. RandomForestRegressor 모델을 생성하고, fit으로 학습시킨 후 예측 결과를 `pred`에 저장하세요.

In [None]:
# Visual Python: Machine Learning > Regressor
from sklearn.ensemble import RandomForestRegressor

# Default hyperparameters. The seed is pinned because the forest is built with
# randomness (bootstrap sampling / feature selection), so an unseeded model
# produces different predictions and scores each time the notebook is run.
rf_model = RandomForestRegressor(random_state=42)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Train the regressor on the training split.
rf_model.fit(X_train, y_train)

In [None]:
# Visual Python: Machine Learning > Fit/Predict
# Predict prices for the held-out test features; `pred` is reused by the
# evaluation cells below.
pred = rf_model.predict(X_test)
pred

#### Q. 예측결과인 `pred`를 평가해 R squared 와 RMSE를 확인하세요.
- Visual Python: Machine Learning > Evaluation

In [None]:
# Visual Python: Machine Learning > Evaluation
from sklearn import metrics

In [None]:
# Visual Python: Machine Learning > Evaluation
from IPython.display import display, Markdown

In [None]:
# Visual Python: Machine Learning > Evaluation
# R square: coefficient of determination of `pred` against the test targets.
print(f'R square: {metrics.r2_score(y_test, pred)}')

In [None]:
# Visual Python: Machine Learning > Evaluation
# RMSE (Root Mean Squared Error): square root of the mean squared error.
print(f'RMSE: {metrics.mean_squared_error(y_test, pred) ** 0.5}')

In [None]:
# Visual Python: Machine Learning > Evaluation
# Regression plot: actual test targets (x) against predicted values (y).
display(Markdown('### Regression plot'))
fig, ax = plt.subplots()
ax.scatter(y_test, pred)
ax.set_xlabel('y_test')
ax.set_ylabel('pred')
plt.show()

#### Q. Feature Importance를 차트로 그리세요.

In [None]:
# Visual Python: Machine Learning > Model Info
def vp_create_feature_importances(model, X_train=None, sort=False):
    """Return a table of a fitted model's feature importances.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        X_train: Optional DataFrame whose column labels name the features.
            For any other value the features are labelled ``X0``, ``X1``, ...
        sort: If true, order rows by importance, largest first.

    Returns:
        DataFrame indexed by feature name with the columns
        ``Feature_importance`` and ``Percentage`` (importance * 100), both
        rounded to two decimals.
    """
    importances = model.feature_importances_

    if isinstance(X_train, pd.DataFrame):
        labels = X_train.columns
    else:
        labels = ['X{}'.format(pos) for pos in range(len(importances))]

    table = pd.DataFrame(importances, index=labels, columns=['Feature_importance'])
    # The percentage is derived from the unrounded importance; rounding is last.
    table['Percentage'] = 100 * table['Feature_importance']
    if sort:
        table = table.sort_values(by='Feature_importance', ascending=False)

    return table.round(2)
def vp_plot_feature_importances(model, X_train=None, sort=False, top_count=0):
    """Draw a horizontal bar chart of feature-importance percentages.

    Args:
        model: Object exposing a ``feature_importances_`` sequence.
        X_train: Optional DataFrame whose column labels name the features.
        sort: If true, bars are ordered by ascending percentage.
        top_count: With ``sort`` true and a positive value, only the
            ``top_count`` largest percentages are drawn. It has no effect
            when ``sort`` is false.
    """
    percentages = vp_create_feature_importances(model, X_train, sort)['Percentage']

    if sort:
        # Ascending order, so any trimming keeps the tail (largest values).
        percentages = percentages.sort_values()
        if top_count > 0:
            percentages = percentages.tail(top_count)

    percentages.plot(kind='barh')
    plt.xlabel('Feature importance Percentage')
    plt.ylabel('Features')

    plt.show()

In [None]:
# Visual Python: Machine Learning > Model Info
# Chart the 10 largest feature-importance percentages of the trained model,
# labelled with the training frame's column names.
vp_plot_feature_importances(rf_model, X_train, sort=True, top_count=10)

---

In [None]:
# End of file