# 준비

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')
plt.rc("axes", unicode_minus=False) # 음수값 깨지는 폰트 설정

In [None]:
train_data = pd.read_csv("Data/train.csv")
test_data = pd.read_csv("Data/test.csv")
submission = pd.read_csv("Data/sample_submission.csv")

In [None]:
train_data.shape, test_data.shape

In [None]:
train_data.head()

# EDA1

## target

In [None]:
# log1p-transformed copy of the target; later cells read this column.
train_data["log_target"] = np.log1p(train_data["target"])

# One row of three panels: raw histogram, log histogram, log boxplot.
f, axes = plt.subplots(1, 3, figsize=(18, 5))
panels = [
    (sns.histplot, "target", "target histogram"),
    (sns.histplot, "log_target", "log_target histogram"),
    (sns.boxplot, "log_target", "log_target boxplot"),
]
for ax, (draw, column, heading) in zip(axes, panels):
    draw(x=column, data=train_data, ax=ax)
    ax.set(title=heading)

## title

In [None]:
# Stack train and test so each feature below is engineered on both at once.
all_data = pd.concat([train_data, test_data], axis=0, ignore_index=True)
print(all_data.shape)

# Upper-case the title, then take its first whitespace-separated word as the
# manufacturer ("com").
all_data["title"] = all_data["title"].apply(lambda x: x.upper())
all_data["com"] = all_data["title"].map(lambda x: x.split()[0])

In [None]:
all_data["title"].value_counts()

In [None]:
all_data["com"].unique()

In [None]:
# Fold the malformed brand token into the real brand name.  The result is
# assigned back explicitly: replace(..., inplace=True) on a column selection is
# chained assignment, which pandas does not guarantee to write through to
# all_data (and which never does under Copy-on-Write).
all_data["com"] = all_data["com"].replace("MERCEDES-BENZ/52", "MERCEDES-BENZ")

In [None]:
all_data.shape

## odometer

In [None]:
all_data[["odometer"]].boxplot(column="odometer")

In [None]:
all_data["log_odometer"]=np.log1p(all_data["odometer"])
all_data[["log_odometer"]].boxplot(column="log_odometer")

In [None]:
# "new" flag: 1 where the odometer reading is non-zero, 0 where it is zero.
# .loc writes into all_data itself; the chained form all_data["new"][mask] = v
# may assign into a temporary copy and leave the frame unchanged.
all_data["new"] = all_data["odometer"].copy()
all_data.loc[all_data["new"] != 0, "new"] = 1
# Rows equal to 0 already hold 0, so no second assignment is needed.

In [None]:
# "too_old" flag: 1 where log1p(odometer) >= 10 (roughly 22,000 on the raw
# scale), otherwise 0.  .loc is used instead of chained indexing so the
# assignment is guaranteed to land in all_data.
all_data["too_old"] = all_data["log_odometer"].copy()
all_data.loc[all_data["too_old"] < 10, "too_old"] = 0
all_data.loc[all_data["too_old"] >= 10, "too_old"] = 1

In [None]:
all_data.shape

## year

In [None]:
all_data[["year"]].head()

In [None]:
# Offset of the model year from 2020 (negative for older cars).
all_data["old"] = all_data["year"] - 2020
# Years after 2020 are clamped to an offset of 0.
# .loc replaces the original chained indexing, which may write to a copy.
all_data.loc[all_data["old"] > 0, "old"] = 0
# Offsets below -100 are replaced with -10, which the original author recorded
# as the median of "old".
all_data.loc[all_data["old"] < -100, "old"] = -10
# Store the age as a non-negative number.
all_data["old"] = all_data["old"].abs()

In [None]:
all_data[["old"]].head()

In [None]:
all_data.shape

## color

In [None]:
import re


# Compiled once at definition time instead of on every loop iteration.
_PUNCTUATION_RE = re.compile(
    r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>\<]')
_DIGITS_RE = re.compile(r'\d+')
_WHITESPACE_RE = re.compile(r'\s+')


def clean_text(texts):
    """Normalise free-text values into lower-case, punctuation-free strings.

    Each element has punctuation and digits removed, is lower-cased, has runs
    of whitespace collapsed to a single space, and is trimmed at both ends.

    Args:
        texts: An iterable of strings (list, pandas Series, ...).  Elements
            are visited in iteration order, so a Series works whatever its
            index labels are.  Passing a single ``str`` treats every character
            as its own element, which callers use together with ``"".join``
            to drop all whitespace.

    Returns:
        A list with one cleaned string per element of ``texts``.
    """
    corpus = []
    for text in texts:
        review = _PUNCTUATION_RE.sub('', text)  # strip punctuation such as @%*=()/+
        review = _DIGITS_RE.sub('', review)  # strip digits
        review = review.lower()
        # Collapse whitespace runs, then trim.  The original follow-up steps
        # (HTML-tag removal, a second collapse, and "_" -> " ") could never
        # match because "<", ">" and "_" are already removed above.
        review = _WHITESPACE_RE.sub(' ', review).strip(' ')
        corpus.append(review)

    return corpus

In [None]:
# Clean every raw paint string and keep the result in its own column, leaving
# the original "paint" column untouched at this point.
temp = clean_text(all_data['paint']) # apply the cleaning function
all_data['clean_paint'] = temp

In [None]:
# (substring, canonical colour) rules.  They are applied one after another in
# this order, and a value containing the substring is replaced entirely by the
# canonical colour, so earlier rules win over later ones.
clean_paint_rules = [
    ('blue', 'blue'),
    ('red', 'red'),
    ('green', 'green'),
    ('white', 'white'),
    ('grey', 'grey'),
    ('gery', 'grey'),
    ('gray', 'grey'),
    ('ash', 'ash'),
    ('brown', 'brown'),
    ('silver', 'silver'),
    ('sliver', 'silver'),
    ('black', 'black'),
    ('gold', 'gold'),
    ('whine', 'wine'),
]
for needle, colour in clean_paint_rules:
    # Bind the rule as defaults so each lambda uses its own pair.
    all_data['clean_paint'] = all_data['clean_paint'].apply(
        lambda x, needle=needle, colour=colour: colour if needle in x else x)

In [None]:
all_data['clean_paint'].value_counts()

In [None]:
all_data[(all_data["clean_paint"] == "milk") |
         (all_data["clean_paint"] == "purple") |
         (all_data["clean_paint"] == "maroon") |
         (all_data["clean_paint"] == "beige")]

In [None]:
all_data[(all_data["clean_paint"] == "golf") |
         (all_data["clean_paint"] == "orange") |
         (all_data["clean_paint"] == "blac") |
         (all_data["clean_paint"] == "indigo ink pearl")]

In [None]:
def replacestringetc(data, x, counts):
    """Return ``x`` if it is frequent enough in ``data``, otherwise ``"etc"``.

    Args:
        data: A pandas Series or NumPy array; it must support element-wise
            ``==`` against ``x``.
        x: The value to look up.
        counts: Threshold; ``x`` is kept only when it occurs strictly more
            than ``counts`` times.

    Returns:
        ``x`` unchanged, or the string ``"etc"``.
    """
    # Vectorised count; the builtin sum() would iterate the boolean mask one
    # element at a time in Python.
    occurrences = (data == x).sum()
    return x if occurrences > counts else "etc"

In [None]:
# Collapse colours that occur 5 times or fewer into "etc".  Frequencies are
# computed once with value_counts() and looked up per row, instead of
# rescanning the whole column for every row.
clean_paint_freq = all_data["clean_paint"].map(all_data["clean_paint"].value_counts())
all_data["clean_paint"] = all_data["clean_paint"].where(clean_paint_freq > 5, "etc")

In [None]:
all_data['clean_paint'].value_counts()

In [None]:
all_data.shape

## engine

In [None]:
all_data["engine"].value_counts()

In [None]:
all_data[(all_data["engine"]=="12-cylinder(V12)")|(all_data["engine"]=="2-cylinder(I2)")|(all_data["engine"]=="4-cylinder(H4)")|(all_data["engine"]=="5-cylinder(I5)")|(all_data["engine"]=="3-cylinder(I3)")]

In [None]:
# Text after the opening parenthesis, e.g. "V12)" for "12-cylinder(V12)".
engine_code = all_data["engine"].map(lambda x: x.split("(")[1])
# NOTE(review): "engine_type" receives the characters between the first letter
# and the closing parenthesis, while "cylinder" receives the first letter; the
# two names look swapped relative to their contents -- confirm intent.
all_data["engine_type"] = engine_code.map(lambda code: code[1:-1])
all_data["cylinder"] = engine_code.map(lambda code: code[:1])

In [None]:
all_data["engine_type"].value_counts()

In [None]:
all_data["cylinder"].value_counts()

In [None]:
all_data.shape

## location

In [None]:
all_data["location"] = all_data["location"].map(lambda x: x.strip())

In [None]:
all_data["location"].value_counts()

In [None]:
# (substring, canonical location) rules, applied in this order; a value that
# contains the substring is replaced entirely by the canonical name.
location_rules = [
    ('Abia', 'Abia'),
    ('Abuja', 'Abuja'),
    ('Lagos', 'Lagos'),
    ('Ogun', 'Ogun'),
    ('ogun', 'Ogun'),
    ('Mushin', 'Lagos'),
]
for needle, place in location_rules:
    # Bind the rule as defaults so each lambda uses its own pair.
    all_data['location'] = all_data['location'].apply(
        lambda x, needle=needle, place=place: place if needle in x else x)

In [None]:
all_data["location"].value_counts()

In [None]:
all_data.shape

# 전처리2

In [None]:
temp = all_data[~all_data["target"].isnull()]

In [None]:
plt.figure(figsize=(10,8))
sns.barplot(data=temp, x="location", y="target")

In [None]:
# Collapse locations that occur 2 times or fewer into "etc".  Frequencies are
# computed once with value_counts() and looked up per row, instead of
# rescanning the whole column for every row.
location_freq = all_data["location"].map(all_data["location"].value_counts())
all_data["location"] = all_data["location"].where(location_freq > 2, "etc")

In [None]:
temp = all_data[~all_data["target"].isnull()]
sns.barplot(data=temp, x="location", y="target")

In [None]:
all_data.shape

## isimported

In [None]:
all_data["isimported"].value_counts()

In [None]:
sns.barplot(data=temp, x="isimported", y="target")

In [None]:
sns.barplot(data=all_data, x="isimported", y="odometer")

## transmission

In [None]:
all_data["transmission"].value_counts()

In [None]:
sns.barplot(data=temp, x="transmission", y="target")

## fuel

In [None]:
all_data["fuel"].value_counts()

In [None]:
sns.barplot(data=temp, x="fuel", y="target")

In [None]:
all_data.shape

## brand

In [None]:
plt.figure(figsize=(20,15))
sns.barplot(data=temp, x="com", y="target")

In [None]:
plt.figure(figsize=(20,15))
sns.barplot(data=temp, x="com", y="log_target")

In [None]:
pt = pd.pivot_table(data=temp, index=["com"], values=["target"], aggfunc=np.max).reset_index()
pt = pt.sort_values("target",ascending=False)
len(pt)

In [None]:
pt.tail()

In [None]:
plt.figure(figsize=(15,12))
sns.barplot(data=pt, x="com", y="target")

In [None]:
# Hand-assigned tier per brand: one 1, three 2s, two 3s, six 4s, then 28 fives
# (40 entries).  It is attached to pt row by row, so its length has to equal
# len(pt).
clurst = [1] + [2] * 3 + [3] * 2 + [4] * 6 + [5] * 28
len(clurst)

In [None]:
pt["target_com_rank"] = clurst

In [None]:
# Attach each brand's tier to every row via a left join on "com".  pt also has
# a "target" column, so the merge suffixes the clash: "target_x" is the row's
# own target from all_data and "target_y" is the per-brand value from pt.
all_data = pd.merge(left=all_data, right=pt, how="left", on="com")

In [None]:
# Rows whose brand received no tier from the merge get tier 5.
all_data["target_com_rank"] = all_data["target_com_rank"].fillna(5)

In [None]:
all_data.shape

## color

In [None]:
all_data["paint"].unique()

In [None]:
all_data['paint'].map(lambda x: "".join(clean_text(x))).unique()

In [None]:
# Each paint value is handed to clean_text as a single string, so it is cleaned
# one character at a time; joining the pieces back gives the cleaned name with
# every space removed (multi-word colours become one token).
all_data["paint"] = all_data['paint'].map(lambda x: "".join(clean_text(x)))

In [None]:
# (substring, canonical colour) rules, applied one after another in exactly
# this order.  Order matters: a value rewritten by one rule can be matched
# again by a later one (e.g. 'metal' -> 'wine' and then 'wine' -> 'red').
paint_rules = [
    ('blue', 'blue'),
    ('red', 'red'),
    ('green', 'green'),
    ('grey', 'grey'),
    ('gery', 'grey'),
    ('gray', 'grey'),
    ('ash', 'grey'),
    ('brown', 'brown'),
    ('silver', 'grey'),
    ('sliver', 'grey'),
    ('black', 'black'),
    ('gold', 'gold'),
    ('whine', 'red'),
    ('metal', 'wine'),
    ('maroon', 'red'),
    ('blac', 'black'),
    ('indigoinkpearl', 'blue'),
    ('black', 'black'),
    ('white', 'white'),
    ('golf', 'gold'),
    ('milk', 'white'),
    ('orange', 'yellow'),
    ('wine', 'red'),
]
for needle, colour in paint_rules:
    # Bind the rule as defaults so each lambda uses its own pair.
    all_data['paint'] = all_data['paint'].apply(
        lambda x, needle=needle, colour=colour: colour if needle in x else x)

In [None]:
all_data["paint"].value_counts()

In [None]:
temp = all_data[~all_data["target_x"].isnull()]

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(data=temp, x="clean_paint", y="target_x")

In [None]:
plt.figure(figsize=(15,10))
sns.barplot(data=temp, x="paint", y="target_x")

In [None]:
# Colours that occur 10 times or fewer become "etc" in a new column.
# Frequencies are computed once with value_counts() and looked up per row,
# instead of rescanning the whole column for every row.
paint_freq = all_data["paint"].map(all_data["paint"].value_counts())
all_data["paint_etc"] = all_data["paint"].where(paint_freq > 10, "etc")

In [None]:
# 무채색 / 유채색
all_data["BWG"] = all_data["paint_etc"].copy()

In [None]:
def paint_bwg(x):
    """Return 1 when ``x`` is "black", "grey" or "white", otherwise 0."""
    return 1 if x in ("black", "grey", "white") else 0

In [None]:
all_data["BWG"] = all_data["BWG"].map(lambda x: paint_bwg(x))

In [None]:
all_data["BWG"].value_counts()

In [None]:
all_data.shape

* 분기점
    1. clean_paint 를 쓸 때
    2. paint 를 쓸 때
    3. paint_etc 를 쓸 때

# drop

완전히 필요 없는 피쳐 제거

In [None]:
all_data.head()

In [None]:
drop_feature = ["id", "engine", "year", "target_y"]
all_data = all_data.drop(drop_feature, axis=1)

# 데이터 정리

In [None]:
all_data["too_old"] = all_data["too_old"].astype(int)
all_data["target_com_rank"] = all_data["target_com_rank"].astype(int)

In [None]:
all_data = all_data.rename(columns={"target_x":"target", "new":"use"})

# 저장

In [None]:
all_data.to_csv("preprocessing_done.csv", index=False)

In [None]:
all_data = pd.read_csv("preprocessing_done.csv")

# 데이터 분기점

In [None]:
list(all_data)

In [None]:
# [
#     'title', 'odometer', 'location', 'isimported', 'transmission', 'fuel',
#     'paint', 'target', 'log_target', 'com', 'log_odometer', 'use', 'too_old',
#     'old', 'clean_paint', 'engine_type', 'cylinder', 'target_com_rank',
#     'paint_etc'
# ]
# 가져갈 피쳐 체크

all_data = all_data[[
 'title',
 'odometer',
 'location',
 'isimported',
 'transmission',
 'fuel',
#  'paint',
#  'target',
 'log_target',
 'com',
 'log_odometer',
 'use',
#  'too_old',
 'old',
#  'clean_paint',
 'engine_type',
 'cylinder',
 'target_com_rank',
 'paint_etc',
 'BWG'
]]

# Rows with a known log_target are training rows; the remainder form the test
# set, which does not carry the target column.  Both get a fresh 0..n-1 index.
has_target = all_data["log_target"].notnull()
train = all_data[has_target].reset_index(drop=True)
test = all_data[~has_target].drop(columns="log_target").reset_index(drop=True)

In [None]:
train.shape, train_data.shape

In [None]:
test.shape, test_data.shape

In [None]:
sum(test.odometer - test_data.odometer)

In [None]:
train.head(2)

In [None]:
test.head(2)

# 평가지표 작성

In [None]:
def nmae(answer, pred):
    """Normalised MAE: mean absolute error over the mean absolute ``answer``.

    Args:
        answer: Ground-truth values (array-like supporting ``-``).
        pred: Predicted values, compatible with ``answer`` for subtraction.

    Returns:
        ``mean(|answer - pred|) / mean(|answer|)``.
    """
    abs_error = np.abs(answer - pred)
    return np.mean(abs_error) / np.mean(np.abs(answer))

# 모델링

In [None]:
from pycaret.regression import *

In [None]:
data = setup(
   data=train,
   target='log_target',
   train_size= 0.85,
   normalize=True,
   normalize_method ='minmax',
   remove_perfect_collinearity=False,
   fold=10,
)

In [None]:
# Register the custom metric; for NMAE a lower score is better.
add_metric("nmae", "NMAE", nmae, greater_is_better=False)

In [None]:
compare_models(sort="NMAE")

In [None]:
blender = blend_models(estimator_list=compare_models(n_select=2, sort="NMAE"))

In [None]:
final = finalize_model(blender)

In [None]:
y_pred = predict_model(final, data=test)

In [None]:
y_pred = y_pred.reset_index()

In [None]:
y_pred_exp = np.expm1(y_pred["Label"])

In [None]:
submission["target"] = y_pred_exp

In [None]:
submission.to_csv("submission_14.csv", index=False)

In [None]:
pd.read_csv("submission_14.csv")