# 준비

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')
plt.rc("font", family='Noto Sans KR', size=9) # 한글 폰트 설정
plt.rc("axes", unicode_minus=False) # 음수값 깨지는 폰트 설정
pd.options.display.max_columns = 50 # 칼럼수 50개까지

In [2]:
from sklearn.preprocessing import MinMaxScaler

def replacestringetc(data,x, counts):
    if sum(data == x) >counts:
        return x
    else:
        return "etc"

In [3]:
train_data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
submission = pd.read_csv("sample_submission.csv")
all_data = pd.concat([train_data, test_data], ignore_index=True)

In [4]:
all_data

Unnamed: 0,id,title,odometer,location,isimported,engine,transmission,fuel,paint,year,target
0,0,Toyota RAV 4,18277,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Red,2016,13665000.0
1,1,Toyota Land Cruiser,10,Lagos,New,4-cylinder(I4),automatic,petrol,Black,2019,33015000.0
2,2,Land Rover Range Rover Evoque,83091,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2012,9915000.0
3,3,Lexus ES 350,91524,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Gray,2007,3815000.0
4,4,Toyota Venza,94177,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2010,7385000.0
...,...,...,...,...,...,...,...,...,...,...,...
1446,431,Mercedes-Benz GLK 350,78175,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Dark Blue,2014,
1447,432,Honda Crosstour,129223,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,Red,2011,
1448,433,Mercedes-Benz ML 350,100943,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Black,2013,
1449,434,Lexus GX 470,81463,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,Mint green,2003,


In [5]:
# 평가산식
from sklearn.metrics import make_scorer

def nmae(answer, pred):
    mae = np.mean(np.abs(answer-pred))
    score = mae / np.mean(np.abs(answer))
    return score

NMAE = make_scorer(nmae, greater_is_better=False)

def at_NMAE(y_pred, dataset):
    y_true = dataset.get_label()
    
    mae = np.mean(np.abs(y_true-y_pred))
    score = mae / np.mean(np.abs(y_true))

    return "score", score, False

# 전처리

## target

In [6]:
# target 로그 스케일 
all_data["log_target"] = np.log(all_data["target"])

## title

In [7]:
# 자동차 회사
all_data["com"] = all_data["title"].map(lambda x: x.split()[0].upper())

In [8]:
# 같은 회사인데 이름이 다른것 있음/ 수정
all_data["com"].unique()

array(['TOYOTA', 'LAND', 'LEXUS', 'PONTIAC', 'FORD', 'HONDA', 'VOLVO',
       'BMW', 'INFINITI', 'MERCEDES-BENZ', 'HYUNDAI', 'JAGUAR',
       'MITSUBISHI', 'NISSAN', 'MACK', 'CHEVROLET', 'MAZDA', 'LINCOLN',
       'KIA', 'VOLKSWAGEN', 'ACURA', 'DAF', 'MAN', 'ISUZU', 'IVM',
       'PORSCHE', 'MINI', 'DODGE', 'GMC', 'IVECO', 'SCANIA', 'GAC',
       'PEUGEOT', 'ROLLS-ROYCE', 'MAN-VOLKSWAGEN', 'JEEP', 'ALPINA',
       'BENTLEY', 'JMC', 'MERCEDES-BENZ/52', 'AUDI', 'FIAT'], dtype=object)

In [9]:
all_data["com"].replace("VOLKSWAGEN","MAN-VOLKSWAGEN",inplace=True)
all_data["com"].replace("MAN","MAN-VOLKSWAGEN",inplace=True)
all_data["com"].replace("MERCEDES-BENZ/52","MERCEDES-BENZ",inplace=True)

## year

In [10]:
all_data["new"] = all_data["odometer"].copy()
all_data["new"][all_data["new"]!=0] = 1
all_data["new"][all_data["new"]==0] = 0
all_data["log_odometer"] = all_data["odometer"].copy()
all_data["log_odometer"][all_data["log_odometer"]==0]=10
all_data["log_odometer"] = np.log(all_data["log_odometer"])
all_data["too_old"] = all_data["log_odometer"].copy()
all_data["too_old"][all_data["too_old"]<10] = 0
all_data["too_old"][all_data["too_old"]>=10] = 1

In [11]:
# year - 2020
all_data["old"] = all_data["year"]-2020
# 0 보다 큰 값은 0으로 수정
all_data["old"][all_data["old"]>0] = 0
# old 의 median = -10
all_data["old"][all_data["old"]<-100] = -10
# old 의  abs
all_data["old"] = abs(all_data["old"])

## color

In [12]:
import re 

def clean_text(texts): 
    corpus = [] 
    for i in range(0, len(texts)): 
        review = re.sub(r'[@%\\*=()/~#&\+á?\xc3\xa1\-\|\.\:\;\!\-\,\_\~\$\'\"\n\]\[\>\<]', '',texts[i]) #@%*=()/+ 와 같은 문장부호 제거
        review = re.sub(r'\d+','',review)#숫자 제거
        review = review.lower() #소문자 변환
        review = re.sub(r'\s+', ' ', review) #extra space 제거
        review = re.sub(r'<[^>]+>','',review) #Html tags 제거
        review = re.sub(r'\s+', ' ', review) #spaces 제거
        review = re.sub(r"^\s+", '', review) #space from start 제거
        review = re.sub(r'\s+$', '', review) #space from the end 제거
        review = re.sub(r'_', ' ', review) #space from the end 제거
        #review = re.sub(r'l', '', review)
        corpus.append(review) 
        
    return corpus

In [13]:
temp = clean_text(all_data['paint']) #메소드 적용
all_data['paint'] = temp

In [14]:
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'blue' if x.find('blue') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'red' if x.find('red') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'green' if x.find('green') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'white' if x.find('white') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'grey' if x.find('grey') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'grey' if x.find('gery') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'grey' if x.find('gray') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'ash' if x.find('ash') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'brown' if x.find('brown') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'silver' if x.find('silver') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'silver' if x.find('sliver') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'black' if x.find('black') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'gold' if x.find('gold') >= 0 else x)
all_data['paint'] = all_data['paint'] = all_data['paint'].apply(lambda x : 'wine' if x.find('whine') >= 0 else x)

In [15]:
all_data['paint'].value_counts()

black               436
grey                221
white               209
silver              184
blue                142
red                  96
gold                 53
green                38
brown                23
wine                 12
ash                  11
yellow                6
cream                 6
purple                3
maroon                3
milk                  2
beige                 2
orange                1
blac                  1
golf                  1
indigo ink pearl      1
Name: paint, dtype: int64

In [135]:
all_data["paint"] = all_data["paint"].map(lambda x: replacestringetc(all_data["paint"], x, 5))

In [136]:
all_data['paint'].value_counts()

black     436
grey      221
white     209
silver    184
blue      142
red        96
gold       53
green      38
brown      23
etc        14
wine       12
ash        11
cream       6
yellow      6
Name: paint, dtype: int64

## engine

In [137]:
all_data["engine_type"] = all_data["engine"].copy()
all_data["cylinder"] = all_data["engine"].copy()
all_data["cylinder"] = all_data["cylinder"].map(lambda x: x[-2])
all_data["engine_type"] = all_data["engine_type"].map(lambda x: x[-3])

## location

In [16]:
all_data["location"] = all_data["location"].map(lambda x: replacestringetc(all_data["location"],x,5))

## drop

In [138]:
all_data.head(3)

Unnamed: 0,id,title,odometer,location,isimported,engine,transmission,fuel,paint,year,target,log_target,com,new,log_odometer,too_old,old,engine_type,cylinder
0,0,Toyota RAV 4,18277,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,red,2016,13665000.0,16.430348,TOYOTA,1,9.813399,0.0,4,I,4
1,1,Toyota Land Cruiser,10,Lagos,New,4-cylinder(I4),automatic,petrol,black,2019,33015000.0,17.312473,TOYOTA,1,2.302585,0.0,1,I,4
2,2,Land Rover Range Rover Evoque,83091,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,red,2012,9915000.0,16.109559,LAND,1,11.327692,1.0,8,V,6


In [139]:
drop_feature = ["id","year","target","log_odometer"]
all_data = all_data.drop(drop_feature, axis=1)

In [142]:
all_data.tail(3)

Unnamed: 0,title,odometer,location,isimported,engine,transmission,fuel,paint,log_target,com,new,too_old,old,engine_type,cylinder
1448,Mercedes-Benz ML 350,100943,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,black,,MERCEDES-BENZ,1,1.0,7,I,4
1449,Lexus GX 470,81463,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,green,,LEXUS,1,1.0,17,I,4
1450,Toyota Sienna,646,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,silver,,TOYOTA,1,0.0,14,V,6


In [21]:
from pycaret.regression import *

In [143]:
train

Unnamed: 0,title,odometer,location,isimported,engine,transmission,fuel,paint,log_target,com,new,too_old,old,engine_type,cylinder
0,Toyota RAV 4,18277,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,red,16.430348,TOYOTA,1,0.0,4,I,4
1,Toyota Land Cruiser,10,Lagos,New,4-cylinder(I4),automatic,petrol,black,17.312473,TOYOTA,1,0.0,1,I,4
2,Land Rover Range Rover Evoque,83091,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,red,16.109559,LAND,1,1.0,8,V,6
3,Lexus ES 350,91524,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,grey,15.154451,LEXUS,1,1.0,13,I,4
4,Toyota Venza,94177,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,red,15.814961,TOYOTA,1,1.0,10,V,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1010,Toyota Corolla,46768,Lagos,Foreign Used,4-cylinder(I4),automatic,petrol,black,15.504683,TOYOTA,1,1.0,6,I,4
1011,Toyota Camry,31600,Abuja,Foreign Used,4-cylinder(I4),automatic,petrol,silver,15.100602,TOYOTA,1,1.0,9,I,4
1012,Toyota Camry,96802,Abuja,Foreign Used,4-cylinder(I4),automatic,petrol,black,15.043688,TOYOTA,1,1.0,9,I,4
1013,Lexus GX 460,146275,Lagos,Foreign Used,6-cylinder(V6),automatic,petrol,gold,16.476818,LEXUS,1,1.0,7,V,6


In [144]:
data = setup(data=train, target="log_target", train_size=0.8)

Unnamed: 0,Description,Value
0,session_id,3491
1,Target,log_target
2,Original Data,"(1015, 15)"
3,Missing Values,False
4,Numeric Features,2
5,Categorical Features,12
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(812, 238)"


In [None]:
def nmae(answer, pred):
    mae = np.mean(np.abs(answer-pred))
    score = mae / np.mean(np.abs(answer))
    return score

In [145]:
add_metric("nmae2","NMAE2",nmae, False)

Name                                                      NMAE2
Display Name                                              NMAE2
Score Function            <function nmae at 0x000002023157CE50>
Scorer               make_scorer(nmae, greater_is_better=False)
Target                                                     pred
Args                                                         {}
Greater is Better                                         False
Custom                                                     True
Name: nmae2, dtype: object

In [146]:
blender = blend_models(estimator_list=compare_models(n_select=3, sort="NMAE2"))

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE,NMAE2
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.1839,0.0651,0.2552,0.9226,0.0156,0.0119,0.0119
1,0.2184,0.0874,0.2956,0.9105,0.0181,0.0144,0.0142
2,0.2305,0.1112,0.3335,0.8975,0.0197,0.0147,0.0149
3,0.229,0.1047,0.3235,0.8932,0.0198,0.015,0.015
4,0.2548,0.1684,0.4104,0.8468,0.0239,0.0161,0.0164
5,0.2823,0.232,0.4816,0.8203,0.0292,0.0183,0.0184
6,0.2094,0.0893,0.2988,0.8898,0.0182,0.0137,0.0137
7,0.2113,0.0994,0.3152,0.8746,0.0197,0.0139,0.0138
8,0.2372,0.2478,0.4978,0.7207,0.0287,0.015,0.0154
9,0.2308,0.1389,0.3727,0.8346,0.0223,0.0148,0.0149


In [147]:
final = finalize_model(blender)

In [148]:
y_pred = predict_model(final, data=test)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,NMAE2
0,Voting Regressor,0,0,0,0,0,0,


In [149]:
y_pred = y_pred.reset_index()

In [150]:
y_pred_exp = np.exp(y_pred["Label"])

In [151]:
# y_pred_exp
submission["target"] = y_pred_exp

In [152]:
submission.to_csv("submission_6.csv", index=False)

In [153]:
pd.read_csv("submission_6.csv")

Unnamed: 0,id,target
0,0,1.623494e+07
1,1,5.086783e+06
2,2,7.526205e+06
3,3,1.059385e+06
4,4,2.869263e+06
...,...,...
431,431,1.041619e+07
432,432,5.376729e+06
433,433,9.909768e+06
434,434,3.182584e+06


## scaling

In [475]:
from sklearn.preprocessing import OneHotEncoder
nom_feature = ["location","isimported","paint","com","engine_type","transmission","fuel"]
OHE = OneHotEncoder()
for i in nom_feature:
    temp = pd.DataFrame(OHE.fit_transform(all_data[[i]]).toarray())
    all_data = pd.concat([all_data, temp], axis=1)

In [476]:
all_data = all_data.drop(nom_feature, axis=1)

In [477]:
mmscaler = MinMaxScaler()
all_data["old"] = mmscaler.fit_transform(all_data[["old"]])
all_data["odometer"] = mmscaler.fit_transform(all_data[["odometer"]])
all_data["cylinder"] = mmscaler.fit_transform(all_data[["cylinder"]])
scaler = MinMaxScaler()
all_data["log_target"] = scaler.fit_transform(all_data[["log_target"]])

In [478]:
all_data.tail(3)

Unnamed: 0,odometer,log_target,old,cylinder,0,1,2,3,4,5,0.1,1.1,2.1,0.2,1.2,2.2,3.1,4.1,5.1,6,7,8,9,10,11,...,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,0.3,1.3,2.3,3.2,0.4,1.4,0.5,1.5
1448,0.05685,,0.175,0.333333,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1449,0.045879,,0.425,0.333333,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
1450,0.000364,,0.35,0.666667,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0


## 데이터 나누기

In [141]:
train = all_data[~all_data["log_target"].isnull()]
test = all_data[all_data["log_target"].isnull()]

In [47]:
train

Unnamed: 0,odometer,location,isimported,transmission,fuel,paint,log_target,com,old,engine_type,cylinder
0,18277,Lagos,Foreign Used,automatic,petrol,red,16.430348,TOYOTA,4,I,4
1,10,Lagos,New,automatic,petrol,black,17.312473,TOYOTA,1,I,4
2,83091,Lagos,Foreign Used,automatic,petrol,red,16.109559,LAND,8,V,6
3,91524,Lagos,Foreign Used,automatic,petrol,grey,15.154451,LEXUS,13,I,4
4,94177,Lagos,Foreign Used,automatic,petrol,red,15.814961,TOYOTA,10,V,6
...,...,...,...,...,...,...,...,...,...,...,...
1010,46768,Lagos,Foreign Used,automatic,petrol,black,15.504683,TOYOTA,6,I,4
1011,31600,Abuja,Foreign Used,automatic,petrol,silver,15.100602,TOYOTA,9,I,4
1012,96802,Abuja,Foreign Used,automatic,petrol,black,15.043688,TOYOTA,9,I,4
1013,146275,Lagos,Foreign Used,automatic,petrol,gold,16.476818,LEXUS,7,V,6


In [49]:
test= test.drop("log_target", axis=1)

In [50]:
test

Unnamed: 0,odometer,location,isimported,transmission,fuel,paint,com,old,engine_type,cylinder
1015,1234,Abuja,New,automatic,petrol,white,MERCEDES-BENZ,3,I,4
1016,29938,Abuja,Foreign Used,automatic,petrol,white,HONDA,7,I,4
1017,87501,Lagos,Foreign Used,automatic,petrol,black,MERCEDES-BENZ,8,I,4
1018,180894,Lagos,Locally used,automatic,petrol,grey,TOYOTA,19,V,6
1019,104814,Lagos,Foreign Used,automatic,petrol,white,TOYOTA,20,I,4
...,...,...,...,...,...,...,...,...,...,...
1446,78175,Lagos,Foreign Used,automatic,petrol,blue,MERCEDES-BENZ,6,V,6
1447,129223,Lagos,Foreign Used,automatic,petrol,red,HONDA,9,V,6
1448,100943,Lagos,Foreign Used,automatic,petrol,black,MERCEDES-BENZ,7,I,4
1449,81463,Lagos,Foreign Used,automatic,petrol,green,LEXUS,17,I,4


In [480]:
X_train = train.drop(["log_target"], axis=1)
y_train = train["log_target"]
X_test = test.drop(["log_target"], axis=1)
y_test = test["log_target"]

In [481]:
# from sklearn.model_selection import train_test_split
# X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2)

In [487]:
X_train.shape

(1015, 73)

# 모델링

In [488]:
import tensorflow.keras
from keras.models import Sequential
from keras.layers import Dense
from sklearn.preprocessing import MinMaxScaler

model = Sequential()
model.add(Dense(26, input_dim=73, activation='relu'))
model.add(Dense(25, activation='relu'))
model.add(Dense(1, activation='linear'))
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 26)                1924      
_________________________________________________________________
dense_13 (Dense)             (None, 25)                675       
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 26        
Total params: 2,625
Trainable params: 2,625
Non-trainable params: 0
_________________________________________________________________


In [489]:
model.compile(optimizer='adam', loss='mean_squared_error')

In [490]:
epochs_hist = model.fit(X_train, y_train, epochs=30, batch_size=15,  verbose=1, validation_split=0.2)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [491]:
y_predict = model.predict(X_test)

In [492]:
temp = np.exp(scaler.inverse_transform(y_predict))

In [493]:
submission["target"] = temp

In [497]:
submission.to_csv("submission_3.csv", index_label=False)

In [498]:
pd.read_csv("submission_3.csv")

Unnamed: 0,id,target
0,0,12896974.0
1,1,8234156.0
2,2,9365128.0
3,3,1450042.2
4,4,2401212.8
...,...,...
431,431,11280826.0
432,432,5793795.0
433,433,10631469.0
434,434,4403913.5
