# 중고차 가격 예측하기

<span style="color:blue"> 환경 준비 </span>

In [25]:
# 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_format = 'retina'


In [77]:
# 데이터 읽어오기
path = 'D:\PRACTICE\\used_cars.csv'
data = pd.read_csv(path)

<br/>
<span style="color:blue"> 데이터 이해 </span>

In [4]:
# 상위 몇 개 행 확인
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [5]:
data.tail()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
4004,Bentley,Continental GT Speed,2023,714 mi.,Gasoline,6.0L W12 48V PDI DOHC Twin Turbo,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,"$349,950"
4005,Audi,S4 3.0T Premium Plus,2022,"10,900 mi.",Gasoline,349.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,"$53,900"
4006,Porsche,Taycan,2022,"2,116 mi.",,Electric,Automatic,Black,Black,None reported,,"$90,998"
4007,Ford,F-150 Raptor,2020,"33,000 mi.",Gasoline,450.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,Blue,Black,None reported,Yes,"$62,999"
4008,BMW,X3 xDrive30i,2020,"43,000 mi.",Gasoline,248.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Brown,At least 1 accident or damage reported,Yes,"$40,000"


In [6]:
# 변수 확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   object
 4   fuel_type     3839 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   object
dtypes: int64(1), object(11)
memory usage: 376.0+ KB


* **Brand & Model:** Identify the brand or company name along with the specific model of each vehicle.
* **Model Year:** Discover the manufacturing year of the vehicles, crucial for assessing depreciation and technology advancements.
* **Mileage:** Obtain the mileage of each vehicle, a key indicator of wear and tear and potential maintenance requirements.
* **Fuel Type:** Learn about the type of fuel the vehicles run on, whether it's gasoline, diesel, electric, or hybrid.
* **Engine Type:** Understand the engine specifications, shedding light on performance and efficiency.
* **Transmission:** Determine the transmission type, whether automatic, manual, or another variant.
* **Exterior & Interior Colors:** Explore the aesthetic aspects of the vehicles, including exterior and interior color options.
* **Accident History:** Discover whether a vehicle has a prior history of accidents or damage, crucial for informed decision-making.
* **Clean Title:** Evaluate the availability of a clean title, which can impact the vehicle's resale value and legal status.
* **Price:** Access the listed prices for each vehicle, aiding in price comparison and budgeting

<br/>
<span style="color:blue"> 데이터 준비 </span>

### 결측치 처리

In [7]:
data.isna().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

**fuel_type 결측치 처리**

model 값이 동일한 행의 fuel_type 값을 넣어줌

In [78]:
# 모델을 기준으로 정렬
data = data.sort_values('model')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
704,FIAT,124 Spider Abarth,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,At least 1 accident or damage reported,Yes,"$22,500"
2900,BMW,128 i,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,None reported,Yes,"$18,000"
509,BMW,135 i,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,None reported,Yes,"$15,300"
1507,BMW,135 i,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,None reported,Yes,"$11,000"
715,BMW,135 i,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,None reported,Yes,"$15,500"
...,...,...,...,...,...,...,...,...,...,...,...,...
3855,Scion,tC Anniversary Edition,2014,"99,999 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,White,Black,None reported,Yes,"$10,998"
1284,Scion,tC Anniversary Edition,2014,"115,000 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Silver,Silver,None reported,Yes,"$11,495"
177,Scion,tC Base,2013,"177,600 mi.",Gasoline,180.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Silver,Black,At least 1 accident or damage reported,Yes,"$7,100"
690,Scion,tC Release Series 6.0,2010,"120,010 mi.",Gasoline,161.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,4-Speed A/T,Gray,Black,At least 1 accident or damage reported,Yes,"$6,500"


In [79]:
df = data.groupby('model', as_index = False)[['fuel_type']].count()
df

Unnamed: 0,model,fuel_type
0,124 Spider Abarth,1
1,128 i,1
2,135 i,3
3,135 is,1
4,1500 Big Horn,11
...,...,...
1893,i8 Base,5
1894,tC Anniversary Edition,2
1895,tC Base,1
1896,tC Release Series 6.0,1


In [80]:
# fuel_type 값이 NaN인데 동일한 model 값도 존재하지 않는 경우
df[df['fuel_type'] == 0]

Unnamed: 0,model,fuel_type
18,2 Launch Edition,0
95,500e Battery Electric,0
258,Air Grand Touring,0
259,Air Pure,0
309,Bolt EUV Premier,0
...,...,...
1887,e-tron Premium,0
1888,e-tron Prestige,0
1890,i3 94 Ah,0
1891,i3 Base,0


In [81]:
type(df)

pandas.core.frame.DataFrame

In [82]:
# fuel_type 열의 최빈값
data['fuel_type'].mode()

0    Gasoline
dtype: object

In [83]:
for index, row in data.iterrows() :
    if (type(row.fuel_type) == str) and (len(row.fuel_type) >= 2) :
        continue
        
    else :  
        d = df.loc[df['model'] == row.model]
        
        if (d['fuel_type'].any() != 0) :
            try : 
                data.fuel_type.fillna(method = 'bfill', limit = 1, inplace = True)
            except :
                continue
                
        else :
            try :
                data.fuel_type.fillna('Gasoline', inplace = True)
            except :
                continue

data.isna().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type         0
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

In [84]:
data = data.sort_values('fuel_type')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
1940,Ford,F-250 Lariat,2019,"85,000 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$52,500"
1059,Chevrolet,Silverado 3500 High Country,2015,"77,500 mi.",Diesel,397.0HP 6.6L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,Black,Brown,None reported,Yes,"$54,000"
1281,Chevrolet,Silverado 3500 LTZ,2022,"85,200 mi.",Diesel,445.0HP 6.6L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Gray,None reported,Yes,"$67,500"
1635,Chevrolet,Silverado 3500 LTZ,2018,"92,149 mi.",Diesel,6.6L V8 32V DDI OHV Turbo Diesel,6-Speed Automatic,Black,Jet Black,,,"$52,889"
502,Chevrolet,Express 3500 LT,2016,"120,000 mi.",Diesel,260.0HP 6.6L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Gray,None reported,Yes,"$19,500"
...,...,...,...,...,...,...,...,...,...,...,...,...
2303,Acura,NSX Base,1993,"75,980 mi.",–,–,A/T,Silver,Black,None reported,Yes,"$90,200"
2103,Volvo,850 Turbo,1995,"94,000 mi.",–,–,A/T,White,Black,None reported,Yes,"$4,500"
855,Ford,Bronco,1974,"6,217 mi.",–,–,–,Dark Gray Metallic,–,None reported,Yes,"$115,000"
3213,Ford,Mustang EcoBoost Premium,2019,"31,000 mi.",–,–,6-Speed M/T,Gray,Black,None reported,Yes,"$34,700"


In [85]:
data['fuel_type'].unique()

array(['Diesel', 'E85 Flex Fuel', 'Gasoline', 'Hybrid', 'Plug-In Hybrid',
       'not supported', '–'], dtype=object)

In [86]:
data = data.sort_values('model')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
704,FIAT,124 Spider Abarth,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,At least 1 accident or damage reported,Yes,"$22,500"
2900,BMW,128 i,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,None reported,Yes,"$18,000"
715,BMW,135 i,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,None reported,Yes,"$15,500"
1507,BMW,135 i,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,None reported,Yes,"$11,000"
509,BMW,135 i,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,None reported,Yes,"$15,300"
...,...,...,...,...,...,...,...,...,...,...,...,...
3855,Scion,tC Anniversary Edition,2014,"99,999 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,White,Black,None reported,Yes,"$10,998"
1284,Scion,tC Anniversary Edition,2014,"115,000 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Silver,Silver,None reported,Yes,"$11,495"
177,Scion,tC Base,2013,"177,600 mi.",Gasoline,180.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Silver,Black,At least 1 accident or damage reported,Yes,"$7,100"
690,Scion,tC Release Series 6.0,2010,"120,010 mi.",Gasoline,161.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,4-Speed A/T,Gray,Black,At least 1 accident or damage reported,Yes,"$6,500"


In [87]:
# '–'로 잘못 표기된 NaN 값들을 NaN으로 바꿔주기
for index, row in data.iterrows() :
    if (type(row.fuel_type) == str) and (len(row.fuel_type) >= 2) :
        continue
        
    else :  
        data.loc[index, 'fuel_type'] = np.NaN

In [88]:
data.isna().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type        45
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

In [89]:
# NaN 값들을 다시 한 번 bfill로 채워줌
for index, row in data.iterrows() :
    if (type(row.fuel_type) == str) and (len(row.fuel_type) >= 2) :
        continue
        
    else :
        try : 
            data.fuel_type.fillna(method = 'bfill', limit = 1, inplace = True)
        except :
            continue

In [90]:
data = data.sort_values('fuel_type')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,None reported,Yes,"$56,500"
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,None reported,Yes,"$51,900"
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,"$63,500"
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,None reported,Yes,"$32,000"
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,None reported,Yes,"$73,600"
...,...,...,...,...,...,...,...,...,...,...,...,...
3825,Toyota,Prius Plug-in Base,2014,"106,000 mi.",Plug-In Hybrid,134.0HP 1.8L 4 Cylinder Engine Plug-In Electri...,A/T,Green,Beige,At least 1 accident or damage reported,Yes,"$11,800"
1820,Volvo,XC90 Recharge Plug-In Hybrid T8 Inscription 7 ...,2022,"7,800 mi.",Plug-In Hybrid,455.0HP 2.0L 4 Cylinder Engine Plug-In Electri...,8-Speed A/T,Black,Beige,None reported,Yes,"$66,000"
1509,Hyundai,IONIQ 5 SE,2022,"18,500 mi.",Plug-In Hybrid,320.0HP Electric Motor Electric Fuel System,A/T,White,Black,None reported,Yes,"$42,000"
3700,Toyota,Mirai Base,2016,"40,000 mi.",not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,None reported,Yes,"$9,500"


In [91]:
data['fuel_type'].unique()

array(['Diesel', 'E85 Flex Fuel', 'Gasoline', 'Hybrid', 'Plug-In Hybrid',
       'not supported'], dtype=object)

----------

**accident 결측치 처리**

결측치인 행 제외

In [92]:
# 값 확인
data['accident'].unique()

array(['None reported', 'At least 1 accident or damage reported', nan],
      dtype=object)

In [93]:
# accident 값을 숫자로 바꿔주기 (가변수화)
data = data.replace({'accident': 'None reported'}, 0)
data = data.replace({'accident': 'At least 1 accident or damage reported'}, 1)
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,"$56,500"
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,0.0,Yes,"$51,900"
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,"$63,500"
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,0.0,Yes,"$32,000"
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,"$73,600"
...,...,...,...,...,...,...,...,...,...,...,...,...
3825,Toyota,Prius Plug-in Base,2014,"106,000 mi.",Plug-In Hybrid,134.0HP 1.8L 4 Cylinder Engine Plug-In Electri...,A/T,Green,Beige,1.0,Yes,"$11,800"
1820,Volvo,XC90 Recharge Plug-In Hybrid T8 Inscription 7 ...,2022,"7,800 mi.",Plug-In Hybrid,455.0HP 2.0L 4 Cylinder Engine Plug-In Electri...,8-Speed A/T,Black,Beige,0.0,Yes,"$66,000"
1509,Hyundai,IONIQ 5 SE,2022,"18,500 mi.",Plug-In Hybrid,320.0HP Electric Motor Electric Fuel System,A/T,White,Black,0.0,Yes,"$42,000"
3700,Toyota,Mirai Base,2016,"40,000 mi.",not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,0.0,Yes,"$9,500"


In [94]:
data['accident'].value_counts()

0.0    2910
1.0     986
Name: accident, dtype: int64

In [95]:
# accident가 결측치인 경우는 제외하기로 함
data = data.loc[data['accident'].notnull()]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3896 entries, 3292 to 2894
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   object 
 1   model         3896 non-null   object 
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3413 non-null   object 
 11  price         3896 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 395.7+ KB


-------

**clean_title 결측치 처리**

결측치를 'No'로 변경

In [41]:
data['clean_title'].value_counts()

Yes    3413
Name: clean_title, dtype: int64

In [43]:
data['clean_title'].unique()

array(['Yes', nan], dtype=object)

In [96]:
# 결측치를 No로 변경
data.clean_title.fillna('No', inplace = True)
data['clean_title'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Yes    3413
No      483
Name: clean_title, dtype: int64

------

### 가변수화

In [97]:
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,"$56,500"
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,0.0,Yes,"$51,900"
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,"$63,500"
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,0.0,Yes,"$32,000"
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,"$73,600"


**price 가변수화**

In [108]:
# $ 지우기
for index, row in data.iterrows():
    data.loc[index, 'price'] = data.loc[index, 'price'].strip("$")

30,999
54,998
46,598
41,899
5,500
9,000
39,998
21,900
19,500
39,650
53,000
52,000
65,000
41,998
43,500
16,500
38,781
32,000
38,598
11,500
15,999
24,000
15,499
15,051
10,000
4,900
7,899
90,200
58,998
4,000
5,000
47,950
23,500
40,598
36,570
48,998
39,798
10,995
6,700
10,999
5,500
34,998
8,500
21,000
14,500
13,888
11,000
14,798
25,000
26,599
20,999
26,000
15,750
143,900
17,500
24,840
27,698
33,000
12,000
7,700
18,998
19,000
15,450
43,900
28,900
26,600
32,400
53,900
27,500
43,900
39,000
28,500
75,900
35,345
75,000
20,500
28,900
21,000
35,645
156,900
33,995
39,000
159,500
184,606
69,995
80,000
279,950
18,999
47,000
40,000
22,000
19,998
39,998
18,000
11,000
5,700
59,900
37,000
42,500
33,000
61,059
52,500
48,500
37,499
52,000
62,000
22,000
20,499
76,400
74,900
20,995
50,000
52,598
43,500
47,798
38,999
52,900
24,700
42,000
21,990
19,500
20,000
23,750
30,000
22,500
39,900
61,998
162,500
38,000
38,999
35,999
44,900
67,900
39,998
59,200
45,000
67,000
27,000
47,995
50,000
128,900
91,500
12,000
52,

45,023
86,500
7,999
19,795
9,495
6,500
29,182
29,614
31,000
58,504
48,999
51,999
31,621
68,995
10,995
63,500
19,995
28,979
33,172
46,552
32,769
61,500
31,000
46,900
6,450
34,000
6,999
35,690
8,500
8,250
19,395
7,500
11,500
62,900
91,000
77,999
90,788
16,000
61,900
94,999
82,950
62,890
12,599
60,867
45,950
57,352
44,605
34,500
28,590
16,250
26,900
20,000
10,999
62,999
8,500
3,000
30,825
51,500
36,750
47,995
15,499
77,900
39,000
72,950
47,850
9,000
72,900
18,300
30,510
39,979
15,000
89,000
15,000
54,000
6,299
26,000
9,500
3,000
47,500
37,500
60,000
54,999
16,500
54,490
30,000
54,000
19,995
31,500
35,000
16,500
31,933
34,995
65,900
23,598
24,998
48,459
9,999
70,000
44,000
64,500
66,500
115,000
56,900
51,000
65,994
69,999
71,900
9,500
38,000
77,900
53,500
36,000
26,500
14,000
38,500
23,000
42,500
46,995
30,500
26,900
49,000
42,500
37,000
7,850
16,000
13,500
66,400
49,990
44,999
10,000
13,000
11,499
13,895
23,999
35,500
59,000
12,000
9,499
12,500
25,000
24,350
15,000
68,000
31,000
38,000
9,

279,000
41,900
28,700
69,645
43,000
7,999
13,000
38,900
37,900
84,645
67,000
44,999
44,999
52,900
19,000
36,000
41,900
64,250
9,000
17,800
18,000
12,900
11,500
8,500
28,000
11,500
6,000
11,500
12,000
20,500
19,212
25,099
19,900
29,995
22,966
22,000
24,995
25,215
4,999
2,500
13,250
11,500
20,500
12,900
11,000
18,900
20,199
33,687
35,619
18,500
36,900
40,655
16,770
32,250
27,500
8,300
32,541
36,969
5,499
6,950
14,311
21,800
26,288
22,389
25,495
26,485
25,382
14,000
6,800
52,991
6,300
15,000
29,255
17,599
6,500
25,500
24,899
27,518
28,535
4,300
179,900
171,900
185,000
324,995
139,000
279,950
22,500
8,995
21,500
19,500
65,000
83,598
75,000
39,500
13,999
135,800
204,900
215,000
142,998
73,000
128,000
84,500
130,000
65,000
133,000
69,500
11,999
9,900
16,500
43,999
11,900
62,479
29,000
130,000
83,999
200,000
31,999
22,000
24,500
175,000
23,500
26,000
35,750
33,995
39,853
47,645
105,500
38,880
38,598
18,995
11,900
9,250
33,490
56,999
13,600
28,500
5,800
16,500
72,999
127,899
24,000
53,800
13,0

38,700
23,500
48,500
27,750
18,500
46,000
36,999
39,857
43,837
40,368
31,906
33,000
27,999
35,500
30,500
29,000
16,900
46,500
36,990
8,550
23,500
52,590
24,999
36,000
33,333
38,298
3,500
8,000
13,000
12,000
27,500
16,750
24,325
14,500
15,550
9,850
13,200
20,900
45,000
39,998
35,000
41,500
25,998
47,999
23,000
14,980
19,290
17,000
15,995
13,000
9,750
15,000
20,900
13,800
4,300
6,000
9,499
48,000
16,500
29,950
20,000
14,000
18,999
16,000
5,500
32,000
38,322
28,890
25,500
27,900
12,449
10,000
5,000
3,500
12,000
13,000
29,798
9,000
30,000
38,698
23,000
23,999
32,750
46,900
34,645
39,127
19,000
24,999
36,800
44,900
7,000
29,000
42,000
8,000
65,000
3,550
27,800
5,800
10,500
18,000
30,000
66,000
4,500
17,900
10,800
36,000
15,900
61,999
10,299
5,500
54,900
36,500
13,900
7,500
26,599
50,000
54,599
29,900
30,500
28,825
25,999
11,500
6,499
37,000
22,000
5,000


In [109]:
# , 지우기
for index, row in data.iterrows():
    data.loc[index, 'price'] = data.loc[index, 'price'].replace(',', '')

30999
54998
46598
41899
5500
9000
39998
21900
19500
39650
53000
52000
65000
41998
43500
16500
38781
32000
38598
11500
15999
24000
15499
15051
10000
4900
7899
90200
58998
4000
5000
47950
23500
40598
36570
48998
39798
10995
6700
10999
5500
34998
8500
21000
14500
13888
11000
14798
25000
26599
20999
26000
15750
143900
17500
24840
27698
33000
12000
7700
18998
19000
15450
43900
28900
26600
32400
53900
27500
43900
39000
28500
75900
35345
75000
20500
28900
21000
35645
156900
33995
39000
159500
184606
69995
80000
279950
18999
47000
40000
22000
19998
39998
18000
11000
5700
59900
37000
42500
33000
61059
52500
48500
37499
52000
62000
22000
20499
76400
74900
20995
50000
52598
43500
47798
38999
52900
24700
42000
21990
19500
20000
23750
30000
22500
39900
61998
162500
38000
38999
35999
44900
67900
39998
59200
45000
67000
27000
47995
50000
128900
91500
12000
52000
119900
54900
29995
13500
34999
59950
30000
78500
78900
33610
19000
15495
25999
31000
49936
21375
15400
13800
34995
17000
36340
39900
31000
1

55000
35000
9200
29800
64000
12500
36500
38500
51900
35900
40100
3900
20000
22000
16000
29950
10300
57000
14300
8999
27000
22000
23500
32300
18500
60999
49599
54000
9985
4700
15500
4500
30490
69998
59000
12990
28900
12240
61995
41999
45000
32500
31000
28500
19000
26500
33900
10300
29000
16995
52500
38000
10500
42500
73500
73600
42500
47756
26095
18750
13000
50000
40000
45998
41000
15900
41000
32000
99750
40000
18000
49500
31900
20000
49900
84000
51899
52000
33000
51500
70250
27995
39750
5800
14500
24500
32900
15500
36750
67963
55000
81500
16900
36500
22989
6100
38000
32750
25500
25000
34999
24950
82000
12500
7500
25500
14000
9500
4999
71999
92888
42900
51000
29500
45500
42995
30000
32000
38436
19780
30000
34000
30900
27700
27654
12433
12995
57850
24440
17000
44500
7900
27633
74500
30000
43999
22999
35899
62499
34000
93999
56900
18400
29900
12250
27785
25925
22000
6500
9500
12300
73000
53000
67800
58500
35500
5000
13500
69500
49000
50500
53950
29500
32990
44249
41299
54543
29999
48219
4

15000
229000
19999
38998
39998
104999
99900
79999
19800
16500
78000
144664
4200
23000
15500
5100
6000
10000
4495
4200
37999
8950
28000
28000
3000
34999
57000
27000
5800
6750
8750
11970
19500
10500
33000
6700
29895
28196
20900
20999
59950
15998
25884
13000
28906
14999
21500
21500
31950
4200
13999
23800
18998
11600
2899
21000
8999
9000
21500
29000
23599
15000
11995
27900
47214
6900
20000
12500
15900
17500
11500
23625
6750
29725
69500
3950
7000
32324
48000
3850
6700
14800
6999
65990
7500
12000
25500
16995
50998
10200
12000
13500
28000
9950
11000
9975
39884
38000
32612
30999
37000
26498
36500
29000
3200
131999
9950
95000
6700
10900
17000
15995
6750
8500
8500
21865
13000
63900
41000
19500
29510
114000
19599
57000
72999
18999
28999
15500
149500
71999
113000
2300
26999
24980
59000
5899
4000
5000
26500
13998
7000
29500
19500
15500
8250
8500
17000
4500
17000
28500
35999
2500
11000
17900
13300
21000
19950
19000
5995
7200
21999
12850
10000
17500
12900
10500
32000
72000
115000
50999
30000
75000
37

In [111]:
# str -> int 로 단위 바꾸기
data['price'] = data['price'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,30999
1,0,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,54998
2,0,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,46598
3,0,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,41899
4,0,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,5500


In [112]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   object 
 1   model         3896 non-null   object 
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(2), object(9)
memory usage: 365.4+ KB


**brand 가변수화**

In [47]:
data['brand'].value_counts()

Ford             372
BMW              367
Mercedes-Benz    310
Chevrolet        285
Toyota           196
Audi             194
Porsche          186
Lexus            161
Jeep             138
Land             126
Nissan           114
Cadillac         104
RAM               90
Dodge             89
GMC               88
Tesla             87
Kia               75
Hyundai           68
Acura             63
Subaru            63
Mazda             62
Honda             60
Volkswagen        59
INFINITI          56
Lincoln           50
Jaguar            46
Volvo             37
Maserati          33
MINI              32
Bentley           31
Buick             30
Chrysler          27
Lamborghini       26
Mitsubishi        20
Genesis           19
Hummer            16
Alfa              16
Rivian            15
Pontiac           15
Ferrari           12
Rolls-Royce       10
Aston              8
McLaren            6
Scion              6
FIAT               5
Saturn             5
Lotus              3
Mercury      

In [98]:
data = data.sort_values('brand')

data.reset_index(drop = True, inplace = True)
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Acura,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,"$30,999"
1,Acura,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,"$54,998"
2,Acura,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,"$46,598"
3,Acura,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,"$41,899"
4,Acura,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,"$5,500"


In [52]:
len(data)

3896

In [70]:
data['brand'].value_counts()

Ford             310
BMW              306
Mercedes-Benz    258
Chevrolet        237
Toyota           163
                ... 
225                1
226                1
227                1
229                1
185                1
Name: brand, Length: 360, dtype: int64

In [104]:
k = 0
for i in range(0, len(data)) :
    print(i)
    if(i + 1 >= len(data)) :
        data.iloc[i, 0] = k
        break
    
    else :
        if(data.iloc[i, 0] == data.iloc[i + 1, 0]) :
            print('case2')
            print('k=', k)
            data.iloc[i, 0] = k
        
        else :
            print('case3')
            print('k=', k)
            data.iloc[i, 0] = k
            k = k + 1

data.head()

0
case2
k= 0
1
case2
k= 0
2
case2
k= 0
3
case2
k= 0
4
case2
k= 0
5
case2
k= 0
6
case2
k= 0
7
case2
k= 0
8
case2
k= 0
9
case2
k= 0
10
case2
k= 0
11
case2
k= 0
12
case2
k= 0
13
case2
k= 0
14
case2
k= 0
15
case2
k= 0
16
case2
k= 0
17
case2
k= 0
18
case2
k= 0
19
case2
k= 0
20
case2
k= 0
21
case2
k= 0
22
case2
k= 0
23
case2
k= 0
24
case2
k= 0
25
case2
k= 0
26
case2
k= 0
27
case2
k= 0
28
case2
k= 0
29
case2
k= 0
30
case2
k= 0
31
case2
k= 0
32
case2
k= 0
33
case2
k= 0
34
case2
k= 0
35
case2
k= 0
36
case2
k= 0
37
case2
k= 0
38
case2
k= 0
39
case2
k= 0
40
case2
k= 0
41
case2
k= 0
42
case2
k= 0
43
case2
k= 0
44
case2
k= 0
45
case2
k= 0
46
case2
k= 0
47
case2
k= 0
48
case2
k= 0
49
case2
k= 0
50
case2
k= 0
51
case2
k= 0
52
case2
k= 0
53
case2
k= 0
54
case2
k= 0
55
case2
k= 0
56
case2
k= 0
57
case2
k= 0
58
case2
k= 0
59
case2
k= 0
60
case2
k= 0
61
case2
k= 0
62
case3
k= 0
63
case2
k= 1
64
case2
k= 1
65
case2
k= 1
66
case2
k= 1
67
case2
k= 1
68
case2
k= 1
69
case2
k= 1
70
case2
k= 1
71
case2
k= 1
72

case2
k= 4
606
case2
k= 4
607
case2
k= 4
608
case2
k= 4
609
case2
k= 4
610
case2
k= 4
611
case2
k= 4
612
case2
k= 4
613
case2
k= 4
614
case2
k= 4
615
case2
k= 4
616
case2
k= 4
617
case2
k= 4
618
case2
k= 4
619
case2
k= 4
620
case2
k= 4
621
case2
k= 4
622
case2
k= 4
623
case2
k= 4
624
case2
k= 4
625
case2
k= 4
626
case2
k= 4
627
case2
k= 4
628
case2
k= 4
629
case2
k= 4
630
case2
k= 4
631
case2
k= 4
632
case2
k= 4
633
case2
k= 4
634
case2
k= 4
635
case2
k= 4
636
case2
k= 4
637
case2
k= 4
638
case2
k= 4
639
case2
k= 4
640
case2
k= 4
641
case2
k= 4
642
case2
k= 4
643
case2
k= 4
644
case2
k= 4
645
case2
k= 4
646
case2
k= 4
647
case3
k= 4
648
case2
k= 5
649
case2
k= 5
650
case2
k= 5
651
case2
k= 5
652
case2
k= 5
653
case2
k= 5
654
case2
k= 5
655
case2
k= 5
656
case2
k= 5
657
case2
k= 5
658
case2
k= 5
659
case2
k= 5
660
case2
k= 5
661
case2
k= 5
662
case2
k= 5
663
case2
k= 5
664
case2
k= 5
665
case2
k= 5
666
case2
k= 5
667
case2
k= 5
668
case2
k= 5
669
case2
k= 5
670
case2
k= 5
671
case2
k= 5

1250
case2
k= 14
1251
case2
k= 14
1252
case2
k= 14
1253
case2
k= 14
1254
case2
k= 14
1255
case2
k= 14
1256
case2
k= 14
1257
case2
k= 14
1258
case2
k= 14
1259
case2
k= 14
1260
case2
k= 14
1261
case2
k= 14
1262
case2
k= 14
1263
case2
k= 14
1264
case2
k= 14
1265
case2
k= 14
1266
case2
k= 14
1267
case2
k= 14
1268
case2
k= 14
1269
case2
k= 14
1270
case2
k= 14
1271
case2
k= 14
1272
case2
k= 14
1273
case2
k= 14
1274
case2
k= 14
1275
case2
k= 14
1276
case2
k= 14
1277
case2
k= 14
1278
case2
k= 14
1279
case2
k= 14
1280
case2
k= 14
1281
case2
k= 14
1282
case2
k= 14
1283
case2
k= 14
1284
case2
k= 14
1285
case2
k= 14
1286
case2
k= 14
1287
case2
k= 14
1288
case2
k= 14
1289
case2
k= 14
1290
case2
k= 14
1291
case2
k= 14
1292
case2
k= 14
1293
case2
k= 14
1294
case2
k= 14
1295
case2
k= 14
1296
case2
k= 14
1297
case2
k= 14
1298
case2
k= 14
1299
case2
k= 14
1300
case2
k= 14
1301
case2
k= 14
1302
case2
k= 14
1303
case2
k= 14
1304
case2
k= 14
1305
case2
k= 14
1306
case2
k= 14
1307
case2
k= 14
1308
case2
k= 

1808
case2
k= 19
1809
case2
k= 19
1810
case2
k= 19
1811
case2
k= 19
1812
case2
k= 19
1813
case2
k= 19
1814
case2
k= 19
1815
case2
k= 19
1816
case2
k= 19
1817
case2
k= 19
1818
case2
k= 19
1819
case2
k= 19
1820
case2
k= 19
1821
case2
k= 19
1822
case2
k= 19
1823
case2
k= 19
1824
case2
k= 19
1825
case2
k= 19
1826
case2
k= 19
1827
case2
k= 19
1828
case2
k= 19
1829
case2
k= 19
1830
case2
k= 19
1831
case2
k= 19
1832
case2
k= 19
1833
case2
k= 19
1834
case2
k= 19
1835
case2
k= 19
1836
case2
k= 19
1837
case2
k= 19
1838
case2
k= 19
1839
case2
k= 19
1840
case2
k= 19
1841
case2
k= 19
1842
case2
k= 19
1843
case2
k= 19
1844
case2
k= 19
1845
case2
k= 19
1846
case2
k= 19
1847
case2
k= 19
1848
case2
k= 19
1849
case2
k= 19
1850
case2
k= 19
1851
case2
k= 19
1852
case2
k= 19
1853
case2
k= 19
1854
case3
k= 19
1855
case2
k= 20
1856
case2
k= 20
1857
case2
k= 20
1858
case2
k= 20
1859
case2
k= 20
1860
case2
k= 20
1861
case2
k= 20
1862
case2
k= 20
1863
case2
k= 20
1864
case2
k= 20
1865
case2
k= 20
1866
case2
k= 

2361
case2
k= 27
2362
case2
k= 27
2363
case2
k= 27
2364
case2
k= 27
2365
case2
k= 27
2366
case2
k= 27
2367
case2
k= 27
2368
case2
k= 27
2369
case2
k= 27
2370
case2
k= 27
2371
case2
k= 27
2372
case2
k= 27
2373
case2
k= 27
2374
case2
k= 27
2375
case2
k= 27
2376
case2
k= 27
2377
case2
k= 27
2378
case2
k= 27
2379
case2
k= 27
2380
case2
k= 27
2381
case2
k= 27
2382
case2
k= 27
2383
case2
k= 27
2384
case2
k= 27
2385
case2
k= 27
2386
case2
k= 27
2387
case2
k= 27
2388
case2
k= 27
2389
case2
k= 27
2390
case2
k= 27
2391
case2
k= 27
2392
case2
k= 27
2393
case2
k= 27
2394
case2
k= 27
2395
case2
k= 27
2396
case2
k= 27
2397
case2
k= 27
2398
case2
k= 27
2399
case2
k= 27
2400
case2
k= 27
2401
case2
k= 27
2402
case2
k= 27
2403
case2
k= 27
2404
case2
k= 27
2405
case2
k= 27
2406
case2
k= 27
2407
case2
k= 27
2408
case2
k= 27
2409
case2
k= 27
2410
case2
k= 27
2411
case2
k= 27
2412
case2
k= 27
2413
case2
k= 27
2414
case2
k= 27
2415
case2
k= 27
2416
case2
k= 27
2417
case2
k= 27
2418
case2
k= 27
2419
case2
k= 

2891
case2
k= 36
2892
case2
k= 36
2893
case2
k= 36
2894
case2
k= 36
2895
case2
k= 36
2896
case2
k= 36
2897
case2
k= 36
2898
case2
k= 36
2899
case2
k= 36
2900
case2
k= 36
2901
case2
k= 36
2902
case2
k= 36
2903
case2
k= 36
2904
case2
k= 36
2905
case2
k= 36
2906
case2
k= 36
2907
case2
k= 36
2908
case2
k= 36
2909
case2
k= 36
2910
case2
k= 36
2911
case2
k= 36
2912
case2
k= 36
2913
case2
k= 36
2914
case2
k= 36
2915
case2
k= 36
2916
case2
k= 36
2917
case2
k= 36
2918
case2
k= 36
2919
case2
k= 36
2920
case2
k= 36
2921
case2
k= 36
2922
case2
k= 36
2923
case2
k= 36
2924
case2
k= 36
2925
case2
k= 36
2926
case2
k= 36
2927
case2
k= 36
2928
case2
k= 36
2929
case2
k= 36
2930
case2
k= 36
2931
case2
k= 36
2932
case2
k= 36
2933
case2
k= 36
2934
case2
k= 36
2935
case2
k= 36
2936
case2
k= 36
2937
case2
k= 36
2938
case2
k= 36
2939
case2
k= 36
2940
case2
k= 36
2941
case2
k= 36
2942
case2
k= 36
2943
case2
k= 36
2944
case2
k= 36
2945
case2
k= 36
2946
case2
k= 36
2947
case2
k= 36
2948
case2
k= 36
2949
case2
k= 

3434
case2
k= 46
3435
case2
k= 46
3436
case2
k= 46
3437
case2
k= 46
3438
case3
k= 46
3439
case2
k= 47
3440
case3
k= 47
3441
case2
k= 48
3442
case2
k= 48
3443
case2
k= 48
3444
case2
k= 48
3445
case3
k= 48
3446
case2
k= 49
3447
case2
k= 49
3448
case2
k= 49
3449
case2
k= 49
3450
case2
k= 49
3451
case3
k= 49
3452
case2
k= 50
3453
case2
k= 50
3454
case2
k= 50
3455
case2
k= 50
3456
case2
k= 50
3457
case2
k= 50
3458
case2
k= 50
3459
case2
k= 50
3460
case2
k= 50
3461
case2
k= 50
3462
case2
k= 50
3463
case2
k= 50
3464
case2
k= 50
3465
case2
k= 50
3466
case2
k= 50
3467
case2
k= 50
3468
case2
k= 50
3469
case2
k= 50
3470
case2
k= 50
3471
case2
k= 50
3472
case2
k= 50
3473
case2
k= 50
3474
case2
k= 50
3475
case2
k= 50
3476
case2
k= 50
3477
case2
k= 50
3478
case2
k= 50
3479
case2
k= 50
3480
case2
k= 50
3481
case2
k= 50
3482
case2
k= 50
3483
case2
k= 50
3484
case2
k= 50
3485
case2
k= 50
3486
case2
k= 50
3487
case2
k= 50
3488
case2
k= 50
3489
case2
k= 50
3490
case2
k= 50
3491
case2
k= 50
3492
case2
k= 

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,"$30,999"
1,0,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,"$54,998"
2,0,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,"$46,598"
3,0,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,"$41,899"
4,0,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,"$5,500"


In [105]:
data.tail()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3891,55,XC90 3.2,2011,"88,300 mi.",Gasoline,235.0HP 3.2L Straight 6 Cylinder Engine Gasoli...,A/T,Blue,Beige,1.0,Yes,"$11,500"
3892,55,XC90 3.2,2008,"196,000 mi.",Gasoline,235.0HP 3.2L Straight 6 Cylinder Engine Gasoli...,A/T,Black,Beige,0.0,Yes,"$6,499"
3893,55,XC90 T6 Inscription,2019,"60,000 mi.",Gasoline,316.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,0.0,Yes,"$37,000"
3894,55,XC90 T6 Momentum,2018,"110,380 mi.",Gasoline,316.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Beige,1.0,Yes,"$22,000"
3895,56,ForTwo Pure,2008,"61,595 mi.",Gasoline,70.0HP 1.0L 3 Cylinder Engine Gasoline Fuel,5-Speed A/T,Blue,Gray,0.0,Yes,"$5,000"


In [113]:
# str -> int 로 단위 바꾸기
data['brand'] = data['brand'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,30999
1,0,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,54998
2,0,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,46598
3,0,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,41899
4,0,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,5500


In [114]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   int64  
 1   model         3896 non-null   object 
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 365.4+ KB
