# 중고차 가격 예측하기

<span style="color:blue"> 환경 준비 </span>

In [1]:
# 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%config InlineBackend.figure_format = 'retina'


In [2]:
# 데이터 읽어오기
path = 'D:\PRACTICE\\used_cars.csv'
data = pd.read_csv(path)

<br/>
<span style="color:blue"> 데이터 이해 </span>

In [3]:
# 상위 몇 개 행 확인
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Ford,Utility Police Interceptor Base,2013,"51,000 mi.",E85 Flex Fuel,300.0HP 3.7L V6 Cylinder Engine Flex Fuel Capa...,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$10,300"
1,Hyundai,Palisade SEL,2021,"34,742 mi.",Gasoline,3.8L V6 24V GDI DOHC,8-Speed Automatic,Moonlight Cloud,Gray,At least 1 accident or damage reported,Yes,"$38,005"
2,Lexus,RX 350 RX 350,2022,"22,372 mi.",Gasoline,3.5 Liter DOHC,Automatic,Blue,Black,None reported,,"$54,598"
3,INFINITI,Q50 Hybrid Sport,2015,"88,900 mi.",Hybrid,354.0HP 3.5L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,None reported,Yes,"$15,500"
4,Audi,Q3 45 S line Premium Plus,2021,"9,835 mi.",Gasoline,2.0L I4 16V GDI DOHC Turbo,8-Speed Automatic,Glacier White Metallic,Black,None reported,,"$34,999"


In [4]:
data.tail()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
4004,Bentley,Continental GT Speed,2023,714 mi.,Gasoline,6.0L W12 48V PDI DOHC Twin Turbo,8-Speed Automatic with Auto-Shift,C / C,Hotspur,None reported,Yes,"$349,950"
4005,Audi,S4 3.0T Premium Plus,2022,"10,900 mi.",Gasoline,349.0HP 3.0L V6 Cylinder Engine Gasoline Fuel,Transmission w/Dual Shift Mode,Black,Black,None reported,Yes,"$53,900"
4006,Porsche,Taycan,2022,"2,116 mi.",,Electric,Automatic,Black,Black,None reported,,"$90,998"
4007,Ford,F-150 Raptor,2020,"33,000 mi.",Gasoline,450.0HP 3.5L V6 Cylinder Engine Gasoline Fuel,A/T,Blue,Black,None reported,Yes,"$62,999"
4008,BMW,X3 xDrive30i,2020,"43,000 mi.",Gasoline,248.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Brown,At least 1 accident or damage reported,Yes,"$40,000"


In [5]:
# 변수 확인
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4009 entries, 0 to 4008
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   brand         4009 non-null   object
 1   model         4009 non-null   object
 2   model_year    4009 non-null   int64 
 3   milage        4009 non-null   object
 4   fuel_type     3839 non-null   object
 5   engine        4009 non-null   object
 6   transmission  4009 non-null   object
 7   ext_col       4009 non-null   object
 8   int_col       4009 non-null   object
 9   accident      3896 non-null   object
 10  clean_title   3413 non-null   object
 11  price         4009 non-null   object
dtypes: int64(1), object(11)
memory usage: 376.0+ KB


* **Brand & Model:** Identify the brand or company name along with the specific model of each vehicle.
* **Model Year:** Discover the manufacturing year of the vehicles, crucial for assessing depreciation and technology advancements.
* **Mileage:** Obtain the mileage of each vehicle, a key indicator of wear and tear and potential maintenance requirements.
* **Fuel Type:** Learn about the type of fuel the vehicles run on, whether it's gasoline, diesel, electric, or hybrid.
* **Engine Type:** Understand the engine specifications, shedding light on performance and efficiency.
* **Transmission:** Determine the transmission type, whether automatic, manual, or another variant.
* **Exterior & Interior Colors:** Explore the aesthetic aspects of the vehicles, including exterior and interior color options.
* **Accident History:** Discover whether a vehicle has a prior history of accidents or damage, crucial for informed decision-making.
* **Clean Title:** Evaluate the availability of a clean title, which can impact the vehicle's resale value and legal status.
* **Price:** Access the listed prices for each vehicle, aiding in price comparison and budgeting

<br/>
<span style="color:blue"> 데이터 준비 </span>

### 결측치 처리

In [6]:
data.isna().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type       170
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

**fuel_type 결측치 처리**

model 값이 동일한 행의 fuel_type 값을 넣어줌

In [7]:
# 모델을 기준으로 정렬
data = data.sort_values('model')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
704,FIAT,124 Spider Abarth,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,At least 1 accident or damage reported,Yes,"$22,500"
2900,BMW,128 i,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,None reported,Yes,"$18,000"
509,BMW,135 i,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,None reported,Yes,"$15,300"
1507,BMW,135 i,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,None reported,Yes,"$11,000"
715,BMW,135 i,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,None reported,Yes,"$15,500"
...,...,...,...,...,...,...,...,...,...,...,...,...
3855,Scion,tC Anniversary Edition,2014,"99,999 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,White,Black,None reported,Yes,"$10,998"
1284,Scion,tC Anniversary Edition,2014,"115,000 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Silver,Silver,None reported,Yes,"$11,495"
177,Scion,tC Base,2013,"177,600 mi.",Gasoline,180.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Silver,Black,At least 1 accident or damage reported,Yes,"$7,100"
690,Scion,tC Release Series 6.0,2010,"120,010 mi.",Gasoline,161.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,4-Speed A/T,Gray,Black,At least 1 accident or damage reported,Yes,"$6,500"


In [8]:
df = data.groupby('model', as_index = False)[['fuel_type']].count()
df

Unnamed: 0,model,fuel_type
0,124 Spider Abarth,1
1,128 i,1
2,135 i,3
3,135 is,1
4,1500 Big Horn,11
...,...,...
1893,i8 Base,5
1894,tC Anniversary Edition,2
1895,tC Base,1
1896,tC Release Series 6.0,1


In [9]:
# fuel_type 값이 NaN인데 동일한 model 값도 존재하지 않는 경우
df[df['fuel_type'] == 0]

Unnamed: 0,model,fuel_type
18,2 Launch Edition,0
95,500e Battery Electric,0
258,Air Grand Touring,0
259,Air Pure,0
309,Bolt EUV Premier,0
...,...,...
1887,e-tron Premium,0
1888,e-tron Prestige,0
1890,i3 94 Ah,0
1891,i3 Base,0


In [10]:
type(df)

pandas.core.frame.DataFrame

In [11]:
# fuel_type 열의 최빈값
data['fuel_type'].mode()

0    Gasoline
dtype: object

In [12]:
for index, row in data.iterrows() :
    if (type(row.fuel_type) == str) and (len(row.fuel_type) >= 2) :
        continue
        
    else :  
        d = df.loc[df['model'] == row.model]
        
        if (d['fuel_type'].any() != 0) :
            try : 
                data.fuel_type.fillna(method = 'bfill', limit = 1, inplace = True)
            except :
                continue
                
        else :
            try :
                data.fuel_type.fillna('Gasoline', inplace = True)
            except :
                continue

data.isna().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type         0
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

In [13]:
data = data.sort_values('fuel_type')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
1940,Ford,F-250 Lariat,2019,"85,000 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,Black,Black,At least 1 accident or damage reported,Yes,"$52,500"
1059,Chevrolet,Silverado 3500 High Country,2015,"77,500 mi.",Diesel,397.0HP 6.6L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,Black,Brown,None reported,Yes,"$54,000"
1281,Chevrolet,Silverado 3500 LTZ,2022,"85,200 mi.",Diesel,445.0HP 6.6L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Gray,None reported,Yes,"$67,500"
1635,Chevrolet,Silverado 3500 LTZ,2018,"92,149 mi.",Diesel,6.6L V8 32V DDI OHV Turbo Diesel,6-Speed Automatic,Black,Jet Black,,,"$52,889"
502,Chevrolet,Express 3500 LT,2016,"120,000 mi.",Diesel,260.0HP 6.6L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Gray,None reported,Yes,"$19,500"
...,...,...,...,...,...,...,...,...,...,...,...,...
2303,Acura,NSX Base,1993,"75,980 mi.",–,–,A/T,Silver,Black,None reported,Yes,"$90,200"
2103,Volvo,850 Turbo,1995,"94,000 mi.",–,–,A/T,White,Black,None reported,Yes,"$4,500"
855,Ford,Bronco,1974,"6,217 mi.",–,–,–,Dark Gray Metallic,–,None reported,Yes,"$115,000"
3213,Ford,Mustang EcoBoost Premium,2019,"31,000 mi.",–,–,6-Speed M/T,Gray,Black,None reported,Yes,"$34,700"


In [14]:
data['fuel_type'].unique()

array(['Diesel', 'E85 Flex Fuel', 'Gasoline', 'Hybrid', 'Plug-In Hybrid',
       'not supported', '–'], dtype=object)

In [15]:
data = data.sort_values('model')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
704,FIAT,124 Spider Abarth,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,At least 1 accident or damage reported,Yes,"$22,500"
2900,BMW,128 i,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,None reported,Yes,"$18,000"
715,BMW,135 i,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,None reported,Yes,"$15,500"
1507,BMW,135 i,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,None reported,Yes,"$11,000"
509,BMW,135 i,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,None reported,Yes,"$15,300"
...,...,...,...,...,...,...,...,...,...,...,...,...
3855,Scion,tC Anniversary Edition,2014,"99,999 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,White,Black,None reported,Yes,"$10,998"
1284,Scion,tC Anniversary Edition,2014,"115,000 mi.",Gasoline,179.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Silver,Silver,None reported,Yes,"$11,495"
177,Scion,tC Base,2013,"177,600 mi.",Gasoline,180.0HP 2.5L 4 Cylinder Engine Gasoline Fuel,6-Speed A/T,Silver,Black,At least 1 accident or damage reported,Yes,"$7,100"
690,Scion,tC Release Series 6.0,2010,"120,010 mi.",Gasoline,161.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,4-Speed A/T,Gray,Black,At least 1 accident or damage reported,Yes,"$6,500"


In [16]:
# '–'로 잘못 표기된 NaN 값들을 NaN으로 바꿔주기
for index, row in data.iterrows() :
    if (type(row.fuel_type) == str) and (len(row.fuel_type) >= 2) :
        continue
        
    else :  
        data.loc[index, 'fuel_type'] = np.NaN

In [17]:
data.isna().sum()

brand             0
model             0
model_year        0
milage            0
fuel_type        45
engine            0
transmission      0
ext_col           0
int_col           0
accident        113
clean_title     596
price             0
dtype: int64

In [18]:
# NaN 값들을 다시 한 번 bfill로 채워줌
for index, row in data.iterrows() :
    if (type(row.fuel_type) == str) and (len(row.fuel_type) >= 2) :
        continue
        
    else :
        try : 
            data.fuel_type.fillna(method = 'bfill', limit = 1, inplace = True)
        except :
            continue

In [19]:
data = data.sort_values('fuel_type')
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,None reported,Yes,"$56,500"
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,None reported,Yes,"$51,900"
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,None reported,Yes,"$63,500"
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,None reported,Yes,"$32,000"
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,None reported,Yes,"$73,600"
...,...,...,...,...,...,...,...,...,...,...,...,...
3825,Toyota,Prius Plug-in Base,2014,"106,000 mi.",Plug-In Hybrid,134.0HP 1.8L 4 Cylinder Engine Plug-In Electri...,A/T,Green,Beige,At least 1 accident or damage reported,Yes,"$11,800"
1820,Volvo,XC90 Recharge Plug-In Hybrid T8 Inscription 7 ...,2022,"7,800 mi.",Plug-In Hybrid,455.0HP 2.0L 4 Cylinder Engine Plug-In Electri...,8-Speed A/T,Black,Beige,None reported,Yes,"$66,000"
1509,Hyundai,IONIQ 5 SE,2022,"18,500 mi.",Plug-In Hybrid,320.0HP Electric Motor Electric Fuel System,A/T,White,Black,None reported,Yes,"$42,000"
3700,Toyota,Mirai Base,2016,"40,000 mi.",not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,None reported,Yes,"$9,500"


In [20]:
data['fuel_type'].unique()

array(['Diesel', 'E85 Flex Fuel', 'Gasoline', 'Hybrid', 'Plug-In Hybrid',
       'not supported'], dtype=object)

----------

**accident 결측치 처리**

결측치인 행 제외

In [21]:
# 값 확인
data['accident'].unique()

array(['None reported', 'At least 1 accident or damage reported', nan],
      dtype=object)

In [22]:
# accident 값을 숫자로 바꿔주기 (가변수화)
data = data.replace({'accident': 'None reported'}, 0)
data = data.replace({'accident': 'At least 1 accident or damage reported'}, 1)
data

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,"$56,500"
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,0.0,Yes,"$51,900"
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,"$63,500"
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,0.0,Yes,"$32,000"
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,"$73,600"
...,...,...,...,...,...,...,...,...,...,...,...,...
3825,Toyota,Prius Plug-in Base,2014,"106,000 mi.",Plug-In Hybrid,134.0HP 1.8L 4 Cylinder Engine Plug-In Electri...,A/T,Green,Beige,1.0,Yes,"$11,800"
1820,Volvo,XC90 Recharge Plug-In Hybrid T8 Inscription 7 ...,2022,"7,800 mi.",Plug-In Hybrid,455.0HP 2.0L 4 Cylinder Engine Plug-In Electri...,8-Speed A/T,Black,Beige,0.0,Yes,"$66,000"
1509,Hyundai,IONIQ 5 SE,2022,"18,500 mi.",Plug-In Hybrid,320.0HP Electric Motor Electric Fuel System,A/T,White,Black,0.0,Yes,"$42,000"
3700,Toyota,Mirai Base,2016,"40,000 mi.",not supported,151.0HP Electric Motor Hydrogen Fuel,A/T,Silver,Black,0.0,Yes,"$9,500"


In [23]:
data['accident'].value_counts()

0.0    2910
1.0     986
Name: accident, dtype: int64

In [24]:
# accident가 결측치인 경우는 제외하기로 함
data = data.loc[data['accident'].notnull()]
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3896 entries, 3292 to 2894
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   object 
 1   model         3896 non-null   object 
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3413 non-null   object 
 11  price         3896 non-null   object 
dtypes: float64(1), int64(1), object(10)
memory usage: 395.7+ KB


-------

**clean_title 결측치 처리**

결측치를 'No'로 변경

In [25]:
data['clean_title'].value_counts()

Yes    3413
Name: clean_title, dtype: int64

In [26]:
data['clean_title'].unique()

array(['Yes', nan], dtype=object)

In [27]:
# 결측치를 No로 변경
data.clean_title.fillna('No', inplace = True)
data['clean_title'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


Yes    3413
No      483
Name: clean_title, dtype: int64

------

### 가변수화

In [28]:
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,"$56,500"
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,0.0,Yes,"$51,900"
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,"$63,500"
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,0.0,Yes,"$32,000"
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,"$73,600"


**price 가변수화**

In [29]:
# $ 지우기
for index, row in data.iterrows():
    data.loc[index, 'price'] = data.loc[index, 'price'].strip("$")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(loc, value)


In [30]:
# , 지우기
for index, row in data.iterrows():
    data.loc[index, 'price'] = data.loc[index, 'price'].replace(',', '')

In [31]:
# str -> int 로 단위 바꾸기
data['price'] = data['price'].astype('int64')
data.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['price'] = data['price'].astype('int64')


Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3292,Mercedes-Benz,Sprinter 2500 Standard Roof,2021,"13,344 mi.",Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,56500
3550,Ford,F-350 King Ranch,2015,"92,421 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,6-Speed A/T,White,Brown,0.0,Yes,51900
3708,Ford,F-350 Lariat,2018,"66,281 mi.",Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,63500
3261,Ford,F-350 Lariat,2015,"162,000 mi.",Diesel,440.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Red,Beige,0.0,Yes,32000
2170,Ford,F-350 Lariat,2021,"28,220 mi.",Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,73600


In [32]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3896 entries, 3292 to 2894
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   object 
 1   model         3896 non-null   object 
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(2), object(9)
memory usage: 555.7+ KB


**brand 가변수화**

In [33]:
data['brand'].value_counts()

Ford             372
BMW              367
Mercedes-Benz    310
Chevrolet        285
Toyota           196
Audi             194
Porsche          186
Lexus            161
Jeep             138
Land             126
Nissan           114
Cadillac         104
RAM               90
Dodge             89
GMC               88
Tesla             87
Kia               75
Hyundai           68
Subaru            63
Acura             63
Mazda             62
Honda             60
Volkswagen        59
INFINITI          56
Lincoln           50
Jaguar            46
Volvo             37
Maserati          33
MINI              32
Bentley           31
Buick             30
Chrysler          27
Lamborghini       26
Mitsubishi        20
Genesis           19
Alfa              16
Hummer            16
Rivian            15
Pontiac           15
Ferrari           12
Rolls-Royce       10
Aston              8
Scion              6
McLaren            6
Saturn             5
FIAT               5
Mercury            3
Lotus        

In [34]:
data = data.sort_values('brand')

data.reset_index(drop = True, inplace = True)
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,Acura,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,30999
1,Acura,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,54998
2,Acura,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,46598
3,Acura,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,41899
4,Acura,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,5500


In [35]:
len(data)

3896

In [36]:
data['brand'].value_counts()

Ford             372
BMW              367
Mercedes-Benz    310
Chevrolet        285
Toyota           196
Audi             194
Porsche          186
Lexus            161
Jeep             138
Land             126
Nissan           114
Cadillac         104
RAM               90
Dodge             89
GMC               88
Tesla             87
Kia               75
Hyundai           68
Subaru            63
Acura             63
Mazda             62
Honda             60
Volkswagen        59
INFINITI          56
Lincoln           50
Jaguar            46
Volvo             37
Maserati          33
MINI              32
Bentley           31
Buick             30
Chrysler          27
Lamborghini       26
Mitsubishi        20
Genesis           19
Alfa              16
Hummer            16
Rivian            15
Pontiac           15
Ferrari           12
Rolls-Royce       10
Aston              8
McLaren            6
Scion              6
Saturn             5
FIAT               5
Mercury            3
Lotus        

In [37]:
k = 0
for i in range(0, len(data)) :
    print(i)
    if(i + 1 >= len(data)) :
        data.iloc[i, 0] = k
        break
    
    else :
        if(data.iloc[i, 0] == data.iloc[i + 1, 0]) :
            print('case2')
            print('k=', k)
            data.iloc[i, 0] = k
        
        else :
            print('case3')
            print('k=', k)
            data.iloc[i, 0] = k
            k = k + 1

data.head()

0
case2
k= 0
1
case2
k= 0
2
case2
k= 0
3
case2
k= 0
4
case2
k= 0
5
case2
k= 0
6
case2
k= 0
7
case2
k= 0
8
case2
k= 0
9
case2
k= 0
10
case2
k= 0
11
case2
k= 0
12
case2
k= 0
13
case2
k= 0
14
case2
k= 0
15
case2
k= 0
16
case2
k= 0
17
case2
k= 0
18
case2
k= 0
19
case2
k= 0
20
case2
k= 0
21
case2
k= 0
22
case2
k= 0
23
case2
k= 0
24
case2
k= 0
25
case2
k= 0
26
case2
k= 0
27
case2
k= 0
28
case2
k= 0
29
case2
k= 0
30
case2
k= 0
31
case2
k= 0
32
case2
k= 0
33
case2
k= 0
34
case2
k= 0
35
case2
k= 0
36
case2
k= 0
37
case2
k= 0
38
case2
k= 0
39
case2
k= 0
40
case2
k= 0
41
case2
k= 0
42
case2
k= 0
43
case2
k= 0
44
case2
k= 0
45
case2
k= 0
46
case2
k= 0
47
case2
k= 0
48
case2
k= 0
49
case2
k= 0
50
case2
k= 0
51
case2
k= 0
52
case2
k= 0
53
case2
k= 0
54
case2
k= 0
55
case2
k= 0
56
case2
k= 0
57
case2
k= 0
58
case2
k= 0
59
case2
k= 0
60
case2
k= 0
61
case2
k= 0
62
case3
k= 0
63
case2
k= 1
64
case2
k= 1
65
case2
k= 1
66
case2
k= 1
67
case2
k= 1
68
case2
k= 1
69
case2
k= 1
70
case2
k= 1
71
case2
k= 1
72

626
case2
k= 4
627
case2
k= 4
628
case2
k= 4
629
case2
k= 4
630
case2
k= 4
631
case2
k= 4
632
case2
k= 4
633
case2
k= 4
634
case2
k= 4
635
case2
k= 4
636
case2
k= 4
637
case2
k= 4
638
case2
k= 4
639
case2
k= 4
640
case2
k= 4
641
case2
k= 4
642
case2
k= 4
643
case2
k= 4
644
case2
k= 4
645
case2
k= 4
646
case2
k= 4
647
case3
k= 4
648
case2
k= 5
649
case2
k= 5
650
case2
k= 5
651
case2
k= 5
652
case2
k= 5
653
case2
k= 5
654
case2
k= 5
655
case2
k= 5
656
case2
k= 5
657
case2
k= 5
658
case2
k= 5
659
case2
k= 5
660
case2
k= 5
661
case2
k= 5
662
case2
k= 5
663
case2
k= 5
664
case2
k= 5
665
case2
k= 5
666
case2
k= 5
667
case2
k= 5
668
case2
k= 5
669
case2
k= 5
670
case2
k= 5
671
case2
k= 5
672
case2
k= 5
673
case2
k= 5
674
case2
k= 5
675
case2
k= 5
676
case2
k= 5
677
case2
k= 5
678
case3
k= 5
679
case3
k= 6
680
case2
k= 7
681
case2
k= 7
682
case2
k= 7
683
case2
k= 7
684
case2
k= 7
685
case2
k= 7
686
case2
k= 7
687
case2
k= 7
688
case2
k= 7
689
case2
k= 7
690
case2
k= 7
691
case2
k= 7
692
case2


1235
case2
k= 14
1236
case2
k= 14
1237
case2
k= 14
1238
case2
k= 14
1239
case2
k= 14
1240
case2
k= 14
1241
case2
k= 14
1242
case2
k= 14
1243
case2
k= 14
1244
case2
k= 14
1245
case2
k= 14
1246
case2
k= 14
1247
case2
k= 14
1248
case2
k= 14
1249
case2
k= 14
1250
case2
k= 14
1251
case2
k= 14
1252
case2
k= 14
1253
case2
k= 14
1254
case2
k= 14
1255
case2
k= 14
1256
case2
k= 14
1257
case2
k= 14
1258
case2
k= 14
1259
case2
k= 14
1260
case2
k= 14
1261
case2
k= 14
1262
case2
k= 14
1263
case2
k= 14
1264
case2
k= 14
1265
case2
k= 14
1266
case2
k= 14
1267
case2
k= 14
1268
case2
k= 14
1269
case2
k= 14
1270
case2
k= 14
1271
case2
k= 14
1272
case2
k= 14
1273
case2
k= 14
1274
case2
k= 14
1275
case2
k= 14
1276
case2
k= 14
1277
case2
k= 14
1278
case2
k= 14
1279
case2
k= 14
1280
case2
k= 14
1281
case2
k= 14
1282
case2
k= 14
1283
case2
k= 14
1284
case2
k= 14
1285
case2
k= 14
1286
case2
k= 14
1287
case2
k= 14
1288
case2
k= 14
1289
case2
k= 14
1290
case2
k= 14
1291
case2
k= 14
1292
case2
k= 14
1293
case2
k= 

case2
k= 17
1768
case2
k= 17
1769
case2
k= 17
1770
case3
k= 17
1771
case2
k= 18
1772
case2
k= 18
1773
case2
k= 18
1774
case2
k= 18
1775
case2
k= 18
1776
case2
k= 18
1777
case2
k= 18
1778
case2
k= 18
1779
case2
k= 18
1780
case2
k= 18
1781
case2
k= 18
1782
case2
k= 18
1783
case2
k= 18
1784
case2
k= 18
1785
case2
k= 18
1786
case3
k= 18
1787
case2
k= 19
1788
case2
k= 19
1789
case2
k= 19
1790
case2
k= 19
1791
case2
k= 19
1792
case2
k= 19
1793
case2
k= 19
1794
case2
k= 19
1795
case2
k= 19
1796
case2
k= 19
1797
case2
k= 19
1798
case2
k= 19
1799
case2
k= 19
1800
case2
k= 19
1801
case2
k= 19
1802
case2
k= 19
1803
case2
k= 19
1804
case2
k= 19
1805
case2
k= 19
1806
case2
k= 19
1807
case2
k= 19
1808
case2
k= 19
1809
case2
k= 19
1810
case2
k= 19
1811
case2
k= 19
1812
case2
k= 19
1813
case2
k= 19
1814
case2
k= 19
1815
case2
k= 19
1816
case2
k= 19
1817
case2
k= 19
1818
case2
k= 19
1819
case2
k= 19
1820
case2
k= 19
1821
case2
k= 19
1822
case2
k= 19
1823
case2
k= 19
1824
case2
k= 19
1825
case2
k= 19
18

k= 27
2326
case2
k= 27
2327
case2
k= 27
2328
case2
k= 27
2329
case2
k= 27
2330
case2
k= 27
2331
case2
k= 27
2332
case2
k= 27
2333
case2
k= 27
2334
case2
k= 27
2335
case2
k= 27
2336
case2
k= 27
2337
case2
k= 27
2338
case2
k= 27
2339
case2
k= 27
2340
case2
k= 27
2341
case2
k= 27
2342
case2
k= 27
2343
case2
k= 27
2344
case2
k= 27
2345
case2
k= 27
2346
case2
k= 27
2347
case2
k= 27
2348
case2
k= 27
2349
case2
k= 27
2350
case2
k= 27
2351
case2
k= 27
2352
case2
k= 27
2353
case2
k= 27
2354
case2
k= 27
2355
case2
k= 27
2356
case2
k= 27
2357
case2
k= 27
2358
case2
k= 27
2359
case2
k= 27
2360
case2
k= 27
2361
case2
k= 27
2362
case2
k= 27
2363
case2
k= 27
2364
case2
k= 27
2365
case2
k= 27
2366
case2
k= 27
2367
case2
k= 27
2368
case2
k= 27
2369
case2
k= 27
2370
case2
k= 27
2371
case2
k= 27
2372
case2
k= 27
2373
case2
k= 27
2374
case2
k= 27
2375
case2
k= 27
2376
case2
k= 27
2377
case2
k= 27
2378
case2
k= 27
2379
case2
k= 27
2380
case2
k= 27
2381
case2
k= 27
2382
case2
k= 27
2383
case2
k= 27
2384
cas

2933
case2
k= 36
2934
case2
k= 36
2935
case2
k= 36
2936
case2
k= 36
2937
case2
k= 36
2938
case2
k= 36
2939
case2
k= 36
2940
case2
k= 36
2941
case2
k= 36
2942
case2
k= 36
2943
case2
k= 36
2944
case2
k= 36
2945
case2
k= 36
2946
case2
k= 36
2947
case2
k= 36
2948
case2
k= 36
2949
case2
k= 36
2950
case2
k= 36
2951
case2
k= 36
2952
case2
k= 36
2953
case2
k= 36
2954
case2
k= 36
2955
case2
k= 36
2956
case2
k= 36
2957
case2
k= 36
2958
case2
k= 36
2959
case2
k= 36
2960
case2
k= 36
2961
case2
k= 36
2962
case2
k= 36
2963
case2
k= 36
2964
case2
k= 36
2965
case2
k= 36
2966
case2
k= 36
2967
case2
k= 36
2968
case2
k= 36
2969
case2
k= 36
2970
case2
k= 36
2971
case2
k= 36
2972
case2
k= 36
2973
case2
k= 36
2974
case2
k= 36
2975
case2
k= 36
2976
case2
k= 36
2977
case2
k= 36
2978
case2
k= 36
2979
case2
k= 36
2980
case2
k= 36
2981
case2
k= 36
2982
case2
k= 36
2983
case3
k= 36
2984
case2
k= 37
2985
case2
k= 37
2986
case3
k= 37
2987
case2
k= 38
2988
case2
k= 38
2989
case2
k= 38
2990
case2
k= 38
2991
case2
k= 

k= 50
3464
case2
k= 50
3465
case2
k= 50
3466
case2
k= 50
3467
case2
k= 50
3468
case2
k= 50
3469
case2
k= 50
3470
case2
k= 50
3471
case2
k= 50
3472
case2
k= 50
3473
case2
k= 50
3474
case2
k= 50
3475
case2
k= 50
3476
case2
k= 50
3477
case2
k= 50
3478
case2
k= 50
3479
case2
k= 50
3480
case2
k= 50
3481
case2
k= 50
3482
case2
k= 50
3483
case2
k= 50
3484
case2
k= 50
3485
case2
k= 50
3486
case2
k= 50
3487
case2
k= 50
3488
case2
k= 50
3489
case2
k= 50
3490
case2
k= 50
3491
case2
k= 50
3492
case2
k= 50
3493
case2
k= 50
3494
case2
k= 50
3495
case2
k= 50
3496
case2
k= 50
3497
case2
k= 50
3498
case2
k= 50
3499
case2
k= 50
3500
case2
k= 50
3501
case2
k= 50
3502
case2
k= 50
3503
case2
k= 50
3504
case2
k= 50
3505
case2
k= 50
3506
case2
k= 50
3507
case2
k= 50
3508
case2
k= 50
3509
case2
k= 50
3510
case2
k= 50
3511
case2
k= 50
3512
case2
k= 50
3513
case2
k= 50
3514
case3
k= 50
3515
case3
k= 51
3516
case2
k= 52
3517
case2
k= 52
3518
case2
k= 52
3519
case2
k= 52
3520
case2
k= 52
3521
case2
k= 52
3522
cas

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,30999
1,0,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,54998
2,0,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,46598
3,0,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,41899
4,0,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,5500


In [38]:
data.tail()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
3891,55,XC90 3.2,2011,"88,300 mi.",Gasoline,235.0HP 3.2L Straight 6 Cylinder Engine Gasoli...,A/T,Blue,Beige,1.0,Yes,11500
3892,55,XC90 3.2,2008,"196,000 mi.",Gasoline,235.0HP 3.2L Straight 6 Cylinder Engine Gasoli...,A/T,Black,Beige,0.0,Yes,6499
3893,55,XC90 T6 Inscription,2019,"60,000 mi.",Gasoline,316.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,8-Speed A/T,White,Black,0.0,Yes,37000
3894,55,XC90 T6 Momentum,2018,"110,380 mi.",Gasoline,316.0HP 2.0L 4 Cylinder Engine Gasoline Fuel,A/T,Gray,Beige,1.0,Yes,22000
3895,56,ForTwo Pure,2008,"61,595 mi.",Gasoline,70.0HP 1.0L 3 Cylinder Engine Gasoline Fuel,5-Speed A/T,Blue,Gray,0.0,Yes,5000


In [39]:
# str -> int 로 단위 바꾸기
data['brand'] = data['brand'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,0,TLX V6 Advance,2019,"25,779 mi.",Gasoline,3.5L V6 24V GDI SOHC,9-Speed Automatic,Platinum White Pearl,Ebony,1.0,No,30999
1,0,MDX w/Technology Package,2023,"3,415 mi.",Gasoline,3.5 Liter SOHC,F,White,Parchment.,0.0,No,54998
2,0,MDX w/Technology Package,2022,"30,177 mi.",Gasoline,3.5L 24V SOHC I-VTEC V6,2,Majestic Black Pearl,Espresso,0.0,No,46598
3,0,MDX Sport Hybrid 3.0L w/Technology Package,2019,"56,778 mi.",Hybrid,321.0HP 3.0L V6 Cylinder Engine Gas/Electric H...,7-Speed A/T,Black,Black,0.0,Yes,41899
4,0,TSX Technology,2011,"187,883 mi.",Gasoline,201.0HP 2.4L 4 Cylinder Engine Gasoline Fuel,A/T,Red,Beige,0.0,Yes,5500


In [40]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   int64  
 1   model         3896 non-null   object 
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 365.4+ KB


**model 가변수화**

In [41]:
data['model'].value_counts()

M3 Base                    29
F-150 XLT                  23
Corvette Base              22
1500 Laramie               18
Camaro 2SS                 17
                           ..
MKS Base                    1
E350 Super Duty Base        1
Sierra 1500 SL Crew Cab     1
Patriot Latitude            1
G80 2.5T                    1
Name: model, Length: 1862, dtype: int64

In [42]:
data = data.sort_values('model')

data.reset_index(drop = True, inplace = True)
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,12,124 Spider Abarth,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,1.0,Yes,22500
1,4,128 i,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,0.0,Yes,18000
2,4,135 i,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,0.0,Yes,11000
3,4,135 i,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,0.0,Yes,15500
4,4,135 i,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,0.0,Yes,15300


In [43]:
k = 0
for i in range(0, len(data)) :
    print(i)
    if(i + 1 >= len(data)) :
        data.iloc[i, 1] = k
        break
    
    else :
        if(data.iloc[i, 1] == data.iloc[i + 1, 1]) :
            print('case2')
            print('k=', k)
            data.iloc[i, 1] = k
        
        else :
            print('case3')
            print('k=', k)
            data.iloc[i, 1] = k
            k = k + 1

data.head()

0
case3
k= 0
1
case3
k= 1
2
case2
k= 2
3
case2
k= 2
4
case3
k= 2
5
case3
k= 3
6
case2
k= 4
7
case2
k= 4
8
case2
k= 4
9
case2
k= 4
10
case2
k= 4
11
case2
k= 4
12
case2
k= 4
13
case2
k= 4
14
case2
k= 4
15
case2
k= 4
16
case3
k= 4
17
case3
k= 5
18
case3
k= 6
19
case3
k= 7
20
case3
k= 8
21
case2
k= 9
22
case3
k= 9
23
case2
k= 10
24
case2
k= 10
25
case2
k= 10
26
case2
k= 10
27
case2
k= 10
28
case2
k= 10
29
case2
k= 10
30
case2
k= 10
31
case2
k= 10
32
case2
k= 10
33
case2
k= 10
34
case2
k= 10
35
case2
k= 10
36
case2
k= 10
37
case2
k= 10
38
case2
k= 10
39
case2
k= 10
40
case3
k= 10
41
case2
k= 11
42
case2
k= 11
43
case2
k= 11
44
case3
k= 11
45
case2
k= 12
46
case3
k= 12
47
case2
k= 13
48
case2
k= 13
49
case2
k= 13
50
case2
k= 13
51
case3
k= 13
52
case2
k= 14
53
case2
k= 14
54
case3
k= 14
55
case2
k= 15
56
case3
k= 15
57
case2
k= 16
58
case3
k= 16
59
case2
k= 17
60
case2
k= 17
61
case3
k= 17
62
case3
k= 18
63
case2
k= 19
64
case3
k= 19
65
case3
k= 20
66
case2
k= 21
67
case3
k= 21
68
case3
k= 2

k= 235
518
case2
k= 236
519
case3
k= 236
520
case3
k= 237
521
case2
k= 238
522
case3
k= 238
523
case2
k= 239
524
case2
k= 239
525
case3
k= 239
526
case2
k= 240
527
case2
k= 240
528
case3
k= 240
529
case3
k= 241
530
case2
k= 242
531
case3
k= 242
532
case2
k= 243
533
case2
k= 243
534
case2
k= 243
535
case3
k= 243
536
case3
k= 244
537
case3
k= 245
538
case2
k= 246
539
case3
k= 246
540
case3
k= 247
541
case3
k= 248
542
case2
k= 249
543
case2
k= 249
544
case3
k= 249
545
case3
k= 250
546
case3
k= 251
547
case3
k= 252
548
case3
k= 253
549
case2
k= 254
550
case3
k= 254
551
case2
k= 255
552
case2
k= 255
553
case3
k= 255
554
case2
k= 256
555
case3
k= 256
556
case3
k= 257
557
case2
k= 258
558
case3
k= 258
559
case2
k= 259
560
case2
k= 259
561
case2
k= 259
562
case2
k= 259
563
case3
k= 259
564
case2
k= 260
565
case2
k= 260
566
case3
k= 260
567
case3
k= 261
568
case2
k= 262
569
case3
k= 262
570
case3
k= 263
571
case3
k= 264
572
case3
k= 265
573
case3
k= 266
574
case3
k= 267
575
case2
k= 268
576
cas

k= 494
1042
case2
k= 494
1043
case2
k= 494
1044
case3
k= 494
1045
case2
k= 495
1046
case2
k= 495
1047
case3
k= 495
1048
case2
k= 496
1049
case3
k= 496
1050
case3
k= 497
1051
case2
k= 498
1052
case2
k= 498
1053
case2
k= 498
1054
case2
k= 498
1055
case2
k= 498
1056
case3
k= 498
1057
case2
k= 499
1058
case2
k= 499
1059
case2
k= 499
1060
case2
k= 499
1061
case2
k= 499
1062
case2
k= 499
1063
case2
k= 499
1064
case2
k= 499
1065
case2
k= 499
1066
case2
k= 499
1067
case3
k= 499
1068
case2
k= 500
1069
case2
k= 500
1070
case2
k= 500
1071
case2
k= 500
1072
case2
k= 500
1073
case2
k= 500
1074
case3
k= 500
1075
case2
k= 501
1076
case2
k= 501
1077
case2
k= 501
1078
case2
k= 501
1079
case2
k= 501
1080
case2
k= 501
1081
case3
k= 501
1082
case3
k= 502
1083
case3
k= 503
1084
case3
k= 504
1085
case3
k= 505
1086
case2
k= 506
1087
case2
k= 506
1088
case2
k= 506
1089
case2
k= 506
1090
case3
k= 506
1091
case3
k= 507
1092
case2
k= 508
1093
case3
k= 508
1094
case3
k= 509
1095
case2
k= 510
1096
case3
k= 510
109

1563
case2
k= 703
1564
case3
k= 703
1565
case3
k= 704
1566
case2
k= 705
1567
case3
k= 705
1568
case3
k= 706
1569
case3
k= 707
1570
case3
k= 708
1571
case3
k= 709
1572
case3
k= 710
1573
case2
k= 711
1574
case2
k= 711
1575
case3
k= 711
1576
case2
k= 712
1577
case3
k= 712
1578
case3
k= 713
1579
case3
k= 714
1580
case3
k= 715
1581
case3
k= 716
1582
case2
k= 717
1583
case2
k= 717
1584
case3
k= 717
1585
case3
k= 718
1586
case2
k= 719
1587
case3
k= 719
1588
case2
k= 720
1589
case3
k= 720
1590
case3
k= 721
1591
case3
k= 722
1592
case2
k= 723
1593
case2
k= 723
1594
case2
k= 723
1595
case2
k= 723
1596
case2
k= 723
1597
case3
k= 723
1598
case2
k= 724
1599
case3
k= 724
1600
case3
k= 725
1601
case2
k= 726
1602
case2
k= 726
1603
case2
k= 726
1604
case2
k= 726
1605
case2
k= 726
1606
case2
k= 726
1607
case2
k= 726
1608
case2
k= 726
1609
case3
k= 726
1610
case3
k= 727
1611
case2
k= 728
1612
case2
k= 728
1613
case2
k= 728
1614
case3
k= 728
1615
case3
k= 729
1616
case3
k= 730
1617
case3
k= 731
1618
case3

case2
k= 1001
2216
case2
k= 1001
2217
case3
k= 1001
2218
case3
k= 1002
2219
case3
k= 1003
2220
case3
k= 1004
2221
case2
k= 1005
2222
case3
k= 1005
2223
case2
k= 1006
2224
case2
k= 1006
2225
case3
k= 1006
2226
case2
k= 1007
2227
case2
k= 1007
2228
case2
k= 1007
2229
case2
k= 1007
2230
case3
k= 1007
2231
case3
k= 1008
2232
case2
k= 1009
2233
case2
k= 1009
2234
case2
k= 1009
2235
case2
k= 1009
2236
case2
k= 1009
2237
case2
k= 1009
2238
case2
k= 1009
2239
case2
k= 1009
2240
case2
k= 1009
2241
case2
k= 1009
2242
case2
k= 1009
2243
case3
k= 1009
2244
case2
k= 1010
2245
case3
k= 1010
2246
case3
k= 1011
2247
case3
k= 1012
2248
case2
k= 1013
2249
case3
k= 1013
2250
case3
k= 1014
2251
case3
k= 1015
2252
case3
k= 1016
2253
case3
k= 1017
2254
case2
k= 1018
2255
case3
k= 1018
2256
case3
k= 1019
2257
case2
k= 1020
2258
case3
k= 1020
2259
case3
k= 1021
2260
case2
k= 1022
2261
case2
k= 1022
2262
case3
k= 1022
2263
case3
k= 1023
2264
case3
k= 1024
2265
case3
k= 1025
2266
case3
k= 1026
2267
case3
k= 102

case3
k= 1272
2746
case3
k= 1273
2747
case3
k= 1274
2748
case3
k= 1275
2749
case3
k= 1276
2750
case3
k= 1277
2751
case3
k= 1278
2752
case3
k= 1279
2753
case3
k= 1280
2754
case3
k= 1281
2755
case3
k= 1282
2756
case2
k= 1283
2757
case3
k= 1283
2758
case2
k= 1284
2759
case3
k= 1284
2760
case2
k= 1285
2761
case3
k= 1285
2762
case2
k= 1286
2763
case3
k= 1286
2764
case2
k= 1287
2765
case2
k= 1287
2766
case2
k= 1287
2767
case3
k= 1287
2768
case3
k= 1288
2769
case3
k= 1289
2770
case3
k= 1290
2771
case3
k= 1291
2772
case3
k= 1292
2773
case3
k= 1293
2774
case3
k= 1294
2775
case2
k= 1295
2776
case3
k= 1295
2777
case2
k= 1296
2778
case2
k= 1296
2779
case2
k= 1296
2780
case2
k= 1296
2781
case2
k= 1296
2782
case2
k= 1296
2783
case2
k= 1296
2784
case2
k= 1296
2785
case3
k= 1296
2786
case3
k= 1297
2787
case3
k= 1298
2788
case2
k= 1299
2789
case2
k= 1299
2790
case2
k= 1299
2791
case3
k= 1299
2792
case2
k= 1300
2793
case3
k= 1300
2794
case3
k= 1301
2795
case2
k= 1302
2796
case3
k= 1302
2797
case2
k= 130

case2
k= 1567
3267
case3
k= 1567
3268
case3
k= 1568
3269
case3
k= 1569
3270
case3
k= 1570
3271
case3
k= 1571
3272
case3
k= 1572
3273
case3
k= 1573
3274
case2
k= 1574
3275
case3
k= 1574
3276
case2
k= 1575
3277
case2
k= 1575
3278
case2
k= 1575
3279
case3
k= 1575
3280
case2
k= 1576
3281
case3
k= 1576
3282
case3
k= 1577
3283
case3
k= 1578
3284
case3
k= 1579
3285
case3
k= 1580
3286
case2
k= 1581
3287
case2
k= 1581
3288
case2
k= 1581
3289
case2
k= 1581
3290
case2
k= 1581
3291
case3
k= 1581
3292
case3
k= 1582
3293
case2
k= 1583
3294
case2
k= 1583
3295
case2
k= 1583
3296
case2
k= 1583
3297
case3
k= 1583
3298
case2
k= 1584
3299
case3
k= 1584
3300
case3
k= 1585
3301
case2
k= 1586
3302
case2
k= 1586
3303
case3
k= 1586
3304
case3
k= 1587
3305
case3
k= 1588
3306
case2
k= 1589
3307
case3
k= 1589
3308
case2
k= 1590
3309
case3
k= 1590
3310
case3
k= 1591
3311
case2
k= 1592
3312
case2
k= 1592
3313
case2
k= 1592
3314
case2
k= 1592
3315
case3
k= 1592
3316
case2
k= 1593
3317
case2
k= 1593
3318
case2
k= 159

3772
case3
k= 1794
3773
case2
k= 1795
3774
case3
k= 1795
3775
case2
k= 1796
3776
case2
k= 1796
3777
case2
k= 1796
3778
case3
k= 1796
3779
case3
k= 1797
3780
case3
k= 1798
3781
case3
k= 1799
3782
case3
k= 1800
3783
case3
k= 1801
3784
case3
k= 1802
3785
case3
k= 1803
3786
case3
k= 1804
3787
case3
k= 1805
3788
case2
k= 1806
3789
case3
k= 1806
3790
case3
k= 1807
3791
case3
k= 1808
3792
case3
k= 1809
3793
case3
k= 1810
3794
case3
k= 1811
3795
case2
k= 1812
3796
case3
k= 1812
3797
case2
k= 1813
3798
case3
k= 1813
3799
case3
k= 1814
3800
case2
k= 1815
3801
case2
k= 1815
3802
case2
k= 1815
3803
case3
k= 1815
3804
case3
k= 1816
3805
case3
k= 1817
3806
case3
k= 1818
3807
case3
k= 1819
3808
case2
k= 1820
3809
case2
k= 1820
3810
case3
k= 1820
3811
case3
k= 1821
3812
case3
k= 1822
3813
case3
k= 1823
3814
case2
k= 1824
3815
case2
k= 1824
3816
case2
k= 1824
3817
case2
k= 1824
3818
case2
k= 1824
3819
case2
k= 1824
3820
case3
k= 1824
3821
case3
k= 1825
3822
case3
k= 1826
3823
case2
k= 1827
3824
case2
k

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,12,0,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,1.0,Yes,22500
1,4,1,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,0.0,Yes,18000
2,4,2,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,0.0,Yes,11000
3,4,2,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,0.0,Yes,15500
4,4,2,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,0.0,Yes,15300


In [44]:
# str -> int 로 단위 바꾸기
data['model'] = data['model'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,12,0,2017,"45,000 mi.",Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,1.0,Yes,22500
1,4,1,2013,"67,874 mi.",Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,0.0,Yes,18000
2,4,2,2011,"132,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,0.0,Yes,11000
3,4,2,2009,"72,900 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,0.0,Yes,15500
4,4,2,2008,"87,000 mi.",Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,0.0,Yes,15300


In [45]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   int64  
 1   model         3896 non-null   int64  
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   object 
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(4), object(7)
memory usage: 365.4+ KB


**milage 가변수화**

In [46]:
# , 지우기
for index, row in data.iterrows():
    data.loc[index, 'milage'] = data.loc[index, 'milage'].replace(',', '')
    data.loc[index, 'milage'] = data.loc[index, 'milage'].replace('mi.', '')

In [47]:
# str -> int 로 단위 바꾸기
data['milage'] = data['milage'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,12,0,2017,45000,Gasoline,164.0HP 1.4L 4 Cylinder Engine Gasoline Fuel,6-Speed M/T,Gray,Black,1.0,Yes,22500
1,4,1,2013,67874,Gasoline,230.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Black,0.0,Yes,18000
2,4,2,2011,132000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,7-Speed A/T,Black,Beige,0.0,Yes,11000
3,4,2,2009,72900,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,6-Speed A/T,Gray,Gray,0.0,Yes,15500
4,4,2,2008,87000,Gasoline,300.0HP 3.0L Straight 6 Cylinder Engine Gasoli...,A/T,White,Beige,0.0,Yes,15300


In [48]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   int64  
 1   model         3896 non-null   int64  
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   int64  
 4   fuel_type     3896 non-null   object 
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(5), object(6)
memory usage: 365.4+ KB


**fuel_type 가변수화**

In [49]:
data = data.sort_values('fuel_type')

data.reset_index(drop = True, inplace = True)
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,36,1583,2021,13344,Diesel,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,56500
1,14,674,2009,130200,Diesel,350.0HP 6.4L 8 Cylinder Engine Diesel Fuel,5-Speed A/T,White,Beige,0.0,Yes,28900
2,14,673,2005,176691,Diesel,325.0HP 6.0L 8 Cylinder Engine Diesel Fuel,A/T,White,Beige,1.0,Yes,13000
3,14,672,2021,28220,Diesel,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,73600
4,14,672,2018,66281,Diesel,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,63500


In [50]:
k = 0
for i in range(0, len(data)) :
    print(i)
    if(i + 1 >= len(data)) :
        data.iloc[i, 4] = k
        break
    
    else :
        if(data.iloc[i, 4] == data.iloc[i + 1, 4]) :
            print('case2')
            print('k=', k)
            data.iloc[i, 4] = k
        
        else :
            print('case3')
            print('k=', k)
            data.iloc[i, 4] = k
            k = k + 1

data.head()

0
case2
k= 0
1
case2
k= 0
2
case2
k= 0
3
case2
k= 0
4
case2
k= 0
5
case2
k= 0
6
case2
k= 0
7
case2
k= 0
8
case2
k= 0
9
case2
k= 0
10
case2
k= 0
11
case2
k= 0
12
case2
k= 0
13
case2
k= 0
14
case2
k= 0
15
case2
k= 0
16
case2
k= 0
17
case2
k= 0
18
case2
k= 0
19
case2
k= 0
20
case2
k= 0
21
case2
k= 0
22
case2
k= 0
23
case2
k= 0
24
case2
k= 0
25
case2
k= 0
26
case2
k= 0
27
case2
k= 0
28
case2
k= 0
29
case2
k= 0
30
case2
k= 0
31
case2
k= 0
32
case2
k= 0
33
case2
k= 0
34
case2
k= 0
35
case2
k= 0
36
case2
k= 0
37
case2
k= 0
38
case2
k= 0
39
case2
k= 0
40
case2
k= 0
41
case2
k= 0
42
case2
k= 0
43
case2
k= 0
44
case2
k= 0
45
case2
k= 0
46
case2
k= 0
47
case2
k= 0
48
case2
k= 0
49
case2
k= 0
50
case2
k= 0
51
case2
k= 0
52
case2
k= 0
53
case2
k= 0
54
case2
k= 0
55
case2
k= 0
56
case2
k= 0
57
case2
k= 0
58
case2
k= 0
59
case2
k= 0
60
case2
k= 0
61
case2
k= 0
62
case2
k= 0
63
case2
k= 0
64
case2
k= 0
65
case2
k= 0
66
case2
k= 0
67
case2
k= 0
68
case2
k= 0
69
case2
k= 0
70
case2
k= 0
71
case2
k= 0
72

652
case2
k= 2
653
case2
k= 2
654
case2
k= 2
655
case2
k= 2
656
case2
k= 2
657
case2
k= 2
658
case2
k= 2
659
case2
k= 2
660
case2
k= 2
661
case2
k= 2
662
case2
k= 2
663
case2
k= 2
664
case2
k= 2
665
case2
k= 2
666
case2
k= 2
667
case2
k= 2
668
case2
k= 2
669
case2
k= 2
670
case2
k= 2
671
case2
k= 2
672
case2
k= 2
673
case2
k= 2
674
case2
k= 2
675
case2
k= 2
676
case2
k= 2
677
case2
k= 2
678
case2
k= 2
679
case2
k= 2
680
case2
k= 2
681
case2
k= 2
682
case2
k= 2
683
case2
k= 2
684
case2
k= 2
685
case2
k= 2
686
case2
k= 2
687
case2
k= 2
688
case2
k= 2
689
case2
k= 2
690
case2
k= 2
691
case2
k= 2
692
case2
k= 2
693
case2
k= 2
694
case2
k= 2
695
case2
k= 2
696
case2
k= 2
697
case2
k= 2
698
case2
k= 2
699
case2
k= 2
700
case2
k= 2
701
case2
k= 2
702
case2
k= 2
703
case2
k= 2
704
case2
k= 2
705
case2
k= 2
706
case2
k= 2
707
case2
k= 2
708
case2
k= 2
709
case2
k= 2
710
case2
k= 2
711
case2
k= 2
712
case2
k= 2
713
case2
k= 2
714
case2
k= 2
715
case2
k= 2
716
case2
k= 2
717
case2
k= 2
718
case2


1327
case2
k= 2
1328
case2
k= 2
1329
case2
k= 2
1330
case2
k= 2
1331
case2
k= 2
1332
case2
k= 2
1333
case2
k= 2
1334
case2
k= 2
1335
case2
k= 2
1336
case2
k= 2
1337
case2
k= 2
1338
case2
k= 2
1339
case2
k= 2
1340
case2
k= 2
1341
case2
k= 2
1342
case2
k= 2
1343
case2
k= 2
1344
case2
k= 2
1345
case2
k= 2
1346
case2
k= 2
1347
case2
k= 2
1348
case2
k= 2
1349
case2
k= 2
1350
case2
k= 2
1351
case2
k= 2
1352
case2
k= 2
1353
case2
k= 2
1354
case2
k= 2
1355
case2
k= 2
1356
case2
k= 2
1357
case2
k= 2
1358
case2
k= 2
1359
case2
k= 2
1360
case2
k= 2
1361
case2
k= 2
1362
case2
k= 2
1363
case2
k= 2
1364
case2
k= 2
1365
case2
k= 2
1366
case2
k= 2
1367
case2
k= 2
1368
case2
k= 2
1369
case2
k= 2
1370
case2
k= 2
1371
case2
k= 2
1372
case2
k= 2
1373
case2
k= 2
1374
case2
k= 2
1375
case2
k= 2
1376
case2
k= 2
1377
case2
k= 2
1378
case2
k= 2
1379
case2
k= 2
1380
case2
k= 2
1381
case2
k= 2
1382
case2
k= 2
1383
case2
k= 2
1384
case2
k= 2
1385
case2
k= 2
1386
case2
k= 2
1387
case2
k= 2
1388
case2
k= 2
1389
cas

1918
case2
k= 2
1919
case2
k= 2
1920
case2
k= 2
1921
case2
k= 2
1922
case2
k= 2
1923
case2
k= 2
1924
case2
k= 2
1925
case2
k= 2
1926
case2
k= 2
1927
case2
k= 2
1928
case2
k= 2
1929
case2
k= 2
1930
case2
k= 2
1931
case2
k= 2
1932
case2
k= 2
1933
case2
k= 2
1934
case2
k= 2
1935
case2
k= 2
1936
case2
k= 2
1937
case2
k= 2
1938
case2
k= 2
1939
case2
k= 2
1940
case2
k= 2
1941
case2
k= 2
1942
case2
k= 2
1943
case2
k= 2
1944
case2
k= 2
1945
case2
k= 2
1946
case2
k= 2
1947
case2
k= 2
1948
case2
k= 2
1949
case2
k= 2
1950
case2
k= 2
1951
case2
k= 2
1952
case2
k= 2
1953
case2
k= 2
1954
case2
k= 2
1955
case2
k= 2
1956
case2
k= 2
1957
case2
k= 2
1958
case2
k= 2
1959
case2
k= 2
1960
case2
k= 2
1961
case2
k= 2
1962
case2
k= 2
1963
case2
k= 2
1964
case2
k= 2
1965
case2
k= 2
1966
case2
k= 2
1967
case2
k= 2
1968
case2
k= 2
1969
case2
k= 2
1970
case2
k= 2
1971
case2
k= 2
1972
case2
k= 2
1973
case2
k= 2
1974
case2
k= 2
1975
case2
k= 2
1976
case2
k= 2
1977
case2
k= 2
1978
case2
k= 2
1979
case2
k= 2
1980
cas

2482
case2
k= 2
2483
case2
k= 2
2484
case2
k= 2
2485
case2
k= 2
2486
case2
k= 2
2487
case2
k= 2
2488
case2
k= 2
2489
case2
k= 2
2490
case2
k= 2
2491
case2
k= 2
2492
case2
k= 2
2493
case2
k= 2
2494
case2
k= 2
2495
case2
k= 2
2496
case2
k= 2
2497
case2
k= 2
2498
case2
k= 2
2499
case2
k= 2
2500
case2
k= 2
2501
case2
k= 2
2502
case2
k= 2
2503
case2
k= 2
2504
case2
k= 2
2505
case2
k= 2
2506
case2
k= 2
2507
case2
k= 2
2508
case2
k= 2
2509
case2
k= 2
2510
case2
k= 2
2511
case2
k= 2
2512
case2
k= 2
2513
case2
k= 2
2514
case2
k= 2
2515
case2
k= 2
2516
case2
k= 2
2517
case2
k= 2
2518
case2
k= 2
2519
case2
k= 2
2520
case2
k= 2
2521
case2
k= 2
2522
case2
k= 2
2523
case2
k= 2
2524
case2
k= 2
2525
case2
k= 2
2526
case2
k= 2
2527
case2
k= 2
2528
case2
k= 2
2529
case2
k= 2
2530
case2
k= 2
2531
case2
k= 2
2532
case2
k= 2
2533
case2
k= 2
2534
case2
k= 2
2535
case2
k= 2
2536
case2
k= 2
2537
case2
k= 2
2538
case2
k= 2
2539
case2
k= 2
2540
case2
k= 2
2541
case2
k= 2
2542
case2
k= 2
2543
case2
k= 2
2544
cas

3129
case2
k= 2
3130
case2
k= 2
3131
case2
k= 2
3132
case2
k= 2
3133
case2
k= 2
3134
case2
k= 2
3135
case2
k= 2
3136
case2
k= 2
3137
case2
k= 2
3138
case2
k= 2
3139
case2
k= 2
3140
case2
k= 2
3141
case2
k= 2
3142
case2
k= 2
3143
case2
k= 2
3144
case2
k= 2
3145
case2
k= 2
3146
case2
k= 2
3147
case2
k= 2
3148
case2
k= 2
3149
case2
k= 2
3150
case2
k= 2
3151
case2
k= 2
3152
case2
k= 2
3153
case2
k= 2
3154
case2
k= 2
3155
case2
k= 2
3156
case2
k= 2
3157
case2
k= 2
3158
case2
k= 2
3159
case2
k= 2
3160
case2
k= 2
3161
case2
k= 2
3162
case2
k= 2
3163
case2
k= 2
3164
case2
k= 2
3165
case2
k= 2
3166
case2
k= 2
3167
case2
k= 2
3168
case2
k= 2
3169
case2
k= 2
3170
case2
k= 2
3171
case2
k= 2
3172
case2
k= 2
3173
case2
k= 2
3174
case2
k= 2
3175
case2
k= 2
3176
case2
k= 2
3177
case2
k= 2
3178
case2
k= 2
3179
case2
k= 2
3180
case2
k= 2
3181
case2
k= 2
3182
case2
k= 2
3183
case2
k= 2
3184
case2
k= 2
3185
case2
k= 2
3186
case2
k= 2
3187
case2
k= 2
3188
case2
k= 2
3189
case2
k= 2
3190
case2
k= 2
3191
cas

case2
k= 3
3774
case2
k= 3
3775
case2
k= 3
3776
case2
k= 3
3777
case2
k= 3
3778
case2
k= 3
3779
case2
k= 3
3780
case2
k= 3
3781
case2
k= 3
3782
case2
k= 3
3783
case2
k= 3
3784
case2
k= 3
3785
case2
k= 3
3786
case2
k= 3
3787
case2
k= 3
3788
case2
k= 3
3789
case2
k= 3
3790
case2
k= 3
3791
case2
k= 3
3792
case2
k= 3
3793
case2
k= 3
3794
case2
k= 3
3795
case2
k= 3
3796
case2
k= 3
3797
case2
k= 3
3798
case2
k= 3
3799
case2
k= 3
3800
case2
k= 3
3801
case2
k= 3
3802
case2
k= 3
3803
case2
k= 3
3804
case2
k= 3
3805
case2
k= 3
3806
case2
k= 3
3807
case2
k= 3
3808
case2
k= 3
3809
case2
k= 3
3810
case2
k= 3
3811
case2
k= 3
3812
case2
k= 3
3813
case2
k= 3
3814
case2
k= 3
3815
case2
k= 3
3816
case2
k= 3
3817
case2
k= 3
3818
case2
k= 3
3819
case2
k= 3
3820
case2
k= 3
3821
case2
k= 3
3822
case2
k= 3
3823
case2
k= 3
3824
case2
k= 3
3825
case2
k= 3
3826
case2
k= 3
3827
case2
k= 3
3828
case2
k= 3
3829
case2
k= 3
3830
case2
k= 3
3831
case2
k= 3
3832
case2
k= 3
3833
case2
k= 3
3834
case2
k= 3
3835
case2
k=

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,36,1583,2021,13344,0,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,56500
1,14,674,2009,130200,0,350.0HP 6.4L 8 Cylinder Engine Diesel Fuel,5-Speed A/T,White,Beige,0.0,Yes,28900
2,14,673,2005,176691,0,325.0HP 6.0L 8 Cylinder Engine Diesel Fuel,A/T,White,Beige,1.0,Yes,13000
3,14,672,2021,28220,0,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,73600
4,14,672,2018,66281,0,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,63500


In [51]:
data['fuel_type'].value_counts()

2    3418
3     194
1     136
0     110
4      36
5       2
Name: fuel_type, dtype: int64

In [52]:
# str -> int 로 단위 바꾸기
data['fuel_type'] = data['fuel_type'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,36,1583,2021,13344,0,188.0HP 3.0L V6 Cylinder Engine Diesel Fuel,7-Speed A/T,Red,Black,0.0,Yes,56500
1,14,674,2009,130200,0,350.0HP 6.4L 8 Cylinder Engine Diesel Fuel,5-Speed A/T,White,Beige,0.0,Yes,28900
2,14,673,2005,176691,0,325.0HP 6.0L 8 Cylinder Engine Diesel Fuel,A/T,White,Beige,1.0,Yes,13000
3,14,672,2021,28220,0,475.0HP 6.7L 8 Cylinder Engine Diesel Fuel,10-Speed A/T,Silver,Black,0.0,Yes,73600
4,14,672,2018,66281,0,450.0HP 6.7L 8 Cylinder Engine Diesel Fuel,Transmission w/Dual Shift Mode,Gray,Black,0.0,Yes,63500


In [53]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   int64  
 1   model         3896 non-null   int64  
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   int64  
 4   fuel_type     3896 non-null   int64  
 5   engine        3896 non-null   object 
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(6), object(5)
memory usage: 365.4+ KB


**engine 가변수화**

In [54]:
data = data.sort_values('engine')

data.reset_index(drop = True, inplace = True)
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,9,1677,2021,50648,2,1.2L I3 12V GDI DOHC Turbo,Automatic CVT,Pacific Blue Metallic,Jet Black,0.0,No,23995
1,9,1676,2021,24965,2,1.2L I3 12V GDI DOHC Turbo,Automatic CVT,Mosaic Black Metallic,Jet Black,1.0,Yes,22995
2,7,580,2022,18185,2,1.3L I3 12V GDI DOHC Turbo,Automatic CVT,Summit White,Whisper Beige,0.0,No,31755
3,9,1678,2022,42479,2,1.3L I3 12V GDI DOHC Turbo,9-Speed Automatic,Mosaic Black Metallic,Jet Black,0.0,Yes,28495
4,9,1676,2021,20497,2,1.3L I3 12V GDI DOHC Turbo,9-Speed Automatic,Pacific Blue Metallic,Black,0.0,No,22999


In [55]:
k = 0
for i in range(0, len(data)) :
    print(i)
    if(i + 1 >= len(data)) :
        data.iloc[i, 5] = k
        break
    
    else :
        if(data.iloc[i, 5] == data.iloc[i + 1, 5]) :
            print('case2')
            print('k=', k)
            data.iloc[i, 5] = k
        
        else :
            print('case3')
            print('k=', k)
            data.iloc[i, 5] = k
            k = k + 1

data.head()

0
case2
k= 0
1
case3
k= 0
2
case2
k= 1
3
case2
k= 1
4
case3
k= 1
5
case3
k= 2
6
case2
k= 3
7
case3
k= 3
8
case3
k= 4
9
case2
k= 5
10
case2
k= 5
11
case3
k= 5
12
case2
k= 6
13
case2
k= 6
14
case2
k= 6
15
case2
k= 6
16
case3
k= 6
17
case2
k= 7
18
case2
k= 7
19
case2
k= 7
20
case2
k= 7
21
case2
k= 7
22
case3
k= 7
23
case3
k= 8
24
case3
k= 9
25
case2
k= 10
26
case3
k= 10
27
case2
k= 11
28
case2
k= 11
29
case2
k= 11
30
case3
k= 11
31
case3
k= 12
32
case2
k= 13
33
case3
k= 13
34
case3
k= 14
35
case2
k= 15
36
case3
k= 15
37
case2
k= 16
38
case3
k= 16
39
case3
k= 17
40
case2
k= 18
41
case3
k= 18
42
case2
k= 19
43
case3
k= 19
44
case3
k= 20
45
case3
k= 21
46
case3
k= 22
47
case3
k= 23
48
case3
k= 24
49
case3
k= 25
50
case3
k= 26
51
case3
k= 27
52
case2
k= 28
53
case2
k= 28
54
case2
k= 28
55
case3
k= 28
56
case3
k= 29
57
case3
k= 30
58
case2
k= 31
59
case3
k= 31
60
case3
k= 32
61
case2
k= 33
62
case3
k= 33
63
case3
k= 34
64
case2
k= 35
65
case2
k= 35
66
case3
k= 35
67
case3
k= 36
68
case2
k= 37


653
case3
k= 236
654
case2
k= 237
655
case3
k= 237
656
case2
k= 238
657
case2
k= 238
658
case2
k= 238
659
case2
k= 238
660
case2
k= 238
661
case2
k= 238
662
case3
k= 238
663
case3
k= 239
664
case2
k= 240
665
case2
k= 240
666
case2
k= 240
667
case3
k= 240
668
case3
k= 241
669
case2
k= 242
670
case2
k= 242
671
case3
k= 242
672
case3
k= 243
673
case2
k= 244
674
case2
k= 244
675
case2
k= 244
676
case2
k= 244
677
case2
k= 244
678
case2
k= 244
679
case2
k= 244
680
case3
k= 244
681
case2
k= 245
682
case3
k= 245
683
case3
k= 246
684
case2
k= 247
685
case2
k= 247
686
case3
k= 247
687
case2
k= 248
688
case2
k= 248
689
case3
k= 248
690
case2
k= 249
691
case2
k= 249
692
case2
k= 249
693
case2
k= 249
694
case2
k= 249
695
case2
k= 249
696
case2
k= 249
697
case3
k= 249
698
case3
k= 250
699
case2
k= 251
700
case2
k= 251
701
case2
k= 251
702
case2
k= 251
703
case2
k= 251
704
case3
k= 251
705
case3
k= 252
706
case3
k= 253
707
case2
k= 254
708
case3
k= 254
709
case2
k= 255
710
case3
k= 255
711
case2
k= 2

1326
case2
k= 447
1327
case2
k= 447
1328
case2
k= 447
1329
case2
k= 447
1330
case3
k= 447
1331
case3
k= 448
1332
case3
k= 449
1333
case2
k= 450
1334
case2
k= 450
1335
case3
k= 450
1336
case2
k= 451
1337
case2
k= 451
1338
case2
k= 451
1339
case2
k= 451
1340
case2
k= 451
1341
case2
k= 451
1342
case2
k= 451
1343
case2
k= 451
1344
case2
k= 451
1345
case3
k= 451
1346
case2
k= 452
1347
case2
k= 452
1348
case2
k= 452
1349
case2
k= 452
1350
case2
k= 452
1351
case2
k= 452
1352
case2
k= 452
1353
case3
k= 452
1354
case2
k= 453
1355
case3
k= 453
1356
case3
k= 454
1357
case3
k= 455
1358
case2
k= 456
1359
case3
k= 456
1360
case3
k= 457
1361
case3
k= 458
1362
case2
k= 459
1363
case2
k= 459
1364
case2
k= 459
1365
case3
k= 459
1366
case2
k= 460
1367
case2
k= 460
1368
case2
k= 460
1369
case2
k= 460
1370
case2
k= 460
1371
case3
k= 460
1372
case2
k= 461
1373
case3
k= 461
1374
case2
k= 462
1375
case2
k= 462
1376
case3
k= 462
1377
case2
k= 463
1378
case2
k= 463
1379
case2
k= 463
1380
case2
k= 463
1381
case2

1936
case2
k= 592
1937
case2
k= 592
1938
case3
k= 592
1939
case3
k= 593
1940
case3
k= 594
1941
case2
k= 595
1942
case3
k= 595
1943
case3
k= 596
1944
case3
k= 597
1945
case2
k= 598
1946
case2
k= 598
1947
case2
k= 598
1948
case2
k= 598
1949
case2
k= 598
1950
case2
k= 598
1951
case2
k= 598
1952
case2
k= 598
1953
case3
k= 598
1954
case2
k= 599
1955
case2
k= 599
1956
case2
k= 599
1957
case3
k= 599
1958
case3
k= 600
1959
case2
k= 601
1960
case2
k= 601
1961
case3
k= 601
1962
case3
k= 602
1963
case2
k= 603
1964
case2
k= 603
1965
case2
k= 603
1966
case2
k= 603
1967
case2
k= 603
1968
case2
k= 603
1969
case3
k= 603
1970
case3
k= 604
1971
case3
k= 605
1972
case3
k= 606
1973
case3
k= 607
1974
case3
k= 608
1975
case2
k= 609
1976
case2
k= 609
1977
case2
k= 609
1978
case3
k= 609
1979
case2
k= 610
1980
case2
k= 610
1981
case2
k= 610
1982
case2
k= 610
1983
case2
k= 610
1984
case2
k= 610
1985
case2
k= 610
1986
case2
k= 610
1987
case2
k= 610
1988
case2
k= 610
1989
case2
k= 610
1990
case3
k= 610
1991
case2

2465
case3
k= 726
2466
case2
k= 727
2467
case2
k= 727
2468
case3
k= 727
2469
case2
k= 728
2470
case3
k= 728
2471
case3
k= 729
2472
case2
k= 730
2473
case3
k= 730
2474
case2
k= 731
2475
case2
k= 731
2476
case2
k= 731
2477
case2
k= 731
2478
case3
k= 731
2479
case2
k= 732
2480
case2
k= 732
2481
case2
k= 732
2482
case3
k= 732
2483
case2
k= 733
2484
case2
k= 733
2485
case2
k= 733
2486
case3
k= 733
2487
case3
k= 734
2488
case2
k= 735
2489
case3
k= 735
2490
case2
k= 736
2491
case2
k= 736
2492
case2
k= 736
2493
case2
k= 736
2494
case2
k= 736
2495
case2
k= 736
2496
case2
k= 736
2497
case2
k= 736
2498
case2
k= 736
2499
case2
k= 736
2500
case2
k= 736
2501
case2
k= 736
2502
case2
k= 736
2503
case3
k= 736
2504
case2
k= 737
2505
case3
k= 737
2506
case3
k= 738
2507
case3
k= 739
2508
case3
k= 740
2509
case2
k= 741
2510
case3
k= 741
2511
case3
k= 742
2512
case2
k= 743
2513
case2
k= 743
2514
case2
k= 743
2515
case2
k= 743
2516
case2
k= 743
2517
case2
k= 743
2518
case2
k= 743
2519
case3
k= 743
2520
case2

3089
case2
k= 876
3090
case2
k= 876
3091
case2
k= 876
3092
case3
k= 876
3093
case2
k= 877
3094
case2
k= 877
3095
case2
k= 877
3096
case2
k= 877
3097
case2
k= 877
3098
case2
k= 877
3099
case2
k= 877
3100
case2
k= 877
3101
case2
k= 877
3102
case2
k= 877
3103
case2
k= 877
3104
case2
k= 877
3105
case2
k= 877
3106
case2
k= 877
3107
case2
k= 877
3108
case3
k= 877
3109
case2
k= 878
3110
case2
k= 878
3111
case3
k= 878
3112
case2
k= 879
3113
case2
k= 879
3114
case2
k= 879
3115
case2
k= 879
3116
case3
k= 879
3117
case2
k= 880
3118
case2
k= 880
3119
case2
k= 880
3120
case3
k= 880
3121
case2
k= 881
3122
case2
k= 881
3123
case2
k= 881
3124
case2
k= 881
3125
case3
k= 881
3126
case2
k= 882
3127
case2
k= 882
3128
case3
k= 882
3129
case3
k= 883
3130
case3
k= 884
3131
case3
k= 885
3132
case2
k= 886
3133
case2
k= 886
3134
case2
k= 886
3135
case2
k= 886
3136
case2
k= 886
3137
case3
k= 886
3138
case2
k= 887
3139
case3
k= 887
3140
case2
k= 888
3141
case2
k= 888
3142
case3
k= 888
3143
case2
k= 889
3144
case2

3714
case2
k= 1068
3715
case2
k= 1068
3716
case2
k= 1068
3717
case2
k= 1068
3718
case2
k= 1068
3719
case2
k= 1068
3720
case3
k= 1068
3721
case2
k= 1069
3722
case2
k= 1069
3723
case2
k= 1069
3724
case3
k= 1069
3725
case3
k= 1070
3726
case3
k= 1071
3727
case3
k= 1072
3728
case2
k= 1073
3729
case2
k= 1073
3730
case3
k= 1073
3731
case2
k= 1074
3732
case3
k= 1074
3733
case2
k= 1075
3734
case3
k= 1075
3735
case3
k= 1076
3736
case3
k= 1077
3737
case2
k= 1078
3738
case3
k= 1078
3739
case3
k= 1079
3740
case3
k= 1080
3741
case2
k= 1081
3742
case2
k= 1081
3743
case2
k= 1081
3744
case3
k= 1081
3745
case3
k= 1082
3746
case2
k= 1083
3747
case3
k= 1083
3748
case3
k= 1084
3749
case3
k= 1085
3750
case3
k= 1086
3751
case2
k= 1087
3752
case3
k= 1087
3753
case3
k= 1088
3754
case3
k= 1089
3755
case2
k= 1090
3756
case2
k= 1090
3757
case2
k= 1090
3758
case2
k= 1090
3759
case2
k= 1090
3760
case3
k= 1090
3761
case3
k= 1091
3762
case3
k= 1092
3763
case3
k= 1093
3764
case2
k= 1094
3765
case2
k= 1094
3766
case2
k

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,9,1677,2021,50648,2,0,Automatic CVT,Pacific Blue Metallic,Jet Black,0.0,No,23995
1,9,1676,2021,24965,2,0,Automatic CVT,Mosaic Black Metallic,Jet Black,1.0,Yes,22995
2,7,580,2022,18185,2,1,Automatic CVT,Summit White,Whisper Beige,0.0,No,31755
3,9,1678,2022,42479,2,1,9-Speed Automatic,Mosaic Black Metallic,Jet Black,0.0,Yes,28495
4,9,1676,2021,20497,2,1,9-Speed Automatic,Pacific Blue Metallic,Black,0.0,No,22999


In [56]:
# str -> int 로 단위 바꾸기
data['engine'] = data['engine'].astype('int64')
data.head()

Unnamed: 0,brand,model,model_year,milage,fuel_type,engine,transmission,ext_col,int_col,accident,clean_title,price
0,9,1677,2021,50648,2,0,Automatic CVT,Pacific Blue Metallic,Jet Black,0.0,No,23995
1,9,1676,2021,24965,2,0,Automatic CVT,Mosaic Black Metallic,Jet Black,1.0,Yes,22995
2,7,580,2022,18185,2,1,Automatic CVT,Summit White,Whisper Beige,0.0,No,31755
3,9,1678,2022,42479,2,1,9-Speed Automatic,Mosaic Black Metallic,Jet Black,0.0,Yes,28495
4,9,1676,2021,20497,2,1,9-Speed Automatic,Pacific Blue Metallic,Black,0.0,No,22999


In [57]:
data['engine'].value_counts()

198     52
724     48
871     46
1129    44
325     42
        ..
391      1
766      1
389      1
387      1
1128     1
Name: engine, Length: 1130, dtype: int64

In [58]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3896 entries, 0 to 3895
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   brand         3896 non-null   int64  
 1   model         3896 non-null   int64  
 2   model_year    3896 non-null   int64  
 3   milage        3896 non-null   int64  
 4   fuel_type     3896 non-null   int64  
 5   engine        3896 non-null   int64  
 6   transmission  3896 non-null   object 
 7   ext_col       3896 non-null   object 
 8   int_col       3896 non-null   object 
 9   accident      3896 non-null   float64
 10  clean_title   3896 non-null   object 
 11  price         3896 non-null   int64  
dtypes: float64(1), int64(7), object(4)
memory usage: 365.4+ KB
