In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
# Load the data: read the car listings CSV into a DataFrame and preview the first rows
csv_path = '/home/piai/PABA_27/BigData/assignment/Car_pre.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Name,Location,Price,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Wagon R LXI CNG,Mumbai,2682.68,2010,72000,CNG,Manual,First,26.6 kmpl,998 CC,58.16 bhp,5.0,
1,Hyundai Creta 1.6 CRDi SX Option,Pune,19162.0,2015,41000,Diesel,Manual,First,19.67 kmpl,1582 CC,126.2 bhp,5.0,
2,Honda Jazz V,Chennai,6898.32,2011,46000,Petrol,Manual,First,18.2 kmpl,1199 CC,88.7 bhp,5.0,8.61 Lakh
3,Maruti Ertiga VDI,Chennai,9197.76,2012,87000,Diesel,Manual,First,20.77 kmpl,1248 CC,88.76 bhp,7.0,
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,27194.71,2013,40670,Diesel,Automatic,Second,15.2 kmpl,1968 CC,140.8 bhp,5.0,


In [4]:
# Count the missing values in each column
df.isna().sum()

Name                    0
Location                0
Price                1053
Year                    0
Kilometers_Driven       0
Fuel_Type               0
Transmission            0
Owner_Type              0
Mileage                 2
Engine                 46
Power                  46
Seats                  53
New_Price            6247
dtype: int64

In [5]:
# Helper that separates a value from its unit and drops the unit
def remove_unit(value):
    """Return the leading token of a multi-token string, else the value as-is.

    Parameters
    ----------
    value : object
        A cell value. Strings made of two or more whitespace-separated
        tokens (e.g. ``"998 CC"``) are reduced to their first token.

    Returns
    -------
    object
        The first token (still a ``str``) when ``value`` is a string with
        more than one token; otherwise ``value`` itself, unchanged. This
        covers non-string values and strings with zero or one token.
    """
    # Non-strings (numbers, NaN, None) pass straight through.
    if not isinstance(value, str):
        return value

    # One split is enough to learn whether a second token exists.
    tokens = value.split(maxsplit=1)
    return tokens[0] if len(tokens) > 1 else value

# Apply the unit-stripping helper to each column that carries a unit suffix.
# The results are still strings (first token only); no numeric conversion happens here.
# NOTE(review): the unit token is discarded, so if a column mixes units
# (e.g. New_Price in both 'Lakh' and another unit) the values would end up on
# different scales — confirm against the raw data.
for unit_col in ('Mileage', 'Engine', 'Power', 'New_Price'):
    df[unit_col] = df[unit_col].map(remove_unit)

# Drop unneeded columns (keep only the ones listed below)
columns_to_keep = [
    'Name', 'Location', 'Price', 'Year', 'Kilometers_Driven', 'Fuel_Type',
    'Transmission', 'Owner_Type', 'Mileage', 'Engine', 'Power', 'Seats', 'New_Price',
]
df = df[columns_to_keep]
df.head()

Unnamed: 0,Name,Location,Price,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price
0,Maruti Wagon R LXI CNG,Mumbai,2682.68,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,
1,Hyundai Creta 1.6 CRDi SX Option,Pune,19162.0,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,
2,Honda Jazz V,Chennai,6898.32,2011,46000,Petrol,Manual,First,18.2,1199,88.7,5.0,8.61
3,Maruti Ertiga VDI,Chennai,9197.76,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,27194.71,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,


In [6]:
# Create new variables
# Brand and Model: split each name on whitespace once and reuse the tokens.
# The first token is the brand; the remaining tokens, re-joined with single
# spaces, form the model. The vectorised .str accessor replaces two per-row
# lambdas that each re-split the name, and it yields NaN (instead of raising)
# for a missing name.
name_tokens = df['Name'].str.split()
df['Brand'] = name_tokens.str[0]
df['Model'] = name_tokens.str[1:].str.join(' ')

# Country: look up the brand token in a brand -> country table.
# Keys must match the first token of Name exactly (case-sensitive); a brand
# that is absent from the table gets NaN in Country.
# NOTE(review): 'Land' is presumably the first token of "Land Rover" — confirm.
brand_country_mapping = {
    'Maruti': 'India', 'Hyundai': 'South Korea', 'Honda': 'Japan', 'Toyota': 'Japan',
    'Mercedes-Benz': 'Germany', 'Volkswagen': 'Germany', 'Ford': 'USA', 'Mahindra': 'India',
    'BMW': 'Germany', 'Audi': 'Germany', 'Tata': 'India', 'Skoda': 'Czech Republic',
    'Renault': 'France', 'Chevrolet': 'USA', 'Nissan': 'Japan', 'Land': 'UK', 'Jaguar': 'UK',
    'Fiat': 'Italy', 'Mitsubishi': 'Japan', 'Mini': 'UK', 'Volvo': 'Sweden', 'Porsche': 'Germany',
    'Jeep': 'USA', 'Datsun': 'Japan', 'ISUZU': 'Japan', 'Force': 'India', 'Bentley': 'UK',
    'Smart': 'Germany', 'Ambassador': 'India', 'Lamborghini': 'Italy', 'Hindustan': 'India',
    'OpelCorsa': 'Germany'
}
df['Country'] = df['Brand'].map(brand_country_mapping)

# Age: years elapsed between the fixed reference year below and the Year column
# (the reference is a constant, not the current date).
current_year = 2024
df['Age'] = current_year - df['Year']

In [7]:
# Preview the first ten rows, including the newly added columns
df.head(n=10)

Unnamed: 0,Name,Location,Price,Year,Kilometers_Driven,Fuel_Type,Transmission,Owner_Type,Mileage,Engine,Power,Seats,New_Price,Brand,Model,Country,Age
0,Maruti Wagon R LXI CNG,Mumbai,2682.68,2010,72000,CNG,Manual,First,26.6,998,58.16,5.0,,Maruti,Wagon R LXI CNG,India,14
1,Hyundai Creta 1.6 CRDi SX Option,Pune,19162.0,2015,41000,Diesel,Manual,First,19.67,1582,126.2,5.0,,Hyundai,Creta 1.6 CRDi SX Option,South Korea,9
2,Honda Jazz V,Chennai,6898.32,2011,46000,Petrol,Manual,First,18.2,1199,88.7,5.0,8.61,Honda,Jazz V,Japan,13
3,Maruti Ertiga VDI,Chennai,9197.76,2012,87000,Diesel,Manual,First,20.77,1248,88.76,7.0,,Maruti,Ertiga VDI,India,12
4,Audi A4 New 2.0 TDI Multitronic,Coimbatore,27194.71,2013,40670,Diesel,Automatic,Second,15.2,1968,140.8,5.0,,Audi,A4 New 2.0 TDI Multitronic,Germany,11
5,Hyundai EON LPG Era Plus Option,Hyderabad,3602.46,2012,75000,LPG,Manual,First,21.1,814,55.2,5.0,,Hyundai,EON LPG Era Plus Option,South Korea,12
6,Nissan Micra Diesel XV,Jaipur,5365.36,2013,86999,Diesel,Manual,First,23.08,1461,63.1,5.0,,Nissan,Micra Diesel XV,Japan,11
7,Toyota Innova Crysta 2.8 GX AT 8S,Mumbai,26826.8,2016,36000,Diesel,Automatic,First,11.36,2755,171.5,8.0,21.0,Toyota,Innova Crysta 2.8 GX AT 8S,Japan,8
8,Volkswagen Vento Diesel Comfortline,Pune,7971.39,2013,64430,Diesel,Manual,First,20.54,1598,103.6,5.0,,Volkswagen,Vento Diesel Comfortline,Germany,11
9,Tata Indica Vista Quadrajet LS,Chennai,2989.27,2012,65932,Diesel,Manual,Second,22.3,1248,74.0,5.0,,Tata,Indica Vista Quadrajet LS,India,12


In [8]:
# Intermediate save: write the current frame to CSV without the index column
output_csv_path = '/home/piai/PABA_27/BigData/assignment/Car_pre_newcol.csv'
df.to_csv(output_csv_path, index=False)

In [9]:
# Print a summary of the frame: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7253 entries, 0 to 7252
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               7253 non-null   object 
 1   Location           7253 non-null   object 
 2   Price              6200 non-null   float64
 3   Year               7253 non-null   int64  
 4   Kilometers_Driven  7253 non-null   int64  
 5   Fuel_Type          7253 non-null   object 
 6   Transmission       7253 non-null   object 
 7   Owner_Type         7253 non-null   object 
 8   Mileage            7251 non-null   object 
 9   Engine             7207 non-null   object 
 10  Power              7207 non-null   object 
 11  Seats              7200 non-null   float64
 12  New_Price          1006 non-null   object 
 13  Brand              7253 non-null   object 
 14  Model              7253 non-null   object 
 15  Country            7253 non-null   object 
 16  Age                7253 