In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.preprocessing import LabelEncoder

In [2]:
# Load the training set, the test set and the sample submission in one pass.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train22.csv', 'test22.csv', 'sample_submission.csv')
)

In [3]:
%%time
# Lookup table that collapses manufacturer-specific interior colour names into
# a small set of base colours (Gray, Black, Beige, Brown, White, Blue, Silver,
# Orange, Red, Yellow, Green, Charcoal).
# It is passed to Series.replace, which with a plain dict replaces values that
# equal a key as a whole; values not listed here are left unchanged.
int_replacements = {
    'Medium Earth Gray': 'Gray',
    'Diesel Gray / Black': 'Gray',
    'Dark Ash': 'Gray',
    'Graphite': 'Gray',
    'Dark Galvanized': 'Charcoal',
    'Light Gray': 'Gray',
    'Ash': 'Gray',
    'Jet Black': 'Black',
    'Global Black': 'Black',
    'Black Onyx': 'Black',
    'Parchment.': 'Beige',  # NOTE(review): key ends with a period; presumably a raw value is spelled this way - confirm
    'Sardar Brown': 'Brown',
    'Black/Gun Metal': 'Black',
    'Charcoal Black': 'Charcoal',
    'Ebony': 'Brown',
    'Ebony Black': 'Black',
    'Carbon Black': 'Black',
    'Obsidian Black': 'Black',
    'Black / Saddle Brown': 'Black',
    'Black/Saddle Brown': 'Black',
    'Black / Brown': 'Black',
    'Saddle Brown': 'Brown',
    'Sand Beige': 'Beige',
    'Camel': 'Beige',
    'Parchment': 'Beige',
    'Macchiato Beige/Black': 'Beige',
    'Silk Beige/Espresso Brown': 'Beige',
    'Canberra Beige': 'Beige',
    'Macchiato': 'Beige',
    'Almond Beige': 'Beige',
    'Grace White': 'White',
    'Ivory / Ebony': 'White',
    'Bianco Polar': 'White',
    'White / Brown': 'White',
    'Platinum': 'White',
    'Cloud': 'Blue',
    'Rift Metallic': 'White',
    'Light Platinum / Jet Black': 'Silver',
    'Billet Clearcoat Metallic': 'Silver',
    'Sakhir Orange': 'Orange',
    'Pimento Red w/Ebony': 'Red',
    'Adrenaline Red': 'Red',
    'Rioja Red': 'Red',
    'Classic Red': 'Red',
    'Magma Red': 'Red',
    'Cobalt Blue': 'Blue',
    'Tempest': 'Blue',
    'Stormy Sea': 'Blue',
    'Navy Pier': 'Blue',
    'Charles Blue': 'Blue',
    'Rhapsody Blue': 'Blue',
    'Kyalami Orange': 'Orange',
    'Sakhir Orange/Black': 'Orange',
    'Dark Gray': 'Gray',
    'Deep Garnet': 'Red',
    'Scarlet Ember': 'Red',
    'Beluga': 'Blue',
    'Chestnut': 'Brown',
    'Boulder': 'Gray',
    'Macchiato/Magmagrey': 'Beige',
    'Medium Stone': 'Gray',
    'BLACK': 'Black',
    'Portland': 'Gray',
    'Sandstone': 'Beige',
    'Slate': 'Gray',
    'Cappuccino': 'Brown',
    'Oyster W/Contrast': 'Beige',
    'Nero Ade': 'Black',
    'Light Titanium': 'Silver',
    'Tan': 'Beige',
    'Brandy': 'Brown',
    'Amber': 'Yellow',
    'Hotspur': 'Blue',
    'Chateau': 'Green',
    'Ice': 'Blue',
    'Blk': 'Black',
    'Mesa': 'Brown',
    'Espresso': 'Brown',
    'Ceramic': 'White',
    'Medium Dark Slate': 'Gray',
    'Graphite w/Gun Metal': 'Gray',
    'Cocoa / Dune': 'Brown',
    'Roast': 'Brown',
    'Hotspur Hide': 'Brown',
    'ORANGE': 'Orange',
    'Walnut': 'Brown',
    'Caramel': 'Beige',
    'Giallo Taurus / Nero Ade': 'Yellow',
    'Medium Pewter': 'Gray',
    'Camel Leather': 'Brown',
    'Anthracite': 'Gray',
    'Mocha': 'Brown',
    'Sahara Tan': 'Beige',
    'Porpoise': 'Beige',
    'Deep Cypress': 'Green',
    'Light Slate': 'Gray',
    'Beluga Hide': 'Black',
    'Tupelo': 'Green',
    'Gideon': 'Beige',
    'Medium Light Camel': 'Beige',
    'Nero': 'Black',
    'Deep Chestnut': 'Red',
    'Dark Auburn': 'Brown',
    'Shale': 'Gray',
    'BEIGE': 'Beige',
    'Linen': 'Beige',
    'WHITE': 'White',
    'Tension': 'Blue',
    'Sport': 'Red',
    'Very Light Cashmere': 'Beige'
}

# Create main_int_color on both frames: start from the raw interior colour and
# swap every value listed in int_replacements for its base colour.
for frame in (train, test):
    frame['main_int_color'] = frame['int_col'].replace(int_replacements)

CPU times: total: 750 ms
Wall time: 752 ms


In [4]:
# Base colour names searched for in an interior colour value. Order is the
# priority: the first entry found as a substring is the one returned.
keywords = ['Black', 'Beige', 'Brown', 'Gray', 'White', 'Red', 'Blue', 'Yellow', 'Ebony', 'Green', 'Orange', 'Gold', 'Silver', 'Charcoal']

def assign_main_color(int_col):
    """Reduce an interior colour description to a single base colour.

    Parameters
    ----------
    int_col : str
        Interior colour text for one row.

    Returns
    -------
    str
        The first entry of ``keywords`` contained in the value (after the
        British spelling 'Grey' is normalised to 'Gray'), or 'NN' when no
        keyword matches or the value is not a string (e.g. NaN).
    """
    # Missing cells arrive as float NaN; put them in the same 'NN' bucket as
    # unmatched text instead of failing on .replace.
    if not isinstance(int_col, str):
        return 'NN'
    # Search the normalised text: matching the raw value would discard the
    # 'Grey' -> 'Gray' substitution and send those rows to 'NN'.
    normalized = int_col.replace('Grey', 'Gray')
    for keyword in keywords:
        if keyword in normalized:
            return keyword
    return 'NN'

# Classify every interior colour value into its base colour on both frames.
for frame in (train, test):
    frame['main_int_color'] = frame['main_int_color'].apply(assign_main_color)

In [5]:
%%time
# Lookup table that collapses manufacturer-specific exterior paint names into
# base colours. It is passed to Series.replace, which with a plain dict
# replaces values that equal a key as a whole; unlisted values are unchanged.
# Each key must appear once: a repeated key in a dict literal is silently
# overridden by its last occurrence.
ext_replacements = {
    'Blu': 'Blue',
    'BLUE': 'Blue',
    'Glacier': 'Blue',
    'BLU ELEOS': 'Blue',
    'Dark Sapphire': 'Navy',
    'Tangerine': 'Orange',
    'Pumpkin': 'Orange',
    'Clementine': 'Orange',
    'Granite': 'Gray',
    'Go Mango!': 'Yellow',
    'Onyx': 'Black',
    'Gecko Pearlcoat': 'Green',
    'Obsidian': 'Black',
    'Metallic': 'Silver',
    'Grigio Nimbus': 'Silver',
    'Chalk': 'White',
    'Bianco Monocerus': 'White',
    'Verde': 'Green',
    'Dark Graphite Metallic': 'Gray',
    'BLACK': 'Black',
    'Dark Moss': 'Green',
    'Granite Crystal Clearcoat Metallic': 'Gray',
    'Ebony Twilight Metallic': 'Black',
    'Satin Steel Metallic': 'Silver',
    'Magnetic Metallic': 'Gray',
    'Dark Matter Metallic': 'Gray',
    'Dark Ash Metallic': 'Gray',
    'Iridium Metallic': 'Gray',
    'Nightfall Mica': 'Navy',
    'Sandstone Metallic': 'Beige',
    'Rift Metallic': 'White',
    'Billet Clearcoat Metallic': 'Silver',
    'Tan': 'Beige',
    'Ice': 'Blue',
    'Hellayella': 'Yellow',
    'Yulong': 'White',
    'Blueprint': 'Navy',
    'Arancio Borealis': 'Orange',
    'Hellayella Clearcoat': 'Yellow',
    'Moonlight Cloud': 'Navy',
    'Liquid Platinum': 'Silver',
    'Gun Metallic': 'Gray',
    'Manhattan Noir Metallic': 'Gray',
    'Lavender': 'Purple',
    'Violet': 'Purple',
    'Pink': 'Purple',
    'Mauve': 'Plum',
    'Tempest': 'Blue',
    'Nero Daytona': 'Black',
    'Scarlet Ember': 'Red',
    'Infrared Tintcoat': 'Red',
    'Maximum Steel Metallic': 'Gray',
    'Ember Pearlcoat': 'Brown',
    'Rich Garnet Metallic': 'Brown',
    'Tungsten Metallic': 'Gray',
    'Nero Noctis': 'Black',
    'Platinum Quartz Metallic': 'White',
    'Ruby Flare Pearl': 'Red',
    'Bianco Icarus Metallic': 'White',
    'Stormy Sea': 'Blue',
    'Mountain Air Metallic': 'Blue',
    'Wind Chill Pearl': 'White',
    'Iridescent Pearl Tricoat': 'White',
    'Black Cherry': 'Plum',
    'Black Forest Green': 'Green',
    'Maroon': 'Red',
    'Rosso': 'Red',
    'Rosso Corsa': 'Red',
    'Rosso Mars Metallic': 'Red',
    'Quicksilver Metallic': 'Silver',
    'Designo Magno Matte': 'Gray',
    'Granite Crystal Metallic Clearcoat': 'Gray',
    'Bianco Isis': 'White',
    'Sunset Drift Chromaflair': 'Orange',
    'Ametrin Metallic': 'Plum',
    'GT SILVER': 'Silver',
    'Caviar': 'Black'
}

# Create main_ext_color on both frames: start from the raw exterior colour and
# swap every value listed in ext_replacements for its base colour.
for frame in (train, test):
    frame['main_ext_color'] = frame['ext_col'].replace(ext_replacements)

CPU times: total: 500 ms
Wall time: 509 ms


In [6]:
# Base colour names searched for in an exterior colour value, in priority
# order. This re-binds the names `keywords` and `assign_main_color` that the
# interior-colour step also defines, so the cells must be run in order.
keywords = [
    'Black', 'White', 'Gray', 'Silver', 'Blue', 'Red', 'Green', 'Yellow',
    'Brown', 'Beige', 'Bronze', 'Orange', 'Plum', 'Purple', 'Gold', 'Navy',
]

def assign_main_color(ext_col):
    """Reduce an exterior colour description to a single base colour.

    'Grey' is first rewritten as 'Gray'; the result is the first entry of
    ``keywords`` that occurs as a substring, or 'NN' when none does.
    """
    normalized = ext_col.replace('Grey', 'Gray')
    return next((name for name in keywords if name in normalized), 'NN')

# Classify every exterior colour value into its base colour on both frames.
for frame in (train, test):
    frame['main_ext_color'] = frame['main_ext_color'].apply(assign_main_color)

In [7]:
# The raw colour columns are now superseded by main_int_color / main_ext_color.
train, test = [
    frame.drop(columns=['int_col', 'ext_col'])
    for frame in (train, test)
]


In [8]:
# Print the number of distinct values in every training column.
col = train.columns
distinct_counts = train.nunique()

for column_name in col:
    print(f"{column_name}: {distinct_counts[column_name]}")

id: 182491
brand: 56
model: 1851
model_year: 34
milage: 6513
fuel_type: 7
accident: 2
clean_title: 2
price: 1569
horsepower: 348
displacement: 61
cylinder_count: 7
log_price: 1569
transmission_type: 3
transmission_speed: 10
special_feature: 5
main_int_color: 15
main_ext_color: 17


In [9]:
# Remove the row identifier from both frames.
train, test = [frame.drop(columns=['id']) for frame in (train, test)]

In [10]:
# Convert the binary text columns to 0/1.
# Series.map turns any value missing from the mapping into NaN.
accident_codes = {'None reported': 0, 'At least 1 accident or damage reported': 1}
clean_title_codes = {'No': 0, 'Yes': 1}

for frame in (train, test):
    frame['accident'] = frame['accident'].map(accident_codes)
    frame['clean_title'] = frame['clean_title'].map(clean_title_codes)

In [11]:
# Show column dtypes and non-null counts of the training frame after encoding.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 182491 entries, 0 to 182490
Data columns (total 17 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   brand               182491 non-null  object 
 1   model               182491 non-null  object 
 2   model_year          182491 non-null  int64  
 3   milage              182491 non-null  int64  
 4   fuel_type           182491 non-null  object 
 5   accident            182491 non-null  int64  
 6   clean_title         182491 non-null  int64  
 7   price               182491 non-null  float64
 8   horsepower          182491 non-null  float64
 9   displacement        182491 non-null  float64
 10  cylinder_count      182491 non-null  float64
 11  log_price           182491 non-null  float64
 12  transmission_type   182491 non-null  object 
 13  transmission_speed  182491 non-null  int64  
 14  special_feature     182491 non-null  object 
 15  main_int_color      182491 non-nul

In [12]:
# Write the processed frames to disk without the index column.
for frame, out_name in ((train, 'train33.csv'), (test, 'test33.csv')):
    frame.to_csv(out_name, index=False)