In [None]:
# -*- coding: utf-8 -*-
"""
对 used_car_train_20200313.csv 进行基础EDA分析。
"""
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# 设置pandas显示所有列
pd.set_option('display.max_columns', None)

# 读取数据，分隔符为空格（支持多个空格）
df = pd.read_csv('used_car_train_20200313.csv', sep='\s+', engine='python') # type: ignore

In [2]:
df['v_12'].isnull().astype(int)

0         0
1         0
2         0
3         0
4         0
         ..
149995    0
149996    0
149997    0
149998    0
149999    0
Name: v_12, Length: 150000, dtype: int64

In [3]:
# 缺失值处理
numerical_features = ['power', 'kilometer', 'v_0', 'v_1', 'v_2', 'v_3', 'v_4', 'v_5', 'v_6', 'v_7', 'v_8', 'v_9', 'v_10', 'v_11', 'v_12', 'v_13', 'v_14']
for feature in numerical_features:
    # 标记缺失值
    df[f'{feature}_missing'] = df[feature].isnull().astype(int)
    # 将列转换为数值类型，无法转换的设置为NaN
    df[feature] = pd.to_numeric(df[feature], errors='coerce')
    # 填充缺失值
    df[feature] = df[feature].fillna(df[feature].median())
df

Unnamed: 0,SaleID,name,regDate,model,brand,bodyType,fuelType,gearbox,power,kilometer,notRepairedDamage,regionCode,seller,offerType,creatDate,price,v_0,v_1,v_2,v_3,v_4,v_5,v_6,v_7,v_8,v_9,v_10,v_11,v_12,v_13,v_14,power_missing,kilometer_missing,v_0_missing,v_1_missing,v_2_missing,v_3_missing,v_4_missing,v_5_missing,v_6_missing,v_7_missing,v_8_missing,v_9_missing,v_10_missing,v_11_missing,v_12_missing,v_13_missing,v_14_missing
0,0,736,20040402,30.0,6.0,1.0,0.0,0.0,60.0,12.5,0.0,1046,0,0.0,20160404.0,1850.0,43.357796,3.966344,0.050257,2.159744,1.143786,0.235676,0.101988,0.129549,0.022816,0.097462,-2.881803,2.804097,-2.420821,0.795292,0.914762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,2262,20030301,40.0,1.0,2.0,0.0,0.0,0.0,15.0,-,4366,0,0.0,20160309.0,3600.0,45.305273,5.236112,0.137925,1.380657,-1.422165,0.264777,0.121004,0.135731,0.026597,0.020582,-4.900482,2.096338,-1.030483,-1.722674,0.245522,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,14874,20040403,115.0,15.0,1.0,0.0,0.0,163.0,12.5,0.0,2806,0,0.0,20160402.0,6222.0,45.978359,4.823792,1.319524,-0.998467,-0.996911,0.251410,0.114912,0.165147,0.062173,0.027075,-4.846749,1.803559,1.565330,-0.832687,-0.229963,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,71865,19960908,109.0,10.0,0.0,0.0,1.0,193.0,15.0,0.0,434,0,0.0,20160312.0,2400.0,45.687478,4.492574,-0.050616,0.883600,-2.228079,0.274293,0.110300,0.121964,0.033395,0.000000,-4.509599,1.285940,-0.501868,-2.438353,-0.478699,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,111080,20120103,110.0,5.0,1.0,0.0,0.0,68.0,5.0,0.0,6977,0,0.0,20160313.0,5200.0,44.383511,2.031433,0.572169,-1.571239,2.246088,0.228036,0.073205,0.091880,0.078819,0.121534,-1.896240,0.910783,0.931110,2.834518,1.923482,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
149995,149995,163978,20000607,121.0,10.0,4.0,0.0,1.0,163.0,15.0,0.0,4576,0,0.0,20160327.0,5900.0,45.316543,-3.139095,-1.269707,-0.736609,-1.505820,0.280264,0.000310,0.048441,0.071158,0.019174,1.988114,-2.983973,0.589167,-1.304370,-0.302592,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
149996,149996,184535,20091102,116.0,11.0,0.0,0.0,0.0,125.0,10.0,0.0,2826,0,0.0,20160312.0,9500.0,45.972058,-3.143764,-0.023523,-2.366699,0.698012,0.253217,0.000777,0.084079,0.099681,0.079371,1.839166,-2.774615,2.553994,0.924196,-0.272160,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
149997,149997,147587,20101003,60.0,11.0,1.0,1.0,0.0,90.0,6.0,0.0,3302,0,0.0,20160328.0,7500.0,44.733481,-3.105721,0.595454,-2.279091,1.423661,0.233353,0.000705,0.118872,0.100118,0.097914,2.439812,-1.630677,2.290197,1.891922,0.414931,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
149998,149998,45907,20060312,34.0,10.0,3.0,1.0,0.0,156.0,15.0,0.0,1877,0,0.0,20160401.0,4999.0,45.658634,-3.204785,-0.441680,-1.179812,0.620680,0.256369,0.000252,0.081479,0.083558,0.081498,2.075380,-2.633719,1.414937,0.431981,-1.659014,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
df['gearbox'].value_counts()

gearbox
0.0     104864
1.0      31105
0         3373
15.0      1517
-         1387
         ...  
390          1
350          1
236          1
222          1
191          1
Name: count, Length: 302, dtype: int64

In [5]:
df['fuelType'].value_counts()

fuelType
0.0      97137
1.0      47622
2.0       2217
15.0       868
0.5        277
         ...  
394.0        1
124.0        1
300.0        1
137.0        1
230.0        1
Name: count, Length: 174, dtype: int64