# 基于瓜子二手车数据的二手电动车价格分析以及折价率分析模型

In [122]:
import numpy as np
import pandas as pd
import warnings
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.special import jn
from IPython.display import display, clear_output
import time

warnings.filterwarnings('ignore')
%matplotlib inline

## 模型预测的
from sklearn import linear_model
from sklearn import preprocessing
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor

## 数据降维处理的
from sklearn.decomposition import PCA,FastICA,FactorAnalysis,SparsePCA

import lightgbm as lgb
import xgboost as xgb

## 参数搜索和评价的
from sklearn.model_selection import GridSearchCV,cross_val_score,StratifiedKFold,train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

## 数据读取

In [123]:
# Load the used-car listings CSV; the path is relative to the working directory.
allData = pd.read_csv('../crawl_for_guazi/data.csv')
# Print the (rows, columns) shape, then show the first rows as the cell output.
print(allData.shape)
allData.head()

(684, 14)


Unnamed: 0,car_name,car_brand,car_tag,price,new_price,complexOutlook,firstCert,odograph,allPower,carBelong,range,isDome,wheelBase,drivingMode
0,比亚迪 汉 2020款 EV 四驱高性能版旗舰型,比亚迪,汉,229000,303388,9成新,2020-08,1.9万公里,363kW,成都(川),550km,1,2920,双电机四驱
1,大众 ID.4 X 2021款 Pro 极智长续航版,大众,ID.4 X,189800,256049,9成新,2021-12,300公里,150kW,德州(鲁),555km,0,2765,后置后驱
2,宝马i3 2018款 豪华型(进口),宝马,宝马i3,138000,368842,8成新,2018-12,6.2万公里,125kW,上海(沪),271km,0,2570,后置后驱
3,特斯拉MODEL S 2014款 MODEL S 85,特斯拉,特斯拉MODEL S,285000,796735,7成新,2015-02,4.7万公里,270kW,烟台(鲁),502km,0,2960,后置后驱
4,江淮iEVA50 2018款 iEVA50 豪华型,江淮,江淮iEVA50,76000,199400,9成新,2019-01,100公里,85kW,合肥(皖),310km,1,2710,前置前驱


In [124]:
# Show column dtypes and non-null counts for the freshly loaded frame.
allData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 684 entries, 0 to 683
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   car_name        684 non-null    object
 1   car_brand       684 non-null    object
 2   car_tag         684 non-null    object
 3   price           684 non-null    int64 
 4   new_price       684 non-null    int64 
 5   complexOutlook  684 non-null    object
 6   firstCert       684 non-null    object
 7   odograph        684 non-null    object
 8   allPower        684 non-null    object
 9   carBelong       684 non-null    object
 10  range           684 non-null    object
 11  isDome          684 non-null    int64 
 12  wheelBase       684 non-null    int64 
 13  drivingMode     684 non-null    object
dtypes: int64(4), object(10)
memory usage: 74.9+ KB


In [125]:
# Summary statistics for the columns that are already numeric before cleaning.
allData.describe()

Unnamed: 0,price,new_price,isDome,wheelBase
count,684.0,684.0,684.0,684.0
mean,118067.084795,186476.032164,0.790936,2530.099415
std,91347.301067,125929.087693,0.406938,355.928549
min,14300.0,0.0,0.0,1600.0
25%,48950.0,80107.0,1.0,2390.0
50%,82650.0,162711.5,1.0,2610.0
75%,189050.0,272368.75,1.0,2830.0
max,648000.0,905173.0,1.0,3110.0


## 数据清洗

1. 该部分目的为将带单位的属性变为纯数字型属性，如将“7成新”变为“7”，以及时间属性，里程去单位
2. 具体的：
   1. 将成色属性去除尾部中文，并转换为float类型
   2. 将上牌年月属性转为时间类型，并增加一列“使用时间”（daysGone），表示从首次上牌至数据获取日的天数
   3. 表显里程以纯数字形式显示，注意数值中带有“万”时需乘10000
   4. 车辆总功率去单位
   5. 续航里程去单位

### 1. 处理外观成色属性字段，将其转换为float类型

In [126]:
def dealOutlookStr(str):
    """Convert a condition grade such as '9成新' or '95成新' to a float.

    The trailing '成'/'新' characters are stripped. A multi-character grade
    without a decimal point (e.g. '95') is treated as shorthand and divided
    by 10 (-> 9.5); anything else ('9', '9.9') is converted as written.

    Parameters
    ----------
    str : str, int or float
        Raw cell value. The parameter name shadows the builtin ``str``; it is
        kept unchanged so existing callers are unaffected.

    Returns
    -------
    float
        The numeric grade. Numeric inputs are passed through as floats, which
        lets the cleaning cell be re-run on an already converted column.

    Raises
    ------
    ValueError
        If the remaining text is not a valid number.
    """
    # Already numeric (e.g. NaN, or a previously cleaned value): nothing to parse.
    if isinstance(str, (int, float)):
        return float(str)
    text = str.rstrip('成新')
    value = float(text)
    # '95' means 9.5; single-digit or dotted grades are already on the final scale.
    if len(text) > 1 and '.' not in text:
        value /= 10
    return value

# Normalise every condition grade to a float. The builtin float is used because
# the np.float alias was deprecated in NumPy 1.20 and removed in NumPy 1.24.
allData['complexOutlook'] = allData['complexOutlook'].map(lambda x: float(dealOutlookStr(x)))
allData['complexOutlook'].describe()

count    684.000000
mean       8.646491
std        0.792813
min        6.000000
25%        8.000000
50%        9.000000
75%        9.000000
max        9.900000
Name: complexOutlook, dtype: float64

### 2. 将首次上牌修改为pandas.datetime形式，并新增一列表示从上牌至数据获取时(2022-01-26)的天数



In [127]:
# Parse the 'YYYY-MM' first-registration strings into pandas timestamps.
allData['firstCert'] = pd.to_datetime(allData['firstCert'], format='%Y-%m')
# Whole days between first registration and the data snapshot date (2022-01-26).
allData['daysGone'] = (pd.Timestamp('2022-01-26') - allData['firstCert']).dt.days
allData['daysGone']

0       543
1        56
2      1152
3      2551
4      1121
       ... 
679    1578
680     117
681     178
682     239
683     970
Name: daysGone, Length: 684, dtype: int64

### 3. 处理表显里程数

In [128]:
def dealOdograph(str):
    """Turn an odometer reading such as '300公里' or '1.9万公里' into kilometres.

    A value whose type is exactly ``float`` is returned untouched. Otherwise
    the trailing '公'/'里' characters are stripped; if the remainder contains
    '万' (ten thousand), the trailing '万' is stripped too and the number is
    multiplied by 10000.

    Parameters
    ----------
    str : str or float
        Raw cell value (the name shadows the builtin ``str``).

    Returns
    -------
    float
        The reading in kilometres.
    """
    if type(str) is float:
        return str
    reading = str.rstrip('公里')
    # Plain kilometre figure: no scaling needed.
    if '万' not in reading:
        return float(reading)
    # Figure expressed in units of ten thousand kilometres.
    scaled = float(reading.rstrip('万')) * 10000
    return float(scaled)

# Replace the raw mileage strings with kilometre floats, then summarise the column.
allData['odograph'] = allData['odograph'].map(dealOdograph)
allData['odograph'].describe()

count       684.000000
mean      21844.444444
std       19784.169577
min         100.000000
25%        6000.000000
50%       17000.000000
75%       32250.000000
max      107000.000000
Name: odograph, dtype: float64

### 4. 处理车辆总功率

In [129]:
# Strip the trailing 'kW' unit and convert to float. Non-string values (already
# converted numbers, NaN) pass through unchanged so the cell can be re-run safely.
allData['allPower'] = allData['allPower'].map(
    lambda x: float(x.rstrip('kW')) if isinstance(x, str) else x
)

In [130]:
# Process the vehicle driving range.
# NOTE: this describe() is not the last expression in the cell, so its result is not displayed.
allData['range'].describe()
# Disabled exploratory check: does every raw value contain 'km'?
# temp = allData['range'].map(lambda x: 'km' in x)
# temp.unique()
def dealRange(str):
    """Convert a driving-range value such as '550km' to an integer.

    Parameters
    ----------
    str : str, int, float or None
        Raw cell value (the name shadows the builtin ``str``; kept unchanged
        for backward compatibility).

    Returns
    -------
    int, float or None
        ``None`` for ``None``, the placeholder '-' or a blank string; numeric
        inputs are returned unchanged; otherwise the integer left after
        stripping the trailing 'k'/'m' characters.

    Raises
    ------
    ValueError
        If the remaining text is not a valid integer.
    """
    if str is None:
        return None
    # Already numeric: return as-is. This also covers re-running the cell after
    # the column has been converted (floats/NaN once missing values exist).
    if isinstance(str, (int, float)):
        return str
    text = str.strip()
    # Compare by value, not identity: `is '-'` only works through string interning.
    if text == '-' or text == '':
        return None
    return int(text.rstrip('km'))
# Convert the range column; None results from dealRange end up as missing values.
allData['range'] = allData['range'].map(dealRange)

### 5. 清洗后的数据描述

In [131]:
# Summary statistics again, now covering the columns converted to numbers during cleaning.
allData.describe()

Unnamed: 0,price,new_price,complexOutlook,odograph,allPower,range,isDome,wheelBase,daysGone
count,684.0,684.0,684.0,684.0,684.0,680.0,684.0,684.0,684.0
mean,118067.084795,186476.032164,8.646491,21844.444444,115.126316,359.591176,0.790936,2530.099415,619.273392
std,91347.301067,125929.087693,0.792813,19784.169577,90.036824,145.407965,0.406938,355.928549,459.349804
min,14300.0,0.0,6.0,100.0,15.0,100.0,0.0,1600.0,25.0
25%,48950.0,80107.0,8.0,6000.0,35.0,270.0,1.0,2390.0,270.0
50%,82650.0,162711.5,9.0,17000.0,100.0,353.0,1.0,2610.0,482.0
75%,189050.0,272368.75,9.0,32250.0,160.0,468.0,1.0,2830.0,970.0
max,648000.0,905173.0,9.9,107000.0,577.0,706.0,1.0,3110.0,2551.0
