In [58]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.serif'] = ['SimHei']
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['axes.unicode_minus'] = False

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, confusion_matrix, classification_report, roc_curve


In [17]:
unusedFeatureList = []
featureMostfreqValueDict = {}

badFeatureMaxMissingSample = 500  # 若某个特征缺失的样本数量超过该值，认定为坏特征
badSampleMaxMissingFeature = 10  # 若某个样本缺失的特征超过该值，认定为坏样本

trainDatasetPath = 'dataset/train.csv'
evalDatasetPath = 'dataset/eval.csv'
testDatasetPath = 'dataset/test.csv'


In [31]:
trainDatasetDFOrigin = pd.read_csv(filepath_or_buffer=trainDatasetPath, header=0, index_col=None)

In [7]:
def countDFNull(aimDF):
    nullAmount = aimDF.isnull().sum().sum()
    # print("Null数量 : ", nullAmount)
    return nullAmount

In [18]:
trainDatasetDF = pd.read_csv(filepath_or_buffer=trainDatasetPath, header=0, index_col=None)
print(trainDatasetDF.shape)
print(countDFNull(trainDatasetDF))

(79786, 54)
361556


In [19]:
# 对所有特征（每一列）进行null值统计
trainFeatureNullSeries = trainDatasetDF.isnull().sum().sort_values(ascending=False)  # 降序排列
print("type : ", type(trainFeatureNullSeries))
# averageTrainFeatureNull = trainFeatureNullSeries.sum()/len(trainFeatureNullSeries)
# print("averageTrainFeatureNull : ", averageTrainFeatureNull)
trainFeatureNullDict = trainFeatureNullSeries.to_dict()
badTrainFeatureDict = {key:trainFeatureNullDict[key] for key in trainFeatureNullDict if trainFeatureNullSeries[key] > badFeatureMaxMissingSample}
print("缺失%d个以上样本的特征数量 : "%badFeatureMaxMissingSample, len(badTrainFeatureDict))
print(badTrainFeatureDict)

type :  <class 'pandas.core.series.Series'>
缺失500个以上样本的特征数量 :  13
{'lartpc': 79508, 'larrout': 79277, 'occutc': 78639, 'v2': 73891, 'v1': 14997, 'pr1': 10096, 'pr': 9672, 'circ': 4255, 'voie': 3712, 'vma': 1287, 'vosp': 1034, 'nbv': 1019, 'adr': 808}


In [21]:
unusedFeatureList.extend(badTrainFeatureDict.keys())
print(unusedFeatureList)
trainDatasetDF.drop(columns=badTrainFeatureDict.keys(), inplace=True)
print(trainDatasetDF.shape)

['lartpc', 'larrout', 'occutc', 'v2', 'v1', 'pr1', 'pr', 'circ', 'voie', 'vma', 'vosp', 'nbv', 'adr', 'lartpc', 'larrout', 'occutc', 'v2', 'v1', 'pr1', 'pr', 'circ', 'voie', 'vma', 'vosp', 'nbv', 'adr']
(79786, 41)


In [23]:
trainSampleNullSeries = trainDatasetDF.T.isnull().sum().sort_values(ascending=False) # 倒序排列
trainSampleNullDict = trainSampleNullSeries.to_dict()
print("type : ", type(trainSampleNullSeries))

badTrainSampleDict = {key:trainSampleNullDict[key] for key in trainSampleNullDict if trainSampleNullDict[key] > badSampleMaxMissingFeature}
print("缺失%d个以上特征的样本数量 : "%badSampleMaxMissingFeature, len(badTrainSampleDict))
trainDatasetDF.drop(index=badTrainSampleDict.keys(), inplace=True)
print(trainDatasetDF.shape)

type :  <class 'pandas.core.series.Series'>
缺失10个以上特征的样本数量 :  163
(79623, 41)


In [24]:
print(countDFNull(trainDatasetDF))

264


In [26]:
tooMuchValueFeatureThreshold = 300  # 如果特征的可能的指多于该数，认定为没有参考性
tooLessValueFeatureThreshold = 2  # 如果特征的可能的指少于该数，认定为没有参考性

In [27]:
featureValueCountDict = {}
# 输出各个特征值对应的特征数量
for loopIdx, colName in enumerate(trainDatasetDF):
    tempSeries = trainDatasetDF[colName]
    tempSeriesValueCountDict = tempSeries.value_counts().to_dict()
    # print("特征", colName, "共有%d个可能值"%len(tempSeriesValueCountDict))
    featureValueCountDict[colName] = len(tempSeriesValueCountDict)

print(featureValueCountDict)

tooMuchValueFeatureList = [feature for feature in featureValueCountDict if featureValueCountDict[feature] > tooMuchValueFeatureThreshold]
tooLessValueFeatureList = [feature for feature in featureValueCountDict if featureValueCountDict[feature] < tooLessValueFeatureThreshold]

print("tooMuchValueFeatureList : ", tooMuchValueFeatureList)
print("tooLessValueFeatureList : ", tooLessValueFeatureList)

{'Num_Acc': 48304, 'jour': 31, 'mois': 12, 'an': 1, 'hrmn': 1347, 'lum': 5, 'dep': 107, 'com': 10018, 'agg': 2, 'int': 9, 'atm': 10, 'col': 8, 'lat': 46055, 'long': 46428, 'catr': 8, 'prof': 4, 'plan': 4, 'surf': 9, 'infra': 10, 'situ': 7, 'id_vehicule': 65846, 'num_veh': 27, 'place': 10, 'catu': 3, 'grav': 4, 'sexe': 2, 'an_nais': 103, 'trajet': 8, 'secu1': 11, 'secu2': 11, 'secu3': 10, 'locp': 11, 'actp': 13, 'etatp': 4, 'senc': 5, 'catv': 31, 'obs': 19, 'obsm': 8, 'choc': 11, 'manv': 28, 'motor': 8}
tooMuchValueFeatureList :  ['Num_Acc', 'hrmn', 'com', 'lat', 'long', 'id_vehicule']
tooLessValueFeatureList :  ['an']


In [28]:
unusedFeatureList.extend(tooMuchValueFeatureList)
unusedFeatureList.extend(tooLessValueFeatureList)

print("unusedFeatureList : ", unusedFeatureList)

unusedFeatureList :  ['lartpc', 'larrout', 'occutc', 'v2', 'v1', 'pr1', 'pr', 'circ', 'voie', 'vma', 'vosp', 'nbv', 'adr', 'lartpc', 'larrout', 'occutc', 'v2', 'v1', 'pr1', 'pr', 'circ', 'voie', 'vma', 'vosp', 'nbv', 'adr', 'Num_Acc', 'hrmn', 'com', 'lat', 'long', 'id_vehicule', 'an']


In [30]:
trainDatasetDF.drop(columns=tooMuchValueFeatureList, inplace=True)
trainDatasetDF.drop(columns=tooLessValueFeatureList, inplace=True)
print(trainDatasetDF.shape)
print(countDFNull(trainDatasetDF))

(79623, 34)
264


In [36]:
# 求原始各列众数
trainDatasetOriginModeDict = {}
for colName in trainDatasetDFOrigin:
    trainDatasetOriginModeDict[colName] = trainDatasetDFOrigin[colName].mode()[0]
    print(trainDatasetOriginModeDict)
print(trainDatasetOriginModeDict)

{'Num_Acc': 201900049689, 'jour': 18.0, 'mois': 7.0, 'an': 2019.0, 'hrmn': 0.75, 'lum': 1.0, 'dep': '75', 'com': '75116', 'agg': 2.0, 'int': 1.0, 'atm': 1.0, 'col': 3.0, 'adr': 'AUTOROUTE A86', 'lat': 431213200.0, 'long': 59533100.0, 'catr': 4.0, 'voie': '1', 'v1': 0.0, 'v2': 'D', 'circ': 2.0, 'nbv': 2.0, 'vosp': 0.0, 'prof': 1.0, 'pr': 0.0, 'pr1': 0.0, 'plan': 1.0, 'lartpc': 0.0, 'larrout': 7.0, 'surf': 1.0, 'infra': 0.0, 'situ': 1.0, 'vma': 50.0, 'id_vehicule': '138\xa0212\xa0691', 'num_veh': 'A01', 'place': 1, 'catu': 1, 'grav': 1, 'sexe': 1, 'an_nais': 2000, 'trajet': 5, 'secu1': 1, 'secu2': 0, 'secu3': -1, 'locp': 0, 'actp': '0', 'etatp': -1, 'senc': 1, 'catv': 7, 'obs': 0, 'obsm': 2, 'choc': 1, 'manv': 1, 'motor': 1, 'occutc': 1.0}


In [39]:
objectFeatureList = []
for colName in trainDatasetDF:
    if trainDatasetDF[colName].dtype == 'object':
        objectFeatureList.append(colName)
print(objectFeatureList)

['dep', 'num_veh', 'actp']


In [41]:
unusedFeatureList.extend(objectFeatureList)
print(unusedFeatureList)
trainDatasetDF.drop(columns=objectFeatureList, inplace=True)
print(trainDatasetDF.shape)

['lartpc', 'larrout', 'occutc', 'v2', 'v1', 'pr1', 'pr', 'circ', 'voie', 'vma', 'vosp', 'nbv', 'adr', 'lartpc', 'larrout', 'occutc', 'v2', 'v1', 'pr1', 'pr', 'circ', 'voie', 'vma', 'vosp', 'nbv', 'adr', 'Num_Acc', 'hrmn', 'com', 'lat', 'long', 'id_vehicule', 'an', 'dep', 'num_veh', 'actp', 'dep', 'num_veh', 'actp']
(79623, 31)


In [42]:
print(countDFNull(trainDatasetDF))

264


In [48]:
print(trainDatasetDF.isnull().sum().to_dict())

{'jour': 0, 'mois': 0, 'lum': 0, 'agg': 0, 'int': 0, 'atm': 0, 'col': 0, 'catr': 0, 'prof': 22, 'plan': 13, 'surf': 27, 'infra': 63, 'situ': 139, 'place': 0, 'catu': 0, 'grav': 0, 'sexe': 0, 'an_nais': 0, 'trajet': 0, 'secu1': 0, 'secu2': 0, 'secu3': 0, 'locp': 0, 'etatp': 0, 'senc': 0, 'catv': 0, 'obs': 0, 'obsm': 0, 'choc': 0, 'manv': 0, 'motor': 0}


In [49]:
finalBadSampleDict = {}
trainSampleNullSeries = trainDatasetDF.T.isnull().sum().sort_values(ascending=False) # 倒序排列
trainSampleNullDict = trainSampleNullSeries.to_dict()
print("type : ", type(trainSampleNullSeries))
finalBadSampleDict = {key:trainSampleNullDict[key] for key in trainSampleNullDict if trainSampleNullDict[key] > 0}
print("缺失%d个以上特征的样本数量 : "%0, len(finalBadSampleDict))

type :  <class 'pandas.core.series.Series'>
缺失0个以上特征的样本数量 :  211


In [50]:
# 删除选出的bad sample整行
trainDatasetDF.drop(index=finalBadSampleDict.keys(), inplace=True)
print(trainDatasetDF.shape)

(79412, 31)


In [51]:
countDFNull(trainDatasetDF)

0

In [57]:
trainSplitDF, testSplitDF = train_test_split(trainDatasetDF, test_size=0.15, random_state=42)
print(len(trainSplitDF.groupby('grav').get_group(1)))
print(len(trainSplitDF.groupby('grav').get_group(2)))
print(len(trainSplitDF.groupby('grav').get_group(3)))
print(len(trainSplitDF.groupby('grav').get_group(4)))

28143
1768
10579
27010


In [60]:
trainSplitX = trainSplitDF.drop(columns='grav')
trainSplitY = trainSplitDF.loc[:,'grav']
testSplitX = testSplitDF.drop(columns='grav')
testSplitY = testSplitDF.loc[:,'grav']

In [61]:
print(trainSplitX.shape)
print(trainSplitY.shape)
print(testSplitX.shape)
print(testSplitY.shape)

(67500, 30)
(67500,)
(11912, 30)
(11912,)


In [62]:
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier

preprocessor = make_pipeline(StandardScaler())
RandomPipeline = make_pipeline(preprocessor,RandomForestClassifier(random_state=0))
AdaPipeline = make_pipeline(preprocessor,AdaBoostClassifier(random_state=0))

In [64]:
RandomPipeline.fit(trainSplitX,trainSplitY)
predTestSplitY = RandomPipeline.predict(testSplitX)
print(accuracy_score(testSplitY, predTestSplitY))

0.6756212222968435
