This notebook takes the numveh data (base-year vehicle stock numbers) prepared by AEMDA as input and writes the cleaned result to a new file.

Some cleaning of the AEMDA data is required:
1. AEMDA appended a batch of TechID==10 motorbike rows at the end of the data, so any duplicate rows are summed (groupby; agg).
2. The attributes AEMDA assigned to some technologies differ slightly from the technology table. For example, some technologies have a different EngineID assigned, and there may be other mismatches. So we look up each TechID in the technology table and assign the correct parameters from it.

In addition, it adds numveh rows for VehTypeIDs 6, 7 and 8 (trains, ships and planes), taken from the UK data and scaled by a simple proportion.

In [287]:
import pandas as pd

In [288]:
# Read the AEMDA output data: the vehicle stock table (NumVeh by TechID and Age).
# TODO: here we are using the update (Jan 24) without erroneous techIDs
numveh = pd.read_csv('./../out_data/VehicleStock.csv')
numveh

Unnamed: 0.1,Unnamed: 0,Vehicle category,Segment,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh
0,5,Passenger cars,"PC petrol <1,4L",9,70.0,5,1.0,3.0,1.0,1.0,1.0,84.696857
1,6,Passenger cars,"PC petrol <1,4L",9,70.0,6,1.0,3.0,1.0,1.0,1.0,169.393714
2,7,Passenger cars,"PC petrol <1,4L",9,70.0,7,1.0,3.0,1.0,1.0,1.0,592.878000
3,8,Passenger cars,"PC petrol <1,4L",9,70.0,8,1.0,3.0,1.0,1.0,1.0,677.574857
4,9,Passenger cars,"PC petrol <1,4L",9,70.0,9,1.0,3.0,1.0,1.0,1.0,931.665429
...,...,...,...,...,...,...,...,...,...,...,...,...
4095,8153,Motorcycles,MC 4S 251-750cc,9,10.0,2,1.0,2.0,4.0,6.0,1.0,81.488000
4096,8154,Motorcycles,MC 4S 251-750cc,9,10.0,3,1.0,2.0,4.0,6.0,1.0,61.116000
4097,8155,Motorcycles,MC 4S 251-750cc,9,10.0,4,1.0,2.0,4.0,6.0,1.0,30.558000
4098,8156,Motorcycles,MC 4S 251-750cc,9,10.0,5,1.0,2.0,4.0,6.0,1.0,20.372000


In [289]:
# Read the technology table; it is the reference used to correct the
# per-TechID attributes in the AEMDA data.
technology = pd.read_csv('./../../technology.csv')
technology

Unnamed: 0,TechID,ModeID,VehTypeID,MassCatID,FuelID,EngineID,TransTypeID,Availability,Final_Year,HybridFlag,spec_energ_consump,spec_energ_electric,noise_class,PurchasePrice,ExpectedLifeFactor,Capacity,AveEconLife,SecondHandImportFlag,note
0,10,1,2,4,1,10,1,1980,2000.0,0,6.57,,0,5000.0,1,2.0,10,0,
1,20,1,2,4,12,10,1,1995,2010.0,0,9.51,,2,7000.0,1,2.0,10,0,
2,25,1,2,4,12,11,1,2010,2020.0,0,8.38,,2,6000.0,1,2.0,10,0,
3,26,1,2,4,12,12,1,2020,,0,7.18,,2,5500.0,1,2.0,10,0,
4,30,1,2,4,1,11,1,2000,2010.0,0,5.65,,0,5000.0,1,2.0,10,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2241,14889,1,5,28,13,13,2,2033,2038.0,0,179.20,1.0,2,14000.0,1,2.5,10,1,
2242,14890,1,5,28,13,13,2,2038,2043.0,0,153.89,1.0,2,12000.0,1,2.5,10,1,
2243,14891,1,5,28,13,13,2,2043,2048.0,0,132.15,1.0,2,12000.0,1,2.5,10,1,
2244,14892,1,5,28,13,13,2,2048,2053.0,0,113.48,1.0,2,12000.0,1,2.5,10,1,


In [290]:
# Make corrections in case a TechID and its attributes are mis-aligned: the
# technology table is treated as the reference, and every attribute column in
# numveh is overwritten with the value the technology table holds for that
# row's TechID.
attribute_columns = ['TransTypeID', 'VehTypeID', 'MassCatID', 'EngineID', 'FuelID']

# One row per TechID, indexed by TechID, so each attribute is a direct lookup
# instead of a fresh filter of the whole table for every row of numveh.
tech_lookup = (
    technology.drop_duplicates(subset='TechID')
    .set_index('TechID')[attribute_columns]
)
# Align the key dtype with numveh.TechID so the lookup does not depend on
# implicit int/float matching.
tech_lookup.index = tech_lookup.index.astype(numveh['TechID'].dtype)

# Rows whose TechID is absent from the technology table cannot be corrected;
# report them rather than failing or silently blanking their attributes.
unmatched = ~numveh['TechID'].isin(tech_lookup.index)
if unmatched.any():
    print(f'{unmatched.sum()} rows have a TechID missing from the technology table; their attributes were left unchanged')

for column in attribute_columns:
    corrected = numveh['TechID'].map(tech_lookup[column])
    # Keep the existing value where no correction is available and preserve
    # the column's original dtype.
    numveh[column] = corrected.fillna(numveh[column]).astype(numveh[column].dtype)

numveh

Unnamed: 0.1,Unnamed: 0,Vehicle category,Segment,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh
0,5,Passenger cars,"PC petrol <1,4L",9,70.0,5,1.0,3.0,1.0,1.0,1.0,84.696857
1,6,Passenger cars,"PC petrol <1,4L",9,70.0,6,1.0,3.0,1.0,1.0,1.0,169.393714
2,7,Passenger cars,"PC petrol <1,4L",9,70.0,7,1.0,3.0,1.0,1.0,1.0,592.878000
3,8,Passenger cars,"PC petrol <1,4L",9,70.0,8,1.0,3.0,1.0,1.0,1.0,677.574857
4,9,Passenger cars,"PC petrol <1,4L",9,70.0,9,1.0,3.0,1.0,1.0,1.0,931.665429
...,...,...,...,...,...,...,...,...,...,...,...,...
4095,8153,Motorcycles,MC 4S 251-750cc,9,10.0,2,1.0,2.0,4.0,10.0,1.0,81.488000
4096,8154,Motorcycles,MC 4S 251-750cc,9,10.0,3,1.0,2.0,4.0,10.0,1.0,61.116000
4097,8155,Motorcycles,MC 4S 251-750cc,9,10.0,4,1.0,2.0,4.0,10.0,1.0,30.558000
4098,8156,Motorcycles,MC 4S 251-750cc,9,10.0,5,1.0,2.0,4.0,10.0,1.0,20.372000


In [291]:
# Diagnose any unexpected FuelIDs: for every FuelID in the technology table,
# report how many numveh rows use a TechID with that fuel and how many
# vehicles those rows hold in total.
for fuel_id in technology.FuelID.unique():
    fuel_techids = technology.loc[technology.FuelID == fuel_id, 'TechID']
    fuel_rows = numveh[numveh.TechID.isin(fuel_techids)]
    print(f'FuelID: {fuel_id} has {len(fuel_rows)} entries, total Veh: {fuel_rows.NumVeh.sum()}')

FuelID: 1 has 1160 entries, total Veh: 1059506.0
FuelID: 12 has 0 entries, total Veh: 0.0
FuelID: 13 has 0 entries, total Veh: 0.0
FuelID: 2 has 2844 entries, total Veh: 295121.6923076923
FuelID: 8 has 0 entries, total Veh: 0.0
FuelID: 7 has 0 entries, total Veh: 0.0
FuelID: 14 has 0 entries, total Veh: 0.0
FuelID: 5 has 0 entries, total Veh: 0.0
FuelID: 10 has 96 entries, total Veh: 4634.307692307692
FuelID: 15 has 0 entries, total Veh: 0.0


In [292]:
# Report, for every VehTypeID in the technology table, the number of numveh
# rows using a TechID of that type and the total vehicles they hold.
for veh_type_id in technology.VehTypeID.unique():
    type_techids = technology.loc[technology.VehTypeID == veh_type_id, 'TechID']
    type_rows = numveh[numveh.TechID.isin(type_techids)]
    print(f'VehTypeID: {veh_type_id} has {len(type_rows)} entries, total Veh: {type_rows.NumVeh.sum()}')

VehTypeID: 2 has 178 entries, total Veh: 541467.0
VehTypeID: 3 has 1862 entries, total Veh: 532406.0
VehTypeID: 4 has 832 entries, total Veh: 164282.0
VehTypeID: 5 has 1228 entries, total Veh: 121107.0
VehTypeID: 6 has 0 entries, total Veh: 0.0
VehTypeID: 7 has 0 entries, total Veh: 0.0
VehTypeID: 8 has 0 entries, total Veh: 0.0


In [293]:
# TODO: WE ARE FIXING ERRONEOUS DATA: THERE ARE CNG BUSES (FUELID 10) IN THE MIX. WE SHALL REASSIGN THEM TO DIESEL BUSES
#
# For each Age at which CNG buses (FuelID 10, MassCatID 13) occur, their
# vehicles are shared equally between the non-CNG bus rows (FuelID != 10,
# MassCatID 13) of the same Age, and the CNG rows are then dropped.

# TechIDs of the two bus groups, looked up once from the technology table.
bus_techs = technology[technology.MassCatID == 13]
cng_bus_techids = bus_techs.loc[bus_techs.FuelID == 10, 'TechID']
other_bus_techids = bus_techs.loc[bus_techs.FuelID != 10, 'TechID']

# Row masks are fixed for the whole loop: only NumVeh is modified below.
is_cng_bus = numveh.TechID.isin(cng_bus_techids)
is_other_bus = numveh.TechID.isin(other_bus_techids)

rows2drop = []
for a in numveh[is_cng_bus].Age.unique():  # ages present among CNG buses
    at_age = numveh.Age == a

    # non-CNG bus rows of this age that will receive the CNG vehicles
    recipients = numveh.index[is_other_bus & at_age]

    # go through each CNG TechID at this age and hand its vehicles over
    for t in numveh[is_cng_bus & at_age].TechID.unique():

        # nothing to receive them: leave the CNG rows in place and say so
        if recipients.empty:
            print('this is empty!')
            continue

        donors = numveh[(numveh.TechID == t) & at_age]

        # Divide by the number of RECIPIENT rows so the total vehicle count is
        # conserved; dividing by the number of CNG TechIDs only conserves it
        # when the two counts happen to be equal.
        numveh.loc[recipients, 'NumVeh'] += donors.NumVeh.sum() / len(recipients)

        # remember the CNG rows so they can be removed once the loop is done
        rows2drop.extend(donors.index)

numveh.drop(index=rows2drop, inplace=True)


In [294]:
# Re-run the FuelID diagnostic after the CNG bus reassignment: rows and total
# vehicles per FuelID, resolved through the technology table.
for fuel_id in technology.FuelID.unique():
    techids_with_fuel = technology.loc[technology.FuelID == fuel_id, 'TechID']
    matching_rows = numveh[numveh.TechID.isin(techids_with_fuel)]
    print(f'FuelID: {fuel_id} has {len(matching_rows)} entries, total Veh: {matching_rows.NumVeh.sum()}')

FuelID: 1 has 1160 entries, total Veh: 1059506.0
FuelID: 12 has 0 entries, total Veh: 0.0
FuelID: 13 has 0 entries, total Veh: 0.0
FuelID: 2 has 2844 entries, total Veh: 299756.0
FuelID: 8 has 0 entries, total Veh: 0.0
FuelID: 7 has 0 entries, total Veh: 0.0
FuelID: 14 has 0 entries, total Veh: 0.0
FuelID: 5 has 0 entries, total Veh: 0.0
FuelID: 10 has 0 entries, total Veh: 0.0
FuelID: 15 has 0 entries, total Veh: 0.0


In [295]:
# Re-run the vehicle-type summary after the CNG bus reassignment: rows and
# total vehicles per VehTypeID.
for veh_type_id in technology.VehTypeID.unique():
    techids_of_type = technology.loc[technology.VehTypeID == veh_type_id, 'TechID']
    matching_rows = numveh[numveh.TechID.isin(techids_of_type)]
    print(f'VehTypeID: {veh_type_id} has {len(matching_rows)} entries, total Veh: {matching_rows.NumVeh.sum()}')

VehTypeID: 2 has 178 entries, total Veh: 541467.0
VehTypeID: 3 has 1862 entries, total Veh: 532406.0
VehTypeID: 4 has 736 entries, total Veh: 164282.0
VehTypeID: 5 has 1228 entries, total Veh: 121107.0
VehTypeID: 6 has 0 entries, total Veh: 0.0
VehTypeID: 7 has 0 entries, total Veh: 0.0
VehTypeID: 8 has 0 entries, total Veh: 0.0


In [296]:
# Keep only the model columns, dropping the descriptive AEMDA columns.
# Selecting by name (rather than by position with iloc) makes the cell safe to
# re-run: a positional slice would strip three more columns on every run.
model_columns = [
    'CountryID', 'TechID', 'Age', 'TransTypeID', 'VehTypeID',
    'MassCatID', 'EngineID', 'FuelID', 'NumVeh',
]
numveh = numveh[model_columns]
numveh

Unnamed: 0,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh
0,9,70.0,5,1.0,3.0,1.0,1.0,1.0,84.696857
1,9,70.0,6,1.0,3.0,1.0,1.0,1.0,169.393714
2,9,70.0,7,1.0,3.0,1.0,1.0,1.0,592.878000
3,9,70.0,8,1.0,3.0,1.0,1.0,1.0,677.574857
4,9,70.0,9,1.0,3.0,1.0,1.0,1.0,931.665429
...,...,...,...,...,...,...,...,...,...
4095,9,10.0,2,1.0,2.0,4.0,10.0,1.0,81.488000
4096,9,10.0,3,1.0,2.0,4.0,10.0,1.0,61.116000
4097,9,10.0,4,1.0,2.0,4.0,10.0,1.0,30.558000
4098,9,10.0,5,1.0,2.0,4.0,10.0,1.0,20.372000


In [297]:
# Load the original UK vehicle stock. It is the source of the train, ship and
# plane rows, each of which gets scaled by a simple proportion.
numvehuk = pd.read_excel(io='./../sample_data/VehicleStock.xlsx')
numvehuk

Unnamed: 0,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh
0,9,10,12,1,2,4,10,1,5126.888000
1,9,10,13,1,2,4,10,1,4341.886000
2,9,10,14,1,2,4,10,1,1695.460000
3,9,10,15,1,2,4,10,1,315.866200
4,9,10,16,1,2,4,10,1,93.856110
...,...,...,...,...,...,...,...,...,...
2405,9,5879,7,2,5,28,6,12,0.008200
2406,9,5880,0,2,5,28,14,12,6.176912
2407,9,5880,1,2,5,28,14,12,4.261423
2408,9,5880,2,2,5,28,14,12,0.484696


In [298]:
# Vehicle Type IDs
#
# VehTypeID  VehTypeAB  VehTypeNA
# 0          WALK       Walking
# 1          BIKE       Bicycle
# 2          MOTO       Motorcycle
# 3          CAR        Car
# 4          BUS        Bus
# 5          TRUCK      Truck
# 6          TRAIN      Train
# 7          SHIP       Ship
# 8          PLANE      Aeroplane

# proportions UK --> Kenya
train_proportion = 0.1
ship_proportion = 0.1
plane_proportion = 0.1

# Scale NumVeh in place for the UK train (6), ship (7) and plane (8) rows.
for veh_type_id, proportion in (
    (6, train_proportion),
    (7, ship_proportion),
    (8, plane_proportion),
):
    numvehuk.loc[numvehuk.VehTypeID == veh_type_id, 'NumVeh'] *= proportion

In [299]:
# Add the UK rows for trains, ships and planes (VehTypeIDs 6, 7, 8) to the
# stock table and renumber the index from 0. pd.concat is used because
# DataFrame.append is deprecated and was removed in pandas 2.0.
non_road_rows = numvehuk[numvehuk.VehTypeID.isin([6, 7, 8])]
numveh = pd.concat([numveh, non_road_rows], ignore_index=True)

  numveh = numveh.append(numvehuk[numvehuk.VehTypeID.isin([6,7,8])])


In [300]:
# Vehicle-type summary after adding the train, ship and plane rows: rows and
# total vehicles per VehTypeID.
for veh_type_id in technology.VehTypeID.unique():
    techids_of_type = technology.loc[technology.VehTypeID == veh_type_id, 'TechID']
    rows_of_type = numveh[numveh.TechID.isin(techids_of_type)]
    print(f'VehTypeID: {veh_type_id} has {len(rows_of_type)} entries, total Veh: {rows_of_type.NumVeh.sum()}')

VehTypeID: 2 has 178 entries, total Veh: 541467.0
VehTypeID: 3 has 1862 entries, total Veh: 532406.0
VehTypeID: 4 has 736 entries, total Veh: 164282.0
VehTypeID: 5 has 1228 entries, total Veh: 121107.0
VehTypeID: 6 has 313 entries, total Veh: 707.3098297988581
VehTypeID: 7 has 84 entries, total Veh: 50.100001434694484
VehTypeID: 8 has 251 entries, total Veh: 110.7788481977615


In [301]:
# Display the combined stock table, now including the appended rows for VehTypeIDs 6, 7 and 8.
numveh

Unnamed: 0,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh
0,9,70.0,5,1.0,3.0,1.0,1.0,1.0,8.469686e+01
1,9,70.0,6,1.0,3.0,1.0,1.0,1.0,1.693937e+02
2,9,70.0,7,1.0,3.0,1.0,1.0,1.0,5.928780e+02
3,9,70.0,8,1.0,3.0,1.0,1.0,1.0,6.775749e+02
4,9,70.0,9,1.0,3.0,1.0,1.0,1.0,9.316654e+02
...,...,...,...,...,...,...,...,...,...
4647,9,4275.0,16,1.0,8.0,45.0,7.0,15.0,9.028010e-02
4648,9,4275.0,32,1.0,8.0,45.0,7.0,15.0,3.025660e-12
4649,9,4275.0,33,1.0,8.0,45.0,7.0,15.0,3.782154e-16
4650,9,4275.0,34,1.0,8.0,45.0,7.0,15.0,2.250437e-21


In [302]:
# Get rid of duplicates: there are multiple instances where a TechID and Age
# are shared. Rows that agree on every column other than NumVeh are collapsed
# into one row holding the summed NumVeh.
key_columns = [name for name in numveh.columns if name != 'NumVeh']
numveh = numveh.groupby(key_columns, as_index=False)['NumVeh'].sum()

In [303]:
# Vehicle-type summary after collapsing duplicates: the row counts may shrink
# but the total vehicles per VehTypeID should be unchanged.
for veh_type_id in technology.VehTypeID.unique():
    techids_of_type = technology.loc[technology.VehTypeID == veh_type_id, 'TechID']
    rows_of_type = numveh[numveh.TechID.isin(techids_of_type)]
    print(f'VehTypeID: {veh_type_id} has {len(rows_of_type)} entries, total Veh: {rows_of_type.NumVeh.sum()}')

VehTypeID: 2 has 27 entries, total Veh: 541467.0
VehTypeID: 3 has 1862 entries, total Veh: 532405.9999999999
VehTypeID: 4 has 702 entries, total Veh: 164282.00000000003
VehTypeID: 5 has 904 entries, total Veh: 121106.99999999997
VehTypeID: 6 has 313 entries, total Veh: 707.3098297988581
VehTypeID: 7 has 84 entries, total Veh: 50.100001434694484
VehTypeID: 8 has 251 entries, total Veh: 110.7788481977615


In [304]:
# Write the processed vehicle stock to CSV, leaving out the DataFrame index.
numveh.to_csv(path_or_buf='./../out_data/processed/numveh95.csv', index=False)

In [305]:
# Sanity check: list any rows that still repeat an earlier row on every column
# other than NumVeh (an empty result means no duplicates remain).
identifying_columns = [name for name in numveh.columns if name != 'NumVeh']
numveh[numveh.duplicated(subset=identifying_columns)]

Unnamed: 0,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh


In [306]:
# Sanity check: list any rows that repeat an earlier (TechID, Age) pair
# (an empty result means each pair occurs once).
repeated_tech_age = numveh.duplicated(subset=['TechID', 'Age'])
numveh[repeated_tech_age]

Unnamed: 0,CountryID,TechID,Age,TransTypeID,VehTypeID,MassCatID,EngineID,FuelID,NumVeh
