In [1]:
import pandas as pd
import numpy as np
from deep_translator import GoogleTranslator
from geopy.geocoders import Nominatim
pd.set_option('display.max_rows', None)

def print_data(data,rows=5):
    display(data.head(rows))   

In [2]:
def load_json_data():
    df = pd.read_json('supplier_car.json') 
    return df

In [3]:
def clean_sort_data(data):
    df = data.drop(['entity_id'], axis=1).sort_values(['ID','Attribute Names'])
    return df

In [4]:
def explore_data(data):
    print_data(data)
    display(data['ID'].unique().size)
    display(data['Attribute Names'].unique().size)
    display(data.shape)

In [5]:
def split_pivot_data(data):  
    df1 = data[['ID','MakeText','TypeName','TypeNameFull','ModelText','ModelTypeText']].drop_duplicates().set_index('ID')
    print_data(data)
    display(df1.shape)
        
    df2 = data.pivot(index='ID', columns='Attribute Names', values='Attribute Values').reset_index().rename_axis(None, axis=1).set_index('ID')
    print_data(df2)
    display(df2.shape)
    
    return df1, df2    

In [6]:
def merge_data(dx,dy): 
    data = dx.merge(dy, on='ID')
    print_data(data)
    display(data.shape)
    return data

In [7]:
def map_to_target_data(data):
    return data[
    ['BodyTypeText',
     'BodyColorText',
     'ConditionTypeText',
     'City',
     'MakeText',
     'FirstRegYear',
     'Km',
     'ModelText',
     'ModelTypeText',
     'FirstRegMonth',
    'ConsumptionTotalText']].rename(columns={
                      'BodyTypeText' :'carType',
                      'ConditionTypeText':'condition',
                      'BodyColorText':'color',
                      'City':'city',
                      'MakeText':'make',
                      'FirstRegYear':'manufacture_year',
                      'Km':'mileage',
                      'ModelText':'model',
                      'ModelTypeText':'model_variant',
                      'FirstRegMonth':'manufacture_month',
                      'ConsumptionTotalText':'fuel_consumption_unit'})    

In [8]:
def translate(column):
    items_de = final[~final[column].isna()][column].unique()
    items = {}
    for x in items_de:
        items[x] = GoogleTranslator(source='de', target='en').translate(color).capitalize()
    return items

In [9]:
def normalize_data(data):
    
    colors = translate['color']
    carTypes = translate['carType']
    conditions = translate['condition']
    
    data['color']=final['color'].apply(lambda x: colors[x] if colors.get(x) != None else x)
    data['carType']=final['carType'].apply(lambda x: carTypes[x] if carTypes.get(x) != None else x)
    data['condition']=final['condition'].apply(lambda x: conditions[x] if conditions.get(x) != None else x)
    data['fuel_consumption_unit']=final['fuel_consumption_unit'].apply(lambda x: 'l_km_consumption' if 'l/100km' in x else x)

    return data     

In [10]:
if __name__ == '__main__':
    
    data = load_json_data()
    print_data(data)
    
    #1. Pre-processing
    data_cleaned = clean_sort_data(data)
    explore_data(data)
    dx,dy = split_pivot_data(data)
    pre_procesed_data = merge_data(dx,dy)
    
    #2. Normalisation
    mapped_data = map_to_target_data(pre_procesed_data)
    print_data(mapped_data)
    display(mapped_data.shape)
    normalized_data = normalize_data(mapped_data)
    

Unnamed: 0,ID,MakeText,TypeName,TypeNameFull,ModelText,ModelTypeText,Attribute Names,Attribute Values,entity_id
0,976,MERCEDES-BENZ,McLaren,MERCEDES-BENZ SLR McLaren,SLR,SLR McLaren,Seats,2,0001fda6-192b-46a8-bc08-0e833f904eed
1,1059,MERCEDES-BENZ,ML 350 Inspiration,MERCEDES-BENZ ML 350 Inspiration,ML 350,ML 350 Inspiration,Hp,235,00107c2d-0071-4475-88f0-810133638b7e
2,524,AUDI,S6 Avant quattro 4.2,AUDI S6 Avant quattro 4.2,S6,S6 Avant quattro 4.2,FuelTypeText,Benzin,00126794-a8ef-48fe-93d6-43cfc69fbfb6
3,608,SAAB,9-3 2.0i-16 TS Aero,SAAB 9-3 2.0i-16 TS Aero,9-3,9-3 2.0i-16 TS Aero,Ccm,1985,00182529-1bf7-4f93-89fa-2e8e634b2c9d
4,726,PORSCHE,911 Turbo Cabrio,PORSCHE 911 Turbo Cabrio,911,911 Turbo Cabrio,BodyColorText,schwarz mét.,002d30c2-43f6-4905-868f-160dbc445c56


Unnamed: 0,ID,MakeText,TypeName,TypeNameFull,ModelText,ModelTypeText,Attribute Names,Attribute Values,entity_id
0,976,MERCEDES-BENZ,McLaren,MERCEDES-BENZ SLR McLaren,SLR,SLR McLaren,Seats,2,0001fda6-192b-46a8-bc08-0e833f904eed
1,1059,MERCEDES-BENZ,ML 350 Inspiration,MERCEDES-BENZ ML 350 Inspiration,ML 350,ML 350 Inspiration,Hp,235,00107c2d-0071-4475-88f0-810133638b7e
2,524,AUDI,S6 Avant quattro 4.2,AUDI S6 Avant quattro 4.2,S6,S6 Avant quattro 4.2,FuelTypeText,Benzin,00126794-a8ef-48fe-93d6-43cfc69fbfb6
3,608,SAAB,9-3 2.0i-16 TS Aero,SAAB 9-3 2.0i-16 TS Aero,9-3,9-3 2.0i-16 TS Aero,Ccm,1985,00182529-1bf7-4f93-89fa-2e8e634b2c9d
4,726,PORSCHE,911 Turbo Cabrio,PORSCHE 911 Turbo Cabrio,911,911 Turbo Cabrio,BodyColorText,schwarz mét.,002d30c2-43f6-4905-868f-160dbc445c56


1153

19

(21906, 9)

Unnamed: 0,ID,MakeText,TypeName,TypeNameFull,ModelText,ModelTypeText,Attribute Names,Attribute Values,entity_id
0,976,MERCEDES-BENZ,McLaren,MERCEDES-BENZ SLR McLaren,SLR,SLR McLaren,Seats,2,0001fda6-192b-46a8-bc08-0e833f904eed
1,1059,MERCEDES-BENZ,ML 350 Inspiration,MERCEDES-BENZ ML 350 Inspiration,ML 350,ML 350 Inspiration,Hp,235,00107c2d-0071-4475-88f0-810133638b7e
2,524,AUDI,S6 Avant quattro 4.2,AUDI S6 Avant quattro 4.2,S6,S6 Avant quattro 4.2,FuelTypeText,Benzin,00126794-a8ef-48fe-93d6-43cfc69fbfb6
3,608,SAAB,9-3 2.0i-16 TS Aero,SAAB 9-3 2.0i-16 TS Aero,9-3,9-3 2.0i-16 TS Aero,Ccm,1985,00182529-1bf7-4f93-89fa-2e8e634b2c9d
4,726,PORSCHE,911 Turbo Cabrio,PORSCHE 911 Turbo Cabrio,911,911 Turbo Cabrio,BodyColorText,schwarz mét.,002d30c2-43f6-4905-868f-160dbc445c56


(1153, 5)

Unnamed: 0_level_0,BodyColorText,BodyTypeText,Ccm,City,Co2EmissionText,ConditionTypeText,ConsumptionRatingText,ConsumptionTotalText,Doors,DriveTypeText,FirstRegMonth,FirstRegYear,FuelTypeText,Hp,InteriorColorText,Km,Properties,Seats,TransmissionTypeText
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
1,anthrazit,Limousine,3199,Zuzwil,275 g/km,Occasion,,11.5 l/100km,4,Allrad,1,1999,Benzin,224,grau,31900,"""Ab MFK""",5,Automat
2,anthrazit,Kombi,4991,Zuzwil,333 g/km,Occasion,G,14.0 l/100km,5,Allrad,7,2008,Benzin,580,,25400,"""Ab MFK""",5,Automat sequentiell
3,anthrazit,Kombi,4172,Zuzwil,350 g/km,Occasion,G,14.6 l/100km,5,Allrad,10,2002,Benzin,450,beige,38500,"""Ab MFK""",5,Automat sequentiell
4,anthrazit,Coupé,6162,Zuzwil,291 g/km,Occasion,G,12.7 l/100km,2,Hinterradantrieb,6,2015,Benzin,660,schwarz,200,"""Ab MFK"", ""Direkt-/Parallelimport""",2,Schaltgetriebe manuell
5,anthrazit,SUV / Geländewagen,4806,Zuzwil,270 g/km,Occasion,G,11.5 l/100km,5,Allrad,1,2010,Benzin,500,schwarz,2900,"""Ab MFK""",5,Automat sequentiell


(1153, 19)

Unnamed: 0_level_0,MakeText,TypeName,TypeNameFull,ModelText,ModelTypeText,BodyColorText,BodyTypeText,Ccm,City,Co2EmissionText,...,DriveTypeText,FirstRegMonth,FirstRegYear,FuelTypeText,Hp,InteriorColorText,Km,Properties,Seats,TransmissionTypeText
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
976,MERCEDES-BENZ,McLaren,MERCEDES-BENZ SLR McLaren,SLR,SLR McLaren,silber mét.,Cabriolet,5439,Zuzwil,,...,Hinterradantrieb,10,2007,Benzin,626,schwarz,29800,"""Ab MFK""",2,Automatik-Getriebe
1059,MERCEDES-BENZ,ML 350 Inspiration,MERCEDES-BENZ ML 350 Inspiration,ML 350,ML 350 Inspiration,silber mét.,SUV / Geländewagen,3724,Zuzwil,305 g/km,...,Allrad,10,2003,Benzin,235,schwarz,126300,"""Ab MFK""",5,Automat
524,AUDI,S6 Avant quattro 4.2,AUDI S6 Avant quattro 4.2,S6,S6 Avant quattro 4.2,schwarz,Kombi,4170,Zuzwil,353 g/km,...,Allrad,6,1997,Benzin,290,beige,91900,"""Ab MFK""",5,Automat
608,SAAB,9-3 2.0i-16 TS Aero,SAAB 9-3 2.0i-16 TS Aero,9-3,9-3 2.0i-16 TS Aero,schwarz mét.,Limousine,1985,Zuzwil,216 g/km,...,Vorderradantrieb,9,2001,Benzin,205,schwarz,134600,"""Ab MFK""",5,Schaltgetriebe manuell
726,PORSCHE,911 Turbo Cabrio,PORSCHE 911 Turbo Cabrio,911,911 Turbo Cabrio,schwarz mét.,Cabriolet,3600,Zuzwil,309 g/km,...,Allrad,4,2008,Benzin,480,braun,35500,"""Ab MFK""",2,Schaltgetriebe manuell


(1153, 24)

Unnamed: 0_level_0,carType,color,condition,city,make,manufacture_year,mileage,model,model_variant,manufacture_month,fuel_consumption_unit
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
976,Cabriolet,silber mét.,Occasion,Zuzwil,MERCEDES-BENZ,2007,29800,SLR,SLR McLaren,10,
1059,SUV / Geländewagen,silber mét.,Occasion,Zuzwil,MERCEDES-BENZ,2003,126300,ML 350,ML 350 Inspiration,10,12.7 l/100km
524,Kombi,schwarz,Occasion,Zuzwil,AUDI,1997,91900,S6,S6 Avant quattro 4.2,6,14.9 l/100km
608,Limousine,schwarz mét.,Occasion,Zuzwil,SAAB,2001,134600,9-3,9-3 2.0i-16 TS Aero,9,9.0 l/100km
726,Cabriolet,schwarz mét.,Occasion,Zuzwil,PORSCHE,2008,35500,911,911 Turbo Cabrio,4,12.9 l/100km


(1153, 11)

TypeError: 'function' object is not subscriptable