# Data Cleaning and Feature engineering

In [1]:
import pandas as pd
import numpy as np
import re
import time
import requests as rq
import bs4 as bs4

pd.set_option('max_colwidth', None)
pd.set_option('display.max_rows', None)

pd.set_option('max_columns', 28)


In [2]:
df = pd.read_json('parsed_link_info.json',lines = True)
df.tail(2)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra
6860,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_wv-voyage-2016-confortiline-parcelas-fixas-de-r-615-00-714041737,vwvolkswagen,27900,0,voyage,0,2016,78321,0.0,flex,0,0,0,0,0,0,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré"
6861,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_wv-voyage-completo-trend-1-6-2018-715280723,vwvolkswagen,38900,sedã,voyage,manual,2018,36000,1.6,flex,hidráulica,preto,não,voyage trend 1.6 mi total flex 8v 4p,4 portas,ipva pago,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré"


In [3]:
df.loc[df.price == '',:].tail(2)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra
6792,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_vw-volkswagen-saveiro-1-6-completa-muito-nova-sem-detalhes-737580066,vwvolkswagen,,passeio,saveiro,manual,2014,55000,0.0,flex,0,prata,0,saveiro trendline 1.6 t.flex 8v,0,0,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, c mera de ré"
6803,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_vw-volkswagen-up-higth-ma-1-0-2015-580751777,vwvolkswagen,,hatch,up,manual,2015,1,1.0,flex,hidráulica,preto,sim,up high 1.0 total flex 12v 5p,4 portas,0,"vidro elétrico, air bag, trava elétrica, ar condicionado, alarme, som"


# Basic  Data exploration

I noticed that some cells are empty.After analyzing it I came up with the conclusion that most of the cars with no price are on the website for exchange.  Also, I was wondering how could someone forget to add the price of the car when selling it on a website? This might be intentional, this person might not really want to sell the car and unloaded it in order to check if someone would offer a value. Anyway, I will delete those cars. 

In [4]:
# All rows with empty space (index)
empty_price = np.where(df.applymap(lambda x: x == ''))[0]
empty_price

array([  18,   19,   25,   26,   27,   28,   31,   34,   49,   52,   85,
        114,  159,  160,  171,  180,  245,  259,  412,  588, 1020, 1052,
       1099, 1954, 2194, 2197, 2238, 2488, 2491, 2569, 2570, 2571, 2572,
       2637, 2647, 2737, 2781, 2784, 2954, 2960, 3044, 3222, 3254, 3270,
       3367, 3537, 3652, 3771, 3891, 3892, 4176, 4217, 4220, 4228, 4232,
       4313, 4318, 4404, 4405, 4416, 4558, 4571, 4579, 4595, 4661, 4667,
       4669, 4670, 4740, 4811, 4817, 4882, 4885, 4908, 4940, 4977, 5032,
       5069, 5123, 5136, 5137, 5265, 5294, 5327, 5760, 6145, 6233, 6398,
       6442, 6457, 6508, 6560, 6596, 6645, 6662, 6663, 6679, 6685, 6686,
       6688, 6698, 6730, 6767, 6788, 6791, 6792, 6803])

In [5]:
# drop rows with empty price
df.drop(index = empty_price , inplace = True)
# reset index
df = df.reset_index(drop=True)
df.head(2)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_-unica-dona-729128175,ford,35900,hatch,ka,manual,2017,33000,1.0,flex,hidráulica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, ar condicionado, direção hidráulica, alarme, som, sensor de ré"
1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_0vendo-gol-1-0-com-ar-condicionado-baixo-km-734865745,vwvolkswagen,16400,0,gol,manual,2013,52000,0.0,flex,0,0,sim,gol 1.0 plus 8v 2p,0,ipva pago,0


In [6]:
# Sanity check
df.iloc[[18]]

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra
18,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_alo-uber-ka-1-5-sedan-2015-novissimo-729280853,ford,33990,sedã,ka,manual,2015,80000,1.5,flex,elétrica,branco,sim,ka sedan 1.5 16v flex 4p,4 portas,0,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme"


In [7]:
# shuffle dataframe
#from sklearn.utils import shuffle
#df = shuffle(df).reset_index(drop=True)

In [8]:
# save to csv
#df.to_csv('car_information.csv')

Impot to notice that there is no nan variable but it does not mean that we don't have missing value. In fact, the zeros represent the missing values.

In [9]:
df.isna().sum()

link            0
brand           0
price           0
cartype         0
model           0
gearbox         0
regdate         0
mileage         0
motorpower      0
fuel            0
car_steering    0
carcolor        0
exchange        0
version         0
doors           0
financial       0
extra           0
dtype: int64

In [10]:
# Sanity check
df.iloc[[18]]

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra
18,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_alo-uber-ka-1-5-sedan-2015-novissimo-729280853,ford,33990,sedã,ka,manual,2015,80000,1.5,flex,elétrica,branco,sim,ka sedan 1.5 16v flex 4p,4 portas,0,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme"


In [11]:
df.astype({'price': 'int64',
           'regdate':'int',
          'mileage':'int64'}).dtypes

link            object
brand           object
price            int64
cartype         object
model           object
gearbox         object
regdate          int64
mileage          int64
motorpower      object
fuel            object
car_steering    object
carcolor        object
exchange        object
version         object
doors           object
financial       object
extra           object
dtype: object

Delete Rows where price, model and mileage equals zero.

In [12]:
# Rows where price equals zero - index
zero_price = df.loc[df.price == 0 , : ].index
zero_price

Int64Index([ 194,  465, 1072, 1239, 1418, 1443, 1673, 1909, 1911, 2037, 2038,
            2039, 2040, 2229, 2340, 2345, 2511, 2612, 2647, 2650, 2743, 2816,
            2833, 3053, 3281, 3400, 3402, 3423, 3476, 3719, 3727, 3887, 4426,
            4717, 4740, 5299, 5565, 5573, 5592, 5604, 5729, 5829, 5855, 5856,
            5874, 6365, 6403, 6509, 6529, 6540, 6697, 6742],
           dtype='int64')

In [13]:
def del_rows(df, features):
    '''Deletes rows where specified feature is zero
    
    ARG: 
    df(dataframe): The dataframe that will be processed
    features(list): List of strings containing the features that zero  needs to be deleted
    
    RETURNS:
    df_drop(dataframe): The dataframe with specific rows deleted'''
    
    df_drop = df.copy()
    
    for feature in features:
        # Rows where price equals zero - index
        zeros_index = df_drop.loc[df_drop[feature] == 0 , : ].index
        # drow rows
        df_drop = df_drop.drop(zeros_index, axis = 0)
        
    df_drop = df_drop.reset_index(drop=True)
        
    return df_drop
        
        

In [14]:
colnames= ['price','model', 'mileage']

df = del_rows(df,colnames )

# data cleaning and data Engineering

## part 1 - Create column for each extra feature:

In order to identify all extra information I have, I will check the length of the extra column, identify the greates string lenght and see each extra feature it contains. That way, I can create specific columns for each of those feature. 

In [15]:
df['extra_len']= [len(x) if type(x) == str else x for x in df.extra]
df.head(2)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra,extra_len
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_-unica-dona-729128175,ford,35900,hatch,ka,manual,2017,33000,1.0,flex,hidráulica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, ar condicionado, direção hidráulica, alarme, som, sensor de ré",87.0
1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_0vendo-gol-1-0-com-ar-condicionado-baixo-km-734865745,vwvolkswagen,16400,0,gol,manual,2013,52000,0.0,flex,0,0,sim,gol 1.0 plus 8v 2p,0,ipva pago,0,0.0


In [16]:
df['extra_len'].max()

127.0

In [17]:
pd.set_option('max_colwidth', 128)
greater_length = df['extra_len'].max()
df.loc[df.extra_len == greater_length, :].extra.head()

101     vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, blindado, c mera de ré
883     vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, blindado, c mera de ré
1502    vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, blindado, c mera de ré
1689    vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, blindado, c mera de ré
1710    vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, blindado, c mera de ré
Name: extra, dtype: object

- Looking at the series above we can see all the extra features we can convert into columns.Let's do it.


In [18]:
# Select features as string
features = df.loc[df.extra_len == greater_length, :].extra.head(1).values
features[0]

'vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré, blindado, c mera de ré'

In [19]:
len(features[0].rsplit(','))

10

In [20]:
# Unique feature
features[0].rsplit(',')[0].strip()

'vidro elétrico'

In [21]:
# Create column for each feature in extra column
for feature in range(10):
    
    colname= features[0].rsplit(',')[feature].strip()
    df[colname] = 0.0
    


In [22]:
# drop extra_len column
df.drop('extra_len', axis = 1, inplace = True)
df.columns

Index(['link', 'brand', 'price', 'cartype', 'model', 'gearbox', 'regdate',
       'mileage', 'motorpower', 'fuel', 'car_steering', 'carcolor', 'exchange',
       'version', 'doors', 'financial', 'extra', 'vidro elétrico', 'air bag',
       'trava elétrica', 'ar condicionado', 'direção hidráulica', 'alarme',
       'som', 'sensor de ré', 'blindado', 'c mera de ré'],
      dtype='object')

In [23]:
# find the index of the column extra
columns = list(df.columns)
index = columns.index('extra')
index

16

## seting values into new feature columns

In [24]:
df.columns[(index+1):]

Index(['vidro elétrico', 'air bag', 'trava elétrica', 'ar condicionado',
       'direção hidráulica', 'alarme', 'som', 'sensor de ré', 'blindado',
       'c mera de ré'],
      dtype='object')

In [25]:
def fill_in_the_features(df):
    '''Fills in the feature cells stating whether the car contains the respective feature
    
    ARG:
    df(dataframe): The dataframe to be filled in
    
    RETURNS:
    df(dataframe): the dataframe with filled columns - 1 car has the feature and 0 car does not
    '''

    for feature in df.columns[17:]:

        total_rows = df.shape[0]

        for row in range(total_rows):

            is_zero = (df.extra[row] == 0)

            if is_zero == True:

                df[feature].values[row] = 0.0

            else:

                contains_feature = feature in df.extra[row]

                if contains_feature == True:
                    df[feature].values[row] = 1
                else:
                    continue

    
    return df
    
    

In [26]:
df = fill_in_the_features(df)
df.head(4)

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra,vidro elétrico,air bag,trava elétrica,ar condicionado,direção hidráulica,alarme,som,sensor de ré,blindado,c mera de ré
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_-unica-dona-729128175,ford,35900,hatch,ka,manual,2017,33000,1.0,flex,hidráulica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, ar condicionado, direção hidráulica, alarme, som, sensor de ré",1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_0vendo-gol-1-0-com-ar-condicionad...,vwvolkswagen,16400,0,gol,manual,2013,52000,0.0,flex,0,0,sim,gol 1.0 plus 8v 2p,0,ipva pago,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_1-6-fiesta-2012-2013-730821135,ford,23999,hatch,fiesta,manual,2013,76000,1.6,flex,hidráulica,branco,não,fiesta 1.6 8v flex class 1.6 8v flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som",1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0
3,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_13-13-completo-km-35-000-2020-vis...,vwvolkswagen,24899,sedã,voyage,manual,2013,35000,1.0,flex,hidráulica,preto,0,voyage trendline 1.0 t.flex 8v 4p,4 portas,de leilão,"vidro elétrico, trava elétrica, ar condicionado, direção hidráulica",1.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# delete extra column
df.drop('extra', axis = 1, inplace = True)

In [28]:
# Set column names
df.rename(columns = {'air bag': 'air_bag',
          'trava elétrica':'trava_elétrica',
          'ar condicionado':'ar_condicionado',
          'direção hidráulica':'direção_hidráulica',
          'sensor de ré':'sensor_de_ré',
          'c mera de ré':'camera_de_ré'}, inplace = True)
df.head(2)
          
          
          

Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,vidro elétrico,air_bag,trava_elétrica,ar_condicionado,direção_hidráulica,alarme,som,sensor_de_ré,blindado,camera_de_ré
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_-unica-dona-729128175,ford,35900,hatch,ka,manual,2017,33000,1.0,flex,hidráulica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_0vendo-gol-1-0-com-ar-condicionad...,vwvolkswagen,16400,0,gol,manual,2013,52000,0.0,flex,0,0,sim,gol 1.0 plus 8v 2p,0,ipva pago,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Part 2 - Dummy Variables

### Financial

- ipva pago : yearly fees paid
- financiado: Still paying the car
- de leilão: auto auction
- com multas: Penalty Charge Notice (PCN)

### brand 

- vwvolkswagen
- ford
- renaut (?) ;)

In [29]:
df.brand.value_counts()

vwvolkswagen    3397
ford            2970
renault            1
Name: brand, dtype: int64

### cartype
- hatch
- passeio 
- sedã 
- suv
- pickup 
- vanutilitário 
- antigo 
- conversvel

In [30]:
df.cartype.value_counts()

hatch            2351
passeio          1551
sedã             1081
0.0               664
suv               382
pickup            251
vanutilitário      86
conversvel          1
antigo              1
Name: cartype, dtype: int64

### Model

- ka              
- fiesta          
- fox             
- gol             
- voyage          
- ecosport        
- focus           
- up              
- saveiro         
- polo            
- crossfox        
- jetta           
- golf             
- fusion           
- spacefox         
- ranger           
- virtus           
- kombi            
- spacecross       
- tiguan           
- escort            
- passat            
- edge              
- amarok            
- variant           
- fusca             
- parati            
- f100              
- fluence           
- grandsaveiro      
- courier           


### gearbox
- manual
- automático
- semiautomático

In [31]:
df.gearbox.value_counts()

manual            4899
automático        1343
semiautomático      82
0.0                 44
Name: gearbox, dtype: int64

### motorpower
- 1.6          
- 1.0          
- 0.0       ---------> missing    
- 2.02.9    -----> 2.0 - 2.9     
- 1.5           
- 1.4           
- 1.8             
- 3.03.9    -------> 3.0 -3.9      
- 1.3              
- 4.0oumais --> 4.0+      
- 1.7             

In [32]:
df.motorpower.value_counts()

1.6          2376
1.0          1944
0.0           878
2.02.9        619
1.5           443
1.4            92
1.8             7
3.03.9          5
1.3             2
4.0oumais       1
1.7             1
Name: motorpower, dtype: int64

### Fuel

- flex - dual-fuel vehicle
- 0.0 - missing information
- gásnatural - NGVs


In [33]:
df.fuel.value_counts()

flex          6367
gásnatural       1
Name: fuel, dtype: int64

### car_steering

- hidráulica    ---- hydraulic
- elétrica      ------- electric power steering  
- 0.0           ------------- missing
- mecnica       ------ mechanical Steering 
- assistida     ------ power-assisted steering system

In [34]:
df.car_steering.value_counts()

hidráulica    3410
elétrica      1905
0.0            882
mecnica        161
assistida       10
Name: car_steering, dtype: int64

### Carcolor

- branco    -  white
- prata     -  silver
- preto     - black
- 0.0       -   missing
- vermelho  -   red
- cinza     -   gray
- azul      -   blue
- outra     -    others
- laranja   -    orange
- amarelo   -    yellow
- verde     -     green

In [35]:
df.carcolor.value_counts()

branco      1980
prata       1425
preto       1078
0.0          803
vermelho     573
cinza        298
azul         111
outra         65
laranja       17
amarelo       12
verde          6
Name: carcolor, dtype: int64

### exchange
- sim - yes
- não - no
- 0.0 - missing

In [36]:
df.exchange.value_counts()

sim    3355
não    1680
0.0    1333
Name: exchange, dtype: int64

### - create dummy variables

In [37]:
to_dummies = ['financial', 'brand', 'cartype', 'model','gearbox', 'motorpower', 'fuel', 'car_steering','carcolor',
             'exchange']


df = pd.get_dummies(df, columns = to_dummies, drop_first = True )

In [38]:
df.head(2)

Unnamed: 0,link,price,regdate,mileage,version,doors,vidro elétrico,air_bag,trava_elétrica,ar_condicionado,direção_hidráulica,alarme,som,sensor_de_ré,...,car_steering_hidráulica,car_steering_mecnica,carcolor_amarelo,carcolor_azul,carcolor_branco,carcolor_cinza,carcolor_laranja,carcolor_outra,carcolor_prata,carcolor_preto,carcolor_verde,carcolor_vermelho,exchange_não,exchange_sim
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_-unica-dona-729128175,35900,2017,33000,ka 1.0 se se plus tivct flex 5p,4 portas,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1,0,0,0,0,0,0,0,0,1,0,0,0,1
1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_0vendo-gol-1-0-com-ar-condicionad...,16400,2013,52000,gol 1.0 plus 8v 2p,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1


### Doors
- 4 portas - 4 doors
- 2 portas - 2 doors
- 0  - 2 doors

In [39]:
df.doors.value_counts()

4 portas    5529
2 portas     536
0.0          303
Name: doors, dtype: int64

In [40]:
df['doors'] = df['doors'].str.extract(r"(\d)", expand = False)
# replace nan (previously zero) with 2
df['doors'].fillna(2, inplace = True)
# convert to int
df['doors'] = df['doors'].astype(int)
df.head(2)

Unnamed: 0,link,price,regdate,mileage,version,doors,vidro elétrico,air_bag,trava_elétrica,ar_condicionado,direção_hidráulica,alarme,som,sensor_de_ré,...,car_steering_hidráulica,car_steering_mecnica,carcolor_amarelo,carcolor_azul,carcolor_branco,carcolor_cinza,carcolor_laranja,carcolor_outra,carcolor_prata,carcolor_preto,carcolor_verde,carcolor_vermelho,exchange_não,exchange_sim
0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_-unica-dona-729128175,35900,2017,33000,ka 1.0 se se plus tivct flex 5p,4,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,...,1,0,0,0,0,0,0,0,0,1,0,0,0,1
1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_0vendo-gol-1-0-com-ar-condicionad...,16400,2013,52000,gol 1.0 plus 8v 2p,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,1


# Data Labeling

For this project, I decided to label the training data manually. I literally opened the file on a spreadsheet and set 1 for a car I would like to buy or at least see the pictures and zero for cars I would not buy. I could have used active learning techniques for data labeling but I decided, for this project in specific, to do it manually.

In [41]:
df = pd.read_csv('car_information.csv')
df.head(2)

Unnamed: 0.1,Unnamed: 0,link,brand,price,cartype,model,gearbox,regdate,mileage,motorpower,fuel,car_steering,carcolor,exchange,version,doors,financial,extra,Y
0,0,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_ford-ka-hatch-2018-unico-dono-com...,ford,31900,hatch,ka,manual,2015,39869,1.0,flex,elétrica,preto,sim,ka 1.0 se se plus tivct flex 5p,4 portas,ipva pago,"vidro elétrico, air bag, trava elétrica, ar condicionado, direção hidráulica, alarme, som, sensor de ré",1.0
1,1,https://rj.olx.com.br/rio-de-janeiro-e-regiao/autos-e-pecas/carros-vans-e-utilitarios/link_gol-g6-12-13-714466730,vwvolkswagen,26000,passeio,gol,manual,2013,109000,1.6,flex,hidráulica,prata,sim,gol 1.6 mi plus total flex 8v 4p,4 portas,0,"vidro elétrico, trava elétrica, ar condicionado, direção hidráulica, som",1.0


As we can see below, we have a Imbalanced classes and we in have to take this into account when building the model

In [42]:

df.Y.value_counts().plot(kind = 'bar');

### Version

I will use the sklearn TfidfVectorizer transformer that convert raw documents to a matrix of TF-IDF features.

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [51]:
df.version.head()

0       ka 1.0 se se plus tivct flex 5p
1      gol 1.6 mi plus total flex 8v 4p
2            fox pepper 1.6 flex 16v 5p
3                   fiesta class 1.0 4p
4    ecosport freestyle 1.6 16v flex 5p
Name: version, dtype: object

In [46]:
version_vec = TfidfVectorizer(min_df = 2)
version_bow = version_vec

In [57]:
feat = 'any_name'

In [60]:
"{name}.pickle".format(name = feat+'_bow')

'any_name_bow.pickle'

In [48]:
#display(df.version.str.contains('1.0').sum())
#df.version.str.contains('1.6').sum()
#df.version.str.contains('2.0').sum()