In [177]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
import warnings
warnings.filterwarnings('ignore')

In [178]:
ms1 = pd.read_csv('measurements.csv')
ms2 = pd.read_excel('measurements2.xlsx')

In [179]:
ms1.head(1)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45,E10


In [180]:
ms2.head(1)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10


In [181]:
ms = pd.concat([ms1, ms2], axis = 0)

In [182]:
ms.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 776 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   distance       776 non-null    object
 1   consume        776 non-null    object
 2   speed          776 non-null    int64 
 3   temp_inside    752 non-null    object
 4   temp_outside   776 non-null    int64 
 5   specials       186 non-null    object
 6   gas_type       776 non-null    object
 7   AC             776 non-null    int64 
 8   rain           776 non-null    int64 
 9   sun            776 non-null    int64 
 10  refill liters  26 non-null     object
 11  refill gas     26 non-null     object
dtypes: int64(5), object(7)
memory usage: 78.8+ KB


In [183]:
nan_cols = ms.isna().mean()*100

nan_cols[nan_cols > 0]

temp_inside       3.092784
specials         76.030928
refill liters    96.649485
refill gas       96.649485
dtype: float64

The above formula let us know the percentage of null values in those columns where there are null values. As we can see, refill liters and refill gas have a very high percentage of null values, which make it very difficult for us to fill those values with an accurate value. That is why we will be dropping these columns.

In [184]:
ms.drop(columns = ['refill liters', 'refill gas'], inplace = True)

In [185]:
ms.temp_inside.value_counts()

21,5    133
21.5    133
22      102
22.0    102
22.5     59
22,5     59
20.0     25
20       25
21       13
23       13
21.0     13
23.0     13
25.0     12
25       12
24.5      7
24,5      7
20.5      4
20,5      4
24.0      3
24        3
23.5      2
23,5      2
25,5      2
25.5      2
19.0      1
19        1
Name: temp_inside, dtype: int64

Here we can see that there are different formats for each value, besides, this column's format is object and shouls be float.

In [186]:
tmp_in = []

for e in ms.temp_inside:
    e = str(e)
    e = e.replace(',','.').strip()
    e = float(e)
    tmp_in.append(e)

In [187]:
ms.temp_inside = tmp_in

In [188]:
ms.temp_inside.value_counts()

21.5    266
22.0    204
22.5    118
20.0     50
21.0     26
23.0     26
25.0     24
24.5     14
20.5      8
24.0      6
23.5      4
25.5      4
19.0      2
Name: temp_inside, dtype: int64

In [189]:
ms.temp_inside.mean(), float(ms.temp_inside.mode()), ms.temp_inside.median()

(21.929521276595743, 21.5, 22.0)

Now that this columns has its values unified, lets highlight that all the values are between 19.0 and 25.0, and the median, mode and mean are very close to each other. The decision taken will be to fill the null values of this column with the most common value: the mean.

In [190]:
ms.temp_inside = ms.temp_inside.fillna(float(ms.temp_inside.mode()))

In [191]:
nan_cols = ms.isna().sum()

nan_cols[nan_cols > 0]

specials    590
dtype: int64

In [192]:
ms.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun
0,28,5,26,21.5,12,,E10,0,0,0
1,12,42,30,21.5,13,,E10,0,0,0
2,112,55,38,21.5,15,,E10,0,0,0
3,129,39,36,21.5,14,,E10,0,0,0
4,185,45,46,21.5,15,,E10,0,0,0


In [194]:
sp = []
for e in ms.specials:
    e = str(e)
    e = e.upper()
    if e == 'NAN':
        sp.append('no specials')
    else:
        sp.append(e)

In [195]:
ms.specials = sp

In [196]:
ds = []
for e in ms.distance:
    e = str(e)
    e = e.replace(',','.')
    ds.append(float(e))

In [197]:
cs = []
for e in ms.consume:
    e = str(e)
    e = e.replace(',','.')
    cs.append(float(e))

In [198]:
ms.distance = ds
ms.consume = cs

In [199]:
ms.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,no specials,E10,0,0,0
1,12.0,4.2,30,21.5,13,no specials,E10,0,0,0
2,11.2,5.5,38,21.5,15,no specials,E10,0,0,0
3,12.9,3.9,36,21.5,14,no specials,E10,0,0,0
4,18.5,4.5,46,21.5,15,no specials,E10,0,0,0


In [200]:
ms.to_csv('measurements-cleaned.csv', index = False)