In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer
import yaml

In [2]:
try:
    with open ("../config.yaml", 'r') as file:
        config = yaml.safe_load(file)
except Exception as e:
    print('Error reading the config file')

In [3]:
file1 = pd.read_csv(config['data']+'measurements.csv')
file1.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45.0,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,


In [4]:
file2 = pd.read_excel(config['data']+'measurements2.xlsx')
file2.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


In [5]:
file1.tail()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
383,16,37,39,245,18,,SP98,0,0,0,,
384,161,43,38,25,31,AC,SP98,1,0,0,,
385,16,38,45,25,19,,SP98,0,0,0,,
386,154,46,42,25,31,AC,SP98,1,0,0,,
387,147,5,25,25,30,AC,SP98,1,0,0,,


In [6]:
file2.tail()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
383,16.0,3.7,39,24.5,18,,SP98,0,0,0,,
384,16.1,4.3,38,25.0,31,AC,SP98,1,0,0,,
385,16.0,3.8,45,25.0,19,,SP98,0,0,0,,
386,15.4,4.6,42,25.0,31,AC,SP98,1,0,0,,
387,14.7,5.0,25,25.0,30,AC,SP98,1,0,0,,


In [7]:
file1.shape

(388, 12)

In [8]:
file2.shape

(388, 12)

In [9]:
file1.describe()

Unnamed: 0,speed,temp_outside,AC,rain,sun
count,388.0,388.0,388.0,388.0,388.0
mean,41.927835,11.358247,0.07732,0.123711,0.082474
std,13.598524,6.991542,0.267443,0.329677,0.275441
min,14.0,-5.0,0.0,0.0,0.0
25%,32.75,7.0,0.0,0.0,0.0
50%,40.5,10.0,0.0,0.0,0.0
75%,50.0,16.0,0.0,0.0,0.0
max,90.0,31.0,1.0,1.0,1.0


In [10]:
file2.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,refill liters
count,388.0,388.0,388.0,376.0,388.0,388.0,388.0,388.0,13.0
mean,19.652835,4.912371,41.927835,21.929521,11.358247,0.07732,0.123711,0.082474,37.115385
std,22.667837,1.033172,13.598524,1.010455,6.991542,0.267443,0.329677,0.275441,8.587282
min,1.3,3.3,14.0,19.0,-5.0,0.0,0.0,0.0,10.0
25%,11.8,4.3,32.75,21.5,7.0,0.0,0.0,0.0,37.6
50%,14.6,4.7,40.5,22.0,10.0,0.0,0.0,0.0,38.0
75%,19.0,5.3,50.0,22.5,16.0,0.0,0.0,0.0,39.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0,45.0


## Looks like the 2 datasets could contain the same information (head and tail are the same, and the shape and descriptions are identical), but the excel file has already a . instead of a , as a separator for decimals so will use the second to save time in cleaning that.

In [11]:
data = file2.copy()

In [12]:
data.columns

Index(['distance', 'consume', 'speed', 'temp_inside', 'temp_outside',
       'specials', 'gas_type', 'AC', 'rain', 'sun', 'refill liters',
       'refill gas'],
      dtype='object')

In [13]:
data.rename(columns={'refill liters': 'refill_liters', 'refill gas': 'refill_gas'}, inplace=True)

## Checking for NaN values

In [14]:
for col in data.columns:
    print(col + ':', data[col].isna().sum(), 'nas\n', round((data[col].isna().sum()*100)/data.shape[0],2), '% of all values\n')

distance: 0 nas
 0.0 % of all values

consume: 0 nas
 0.0 % of all values

speed: 0 nas
 0.0 % of all values

temp_inside: 12 nas
 3.09 % of all values

temp_outside: 0 nas
 0.0 % of all values

specials: 295 nas
 76.03 % of all values

gas_type: 0 nas
 0.0 % of all values

AC: 0 nas
 0.0 % of all values

rain: 0 nas
 0.0 % of all values

sun: 0 nas
 0.0 % of all values

refill_liters: 375 nas
 96.65 % of all values

refill_gas: 375 nas
 96.65 % of all values



### specials, refill_liters and refill_gas have a considerable percentage of nas, so we will have to deal with them.

In [15]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 388 entries, 0 to 387
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   distance       388 non-null    float64
 1   consume        388 non-null    float64
 2   speed          388 non-null    int64  
 3   temp_inside    376 non-null    float64
 4   temp_outside   388 non-null    int64  
 5   specials       93 non-null     object 
 6   gas_type       388 non-null    object 
 7   AC             388 non-null    int64  
 8   rain           388 non-null    int64  
 9   sun            388 non-null    int64  
 10  refill_liters  13 non-null     float64
 11  refill_gas     13 non-null     object 
dtypes: float64(4), int64(5), object(3)
memory usage: 36.5+ KB


### - specials

In [16]:
data['specials'].unique()

array([nan, 'AC rain', 'AC', 'rain', 'snow', 'AC snow',
       'half rain half sun', 'sun', 'AC sun', 'sun ac', 'ac', 'AC Sun',
       'ac rain'], dtype=object)

Looks like it refers to weather conditions. We will fill out the missing values with 'other' and reduce the cardinality.

In [17]:
data['specials'].fillna('other', inplace=True)

In [18]:
data['specials'].replace(to_replace='ac', value='AC', inplace=True)
data['specials'].replace(to_replace='AC Sun', value='AC sun', inplace=True)
data['specials'].replace(to_replace='sun ac', value='AC sun', inplace=True)
data['specials'].replace(to_replace='ac rain', value='AC rain', inplace=True)
data['specials'].replace(to_replace='half rain half sun', value='rain+sun', inplace=True)

In [19]:
data['specials'].unique()

array(['other', 'AC rain', 'AC', 'rain', 'snow', 'AC snow', 'rain+sun',
       'sun', 'AC sun'], dtype=object)

### - temp_inside

In [20]:
data['temp_inside'].unique()

array([21.5, 22.5, 20. ,  nan, 21. , 20.5, 23. , 23.5, 25. , 24. , 22. ,
       19. , 24.5, 25.5])

Low nas value for temp_inside so we will fill it out with KNN inputer.

In [21]:
imputer = KNNImputer(n_neighbors=5)
data['temp_inside'] = imputer.fit_transform(data[['temp_inside']])
data['temp_inside'] = round(data['temp_inside'],2)

In [22]:
data['temp_inside'].unique()

array([21.5 , 22.5 , 20.  , 21.93, 21.  , 20.5 , 23.  , 23.5 , 25.  ,
       24.  , 22.  , 19.  , 24.5 , 25.5 ])

### - refill

In [23]:
data['refill_liters'].unique()

array([45. ,  nan, 37.6, 37.7, 38. , 38.3, 10. , 39. , 41. , 37. , 37.2])

In [24]:
data['refill_gas'].unique()

array(['E10', nan, 'SP98'], dtype=object)

Looks like the amount of fuel liters used to refill the tank with and which fuel was used, so if there is no data it can be due to no refill. We will fill out with 0 and 'no' respectively.

In [25]:
data['refill_liters'].fillna(0, inplace = True)
data['refill_gas'].fillna('no', inplace = True)

In [26]:
data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill_liters,refill_gas
0,28.0,5.0,26,21.5,12,other,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,other,E10,0,0,0,0.0,no
2,11.2,5.5,38,21.5,15,other,E10,0,0,0,0.0,no
3,12.9,3.9,36,21.5,14,other,E10,0,0,0,0.0,no
4,18.5,4.5,46,21.5,15,other,E10,0,0,0,0.0,no


In [27]:
data.describe()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,refill_liters
count,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0,388.0
mean,19.652835,4.912371,41.927835,21.929536,11.358247,0.07732,0.123711,0.082474,1.243557
std,22.667837,1.033172,13.598524,0.994666,6.991542,0.267443,0.329677,0.275441,6.856419
min,1.3,3.3,14.0,19.0,-5.0,0.0,0.0,0.0,0.0
25%,11.8,4.3,32.75,21.5,7.0,0.0,0.0,0.0,0.0
50%,14.6,4.7,40.5,22.0,10.0,0.0,0.0,0.0,0.0
75%,19.0,5.3,50.0,22.5,16.0,0.0,0.0,0.0,0.0
max,216.1,12.2,90.0,25.5,31.0,1.0,1.0,1.0,45.0


In [28]:
for col in data.columns:
    print(col + ':', data[col].isna().sum(), 'nas\n', round((data[col].isna().sum()*100)/data.shape[0],2), '% of all values\n')

distance: 0 nas
 0.0 % of all values

consume: 0 nas
 0.0 % of all values

speed: 0 nas
 0.0 % of all values

temp_inside: 0 nas
 0.0 % of all values

temp_outside: 0 nas
 0.0 % of all values

specials: 0 nas
 0.0 % of all values

gas_type: 0 nas
 0.0 % of all values

AC: 0 nas
 0.0 % of all values

rain: 0 nas
 0.0 % of all values

sun: 0 nas
 0.0 % of all values

refill_liters: 0 nas
 0.0 % of all values

refill_gas: 0 nas
 0.0 % of all values



# This concludes the data cleaning. For exploration please refer to the **Exploration notebook**

In [29]:
data.to_csv(config['data']+'data_cleaned.csv', index=False)