In [1]:
import pandas as pd
import numpy as np


In [2]:
# Read the raw measurements CSV into a DataFrame, then display it
data = pd.read_csv(filepath_or_buffer='../data/measurements.csv')
data

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16,37,39,245,18,,SP98,0,0,0,,
384,161,43,38,25,31,AC,SP98,1,0,0,,
385,16,38,45,25,19,,SP98,0,0,0,,
386,154,46,42,25,31,AC,SP98,1,0,0,,


In [3]:
# Rename the columns that break the lower-case snake_case naming convention
data.rename(
    columns={
        'consume': 'consumption',
        'AC': 'ac',
        'refill liters': 'refill_litres',
        'refill gas': 'refill_gas',
    },
    inplace=True,
)

In [4]:
# Check for missing values: isna() flags every missing cell and sum() adds the
# flags up per column, giving the number of missing entries in each column
data.isna().sum()

distance           0
consumption        0
speed              0
temp_inside       12
temp_outside       0
specials         295
gas_type           0
ac                 0
rain               0
sun                0
refill_litres    375
refill_gas       375
dtype: int64

In [5]:
# Drop specials, refill_litres and refill_gas: they are mostly missing
# (295, 375 and 375 missing values respectively).
# Rebinding `data` with errors='ignore' (rather than an in-place drop) keeps
# this cell re-runnable: a second run no longer raises KeyError for columns
# that have already been removed.
data = data.drop(columns=['specials', 'refill_litres', 'refill_gas'], errors='ignore')

In [6]:
# Check data types of columns: numeric columns reported as `object` are being
# held as text and still need converting
data.dtypes

distance        object
consumption     object
speed            int64
temp_inside     object
temp_outside     int64
gas_type        object
ac               int64
rain             int64
sun              int64
dtype: object

In [7]:
# Columns whose numbers were read as text because they use a decimal comma
decimal_comma_columns = ['distance', 'consumption', 'temp_inside']

for column in decimal_comma_columns:
    # Skip columns that are already numeric so the cell can be re-run safely
    if pd.api.types.is_numeric_dtype(data[column]):
        continue
    # Use the vectorised .str.replace for every column: it passes missing
    # values (NaN) through untouched, whereas calling str.replace inside
    # .apply would fail on them. regex=False makes the comma a literal match
    # regardless of the pandas version's default.
    data[column] = (
        data[column]
        .str.replace(',', '.', regex=False)
        .astype(np.float64)
    )

In [15]:
# Compare the mean and the median of temp_inside to pick a value for filling
# its missing entries
temp_inside = data['temp_inside']
print(temp_inside.mean())
print(temp_inside.median())
# Decision: fill the gaps with the median

21.929521276595743
22.0


In [16]:
# Impute the missing temp_inside readings with the column median.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the selected column: that chained in-place form is
# deprecated and, with pandas Copy-on-Write, only modifies a temporary object,
# leaving the missing values in `data` untouched.
data['temp_inside'] = data['temp_inside'].fillna(data['temp_inside'].median())

In [19]:
# Re-count the missing values per column to confirm that none remain
data.isna().sum()

distance        0
consumption     0
speed           0
temp_inside     0
temp_outside    0
gas_type        0
ac              0
rain            0
sun             0
dtype: int64

In [20]:
# Re-check the column dtypes after the conversion to float
data.dtypes

distance        float64
consumption     float64
speed             int64
temp_inside     float64
temp_outside      int64
gas_type         object
ac                int64
rain              int64
sun               int64
dtype: object

In [21]:
# Display the DataFrame for a final visual check before exporting it
data

Unnamed: 0,distance,consumption,speed,temp_inside,temp_outside,gas_type,ac,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0
...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18,SP98,0,0,0
384,16.1,4.3,38,25.0,31,SP98,1,0,0
385,16.0,3.8,45,25.0,19,SP98,0,0,0
386,15.4,4.6,42,25.0,31,SP98,1,0,0


In [24]:
# Write the cleaned DataFrame to a new CSV file, leaving out the row index
data.to_csv(path_or_buf='../data/cleaned_data.csv', index=False)