# Clean dataset

In [1]:
# Imports

import pandas as pd

from sklearn.linear_model import LinearRegression

In [2]:
# Load the raw measurements workbook into a DataFrame

data = pd.read_excel(io='data/measurements2.xlsx')

In [3]:
# Preview the first five rows of the freshly loaded data
data.head(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28.0,5.0,26,21.5,12,,E10,0,0,0,45.0,E10
1,12.0,4.2,30,21.5,13,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15,,E10,0,0,0,,


In [4]:
# Count missing values per column
data.isnull().sum()

distance           0
consume            0
speed              0
temp_inside       12
temp_outside       0
specials         295
gas_type           0
AC                 0
rain               0
sun                0
refill liters    375
refill gas       375
dtype: int64

In [5]:
# 'refill liters' and 'refill gas' are each missing in 375 rows, which is too
# sparse to be useful, so both columns are removed.

data.drop(columns=['refill liters', 'refill gas'], inplace=True)

In [6]:
# Drop column 'specials' as its information is already one-hot-encoded in
# 'AC', 'rain' and 'sun'.

# Capture the rows that carry a 'specials' label next to the three flag columns
# *before* dropping, so the claim above can be inspected in the cell output.
specials_check = data.loc[data['specials'].notna(), ['specials', 'AC', 'rain', 'sun']]

data = data.drop(columns='specials')

# Only the last expression of a cell is rendered; show each distinct
# label / flag combination once instead of every labelled row.
specials_check.drop_duplicates()

In [7]:
# Preview the frame after the column drops
data.head(n=5)

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,gas_type,AC,rain,sun
0,28.0,5.0,26,21.5,12,E10,0,0,0
1,12.0,4.2,30,21.5,13,E10,0,0,0
2,11.2,5.5,38,21.5,15,E10,0,0,0
3,12.9,3.9,36,21.5,14,E10,0,0,0
4,18.5,4.5,46,21.5,15,E10,0,0,0


In [8]:
# Inspect how many rows there are per 'gas_type' value

data.loc[:, 'gas_type'].value_counts()

SP98    228
E10     160
Name: gas_type, dtype: int64

In [9]:
# Encode 'gas_type' as indicator columns. With only two fuel types, drop_first
# leaves a single dummy column and the dropped category acts as the baseline.

gas_type_one_hot = pd.get_dummies(data['gas_type'], drop_first=True)

# Replace the original text column with the indicator column(s), appended last
data = pd.concat([data.drop(columns='gas_type'), gas_type_one_hot], axis=1)

data.head()

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,AC,rain,sun,SP98
0,28.0,5.0,26,21.5,12,0,0,0,0
1,12.0,4.2,30,21.5,13,0,0,0,0
2,11.2,5.5,38,21.5,15,0,0,0,0
3,12.9,3.9,36,21.5,14,0,0,0,0
4,18.5,4.5,46,21.5,15,0,0,0,0


In [10]:
# Re-check missing values after dropping and encoding columns
data.isnull().sum()

distance         0
consume          0
speed            0
temp_inside     12
temp_outside     0
AC               0
rain             0
sun              0
SP98             0
dtype: int64

In [11]:
# Summary statistics of the only column that still has missing values
data.loc[:, 'temp_inside'].describe()

count    376.000000
mean      21.929521
std        1.010455
min       19.000000
25%       21.500000
50%       22.000000
75%       22.500000
max       25.500000
Name: temp_inside, dtype: float64

In [12]:
# Fill the missing 'temp_inside' values using linear regression: fit on the rows
# where the temperature is known, with every other column as a predictor, then
# predict the rows where it is missing.
# NOTE(review): 'consume' is among the predictors; if it is the target of a
# later model, consider excluding it here to avoid leakage — confirm.

missing_temp = data['temp_inside'].isna()

train = data[~missing_temp]
test = data[missing_temp].copy()

X_train = train.drop(columns='temp_inside')
y_train = train['temp_inside']
X_test = test.drop(columns='temp_inside')

model = LinearRegression()
model.fit(X_train, y_train)

# Guard the prediction so the cell can be re-run: once nothing is missing,
# X_test is empty and scikit-learn refuses to predict on zero samples.
if not X_test.empty:
    y_pred = model.predict(X_test)
    test['temp_inside'] = y_pred

    # Write the predictions back by mask instead of concatenating train and
    # test, which would move the imputed rows to the end of the frame.
    data.loc[missing_temp, 'temp_inside'] = y_pred

In [13]:
# Confirm that no missing values remain after imputation
data.isnull().sum()

distance        0
consume         0
speed           0
temp_inside     0
temp_outside    0
AC              0
rain            0
sun             0
SP98            0
dtype: int64