In [1]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns


In [2]:
#import dataset

cobify = pd.read_csv('measurements.csv')
cobify

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,AC,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16,37,39,245,18,,SP98,0,0,0,,
384,161,43,38,25,31,AC,SP98,1,0,0,,
385,16,38,45,25,19,,SP98,0,0,0,,
386,154,46,42,25,31,AC,SP98,1,0,0,,


### First off, looking at this dataset, I'd ask for the metadata which explains the meaning of each feature. 

-It's a bit vague here, would be nice to have descriptions of each feature

#### What is each row?

-From kaggle, heading of dataset is: "Which of two fuels is cheaper, E10 or SP 98?"

-I challenge you to predict the consumption depending on the gas type!


### More info from Kaggle

In the file, you will find the displayed distance (km); the consume (L/100km); the average speed (km/h), the temperature I had inside (°C), the temperature outside (°C), anything special that happened, if it was raining, if the air condition was on, if it was sunny enough that the car felt warm when i started it… and the gas type I was using. 

I have also two columns saying how much and which gas type I was buying. Careful with those. The numbers don't add exactly up, because I note only the rides that occur under certain conditions: If the car was not cooling down enough to have another independent measure from the one before, i don't note it.

I started writing down the data in November, changed to SP98 in winter, and back to E10 in spring. Apart from that, the data is rather clean as I was doing my own project on it already.

In [6]:
#standardize heading names to snake case
#first, change titles to lower case  (just AC here)
lower = []
for i in cobify.columns:
    lower.append(i.lower())
cobify.columns = lower

cobify

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,ac,rain,sun,refill liters,refill gas
0,28,5,26,215,12,,E10,0,0,0,45,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16,37,39,245,18,,SP98,0,0,0,,
384,161,43,38,25,31,AC,SP98,1,0,0,,
385,16,38,45,25,19,,SP98,0,0,0,,
386,154,46,42,25,31,AC,SP98,1,0,0,,


In [7]:
#Puts underscore instead of spaces
under_ = []
for i in cobify.columns:
    if ' ' in i:
        under_.append(i.replace(' ', '_'))
    else:
        under_.append(i)
cobify.columns = under_   

cobify

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,ac,rain,sun,refill_liters,refill_gas
0,28,5,26,215,12,,E10,0,0,0,45,E10
1,12,42,30,215,13,,E10,0,0,0,,
2,112,55,38,215,15,,E10,0,0,0,,
3,129,39,36,215,14,,E10,0,0,0,,
4,185,45,46,215,15,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16,37,39,245,18,,SP98,0,0,0,,
384,161,43,38,25,31,AC,SP98,1,0,0,,
385,16,38,45,25,19,,SP98,0,0,0,,
386,154,46,42,25,31,AC,SP98,1,0,0,,


In [None]:
#converting commas to periods in numerical columns

cobify['distance'] = cobify['distance'].apply(lambda x: x.replace(',', '.')).astype('float')
cobify['consume'] = cobify['consume'].apply(lambda x: x.replace(',', '.')).astype('float')

In [31]:
cobify["temp_inside"] = cobify["temp_inside"].str.replace(',','.').astype('float')


In [34]:
#temp_outside is an integer, change it into a string first
cobify['temp_outside']= cobify['temp_outside'].astype(str) 
cobify["temp_outside"] = cobify["temp_outside"].str.replace(',','.').astype('float')

In [36]:
cobify.dtypes

distance         float64
consume          float64
speed              int64
temp_inside      float64
temp_outside     float64
specials          object
gas_type          object
ac                 int64
rain               int64
sun                int64
refill_liters     object
refill_gas        object
dtype: object

In [None]:
cobify = cobify.astype({'distance':'float','consume':'float', 'temp_inside':'float', 'refill_liters':'float' })
cobify.dtypes

In [37]:
cobify

Unnamed: 0,distance,consume,speed,temp_inside,temp_outside,specials,gas_type,ac,rain,sun,refill_liters,refill_gas
0,28.0,5.0,26,21.5,12.0,,E10,0,0,0,45,E10
1,12.0,4.2,30,21.5,13.0,,E10,0,0,0,,
2,11.2,5.5,38,21.5,15.0,,E10,0,0,0,,
3,12.9,3.9,36,21.5,14.0,,E10,0,0,0,,
4,18.5,4.5,46,21.5,15.0,,E10,0,0,0,,
...,...,...,...,...,...,...,...,...,...,...,...,...
383,16.0,3.7,39,24.5,18.0,,SP98,0,0,0,,
384,16.1,4.3,38,25.0,31.0,AC,SP98,1,0,0,,
385,16.0,3.8,45,25.0,19.0,,SP98,0,0,0,,
386,15.4,4.6,42,25.0,31.0,AC,SP98,1,0,0,,


In [None]:
#next let's check NaN and unique values and duplicates

In [38]:
cobify.nunique() 

distance         174
consume           43
speed             60
temp_inside       13
temp_outside      33
specials          12
gas_type           2
ac                 2
rain               2
sun                2
refill_liters     10
refill_gas         2
dtype: int64

In [62]:
cobify['consume']. value_counts(dropna=False)

5.0     27
4.5     27
4.6     22
4.7     21
4.3     20
4.1     20
5.1     19
4.8     19
4.4     17
4.0     17
4.9     16
4.2     15
5.3     14
3.9     14
5.2     12
3.8     11
5.7     11
5.6     10
5.4      9
3.7      8
5.8      7
5.5      6
5.9      5
3.6      4
6.4      4
6.1      4
6.2      4
6.0      4
6.3      4
8.1      2
6.5      2
7.4      2
6.9      1
8.7      1
12.2     1
7.9      1
10.8     1
9.9      1
7.1      1
3.3      1
9.0      1
11.5     1
6.6      1
Name: consume, dtype: int64

In [61]:
cobify['speed']. value_counts(dropna=False)

42    18
43    17
38    16
26    15
36    15
33    15
40    15
32    12
44    12
37    12
29    11
39    11
55    11
46    11
45    10
58    10
25    10
24     9
50     9
35     9
57     8
41     8
52     7
53     6
48     6
28     6
30     6
21     6
61     6
51     6
47     5
56     5
22     5
34     4
49     4
27     4
62     4
59     3
60     3
54     3
75     3
23     3
31     3
80     3
69     2
16     2
18     2
71     2
20     2
82     2
63     2
14     1
73     1
66     1
85     1
67     1
88     1
65     1
90     1
87     1
Name: speed, dtype: int64

In [60]:
cobify['temp_inside']. value_counts(dropna=False)

21.5    133
22.0    102
22.5     59
20.0     25
21.0     13
23.0     13
NaN      12
25.0     12
24.5      7
20.5      4
24.0      3
23.5      2
25.5      2
19.0      1
Name: temp_inside, dtype: int64

In [59]:
cobify['temp_outside']. value_counts(dropna=False)

 8.0     31
 10.0    31
 7.0     28
 6.0     21
 9.0     20
 11.0    20
 18.0    19
 12.0    17
 4.0     17
 17.0    16
 16.0    16
 15.0    14
 13.0    14
 14.0    13
 3.0     13
 5.0     13
 1.0     12
 0.0      9
 21.0     9
 2.0      7
 23.0     7
 24.0     7
 19.0     5
 27.0     5
 26.0     4
 31.0     3
 25.0     3
 20.0     3
 30.0     3
 22.0     3
-3.0      2
 28.0     2
-5.0      1
Name: temp_outside, dtype: int64

In [58]:
cobify['specials']. value_counts(dropna=False)   
#this column needs to be cleaned up: lots of NaNs, repeated values

NaN                   295
rain                   32
sun                    27
AC rain                 9
ac                      8
AC                      6
snow                    3
sun ac                  3
AC snow                 1
half rain half sun      1
AC sun                  1
AC Sun                  1
ac rain                 1
Name: specials, dtype: int64

In [57]:
cobify['gas_type']. value_counts(dropna=False)  #only 2 gas types 

SP98    228
E10     160
Name: gas_type, dtype: int64

In [56]:
cobify['ac']. value_counts(dropna=False)   #0 means No A/C used, 1 means yes A/C used, I'm assuming

0    358
1     30
Name: ac, dtype: int64

In [55]:
cobify['rain']. value_counts(dropna=False) #0 means No rain, 1 means yes rain, I'm assuming

0    340
1     48
Name: rain, dtype: int64

In [54]:
cobify['sun']. value_counts(dropna=False) #0 means No sun, 1 means yes sun, I'm assuming

0    356
1     32
Name: sun, dtype: int64

In [53]:
cobify['refill_liters'].value_counts(dropna=False)
#need to fix the commas here! missed it above 
#majority of columns is NaNs

NaN     375
45        2
37,7      2
39        2
37,6      1
38        1
38,3      1
10        1
41        1
37        1
37,2      1
Name: refill_liters, dtype: int64

In [63]:
percent_missing = cobify.isnull().sum() * 100 / len(cobify)
percent_missing

distance          0.000000
consume           0.000000
speed             0.000000
temp_inside       3.092784
temp_outside      0.000000
specials         76.030928
gas_type          0.000000
ac                0.000000
rain              0.000000
sun               0.000000
refill_liters    96.649485
refill_gas       96.649485
dtype: float64

In [None]:
#next steps-- re-read the challenge, see if there's an explanation for these NaNs, otherwise, decide what to do with them