# Check values before feature selection in both training and test data

- NaN (missing values)
- different enough values

In [19]:
import pandas as pd
import glob
import os

# Load every CSV found in the training directory and stack them into a single
# frame with a fresh 0..n-1 index (ignore_index=True).
# NOTE(review): the training path uses '../Data' while the test path uses
# '../data' — on a case-sensitive filesystem these are different directories;
# confirm both exist.
training_df = pd.concat(
    [pd.read_csv(csv_path) for csv_path in glob.glob(os.path.join('../Data/Train', "*.csv"))],
    ignore_index=True,
)
# The test set is a single file; `test_data` and `test_df` name the same frame.
test_data = pd.read_csv('../data/test.csv')
test_df = test_data

In [23]:
# Cache the (rows, columns) shape of both frames; nan_analysis prints them
# next to the shape of the rows that contain missing values.
training_df_shape, test_df_shape = training_df.shape, test_df.shape


def nan_analysis(column_name):
    """Report the rows with a missing value in ``column_name``.

    Prints two lines, first for the training data and then for the test
    data. Each line shows the shape ``(rows, columns)`` of the subset of
    rows whose ``column_name`` is null, followed by the shape of the full
    frame. Both lines use the same label, so the order is the only way to
    tell them apart.

    Reads the module-level ``training_df``, ``test_df``,
    ``training_df_shape`` and ``test_df_shape``.

    Returns:
        A one-column DataFrame (``station``) holding the training rows whose
        ``column_name`` is null, with the original training index preserved.
    """
    # Training side: keep the filtered rows, they are also the return value.
    train_missing_mask = training_df[column_name].isnull()
    train_missing_rows = training_df[train_missing_mask]
    print(f'Number of Nan for {column_name}: {train_missing_rows.shape} of {training_df_shape}')

    # Test side: only the shape of the filtered rows is needed.
    test_missing_mask = test_df[column_name].isnull()
    test_missing_shape = test_df[test_missing_mask].shape
    print(f'Number of Nan for {column_name}: {test_missing_shape} of {test_df_shape}')

    return train_missing_rows[['station']]


def value_analysis(column_name):
    """Return the ``describe()`` statistics of ``column_name`` side by side.

    Reads the module-level ``training_df`` and ``test_df``. The two summary
    tables are joined on their index (the statistic names), and because both
    carry the same column name, the suffixes ``training`` and ``test`` are
    appended to tell the two columns apart.
    """
    training_summary = training_df[[column_name]].describe()
    test_summary = test_df[[column_name]].describe()
    return training_summary.merge(
        test_summary,
        left_index=True,
        right_index=True,
        suffixes=('training', 'test'),
    )


# Weather Data

In [24]:
# Precipitation: print the missing-value shapes and compare the
# training/test summary statistics.
precipitation = 'precipitation.l.m2'
# Stations of the training rows whose precipitation value is missing.
precipitation_nan = nan_analysis(precipitation)
value_analysis(precipitation)

# -> The training data contains only zero precipitation values (mean, std and
#    max are all 0), so precipitation is not a good feature.

Number of Nan for precipitation.l.m2: (75, 25) of (55875, 25)
Number of Nan for precipitation.l.m2: (0, 25) of (2250, 25)


Unnamed: 0,precipitation.l.m2training,precipitation.l.m2test
count,55800.0,2250.0
mean,0.0,0.008622
std,0.0,0.092475
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,0.0,0.0
max,0.0,2.6


In [25]:
# Temperature: print the missing-value shapes and compare the
# training/test summary statistics.
column = 'temperature.C'
# Stations of the training rows whose temperature value is missing.
temperature_nan = nan_analysis(column)
value_analysis(column)

# The minimum temperature is quite different between training and test
# (13.9 vs 3.1), but there seems to be enough data.

Number of Nan for temperature.C: (75, 25) of (55875, 25)
Number of Nan for temperature.C: (0, 25) of (2250, 25)


Unnamed: 0,temperature.Ctraining,temperature.Ctest
count,55800.0,2250.0
mean,21.707796,12.926578
std,3.271253,4.084343
min,13.9,3.1
25%,19.4,9.9
50%,21.35,12.9
75%,23.4,15.975
max,34.6,25.0


In [26]:
# Maximum wind speed: print the missing-value shapes and compare the
# training/test summary statistics.
column = 'windMaxSpeed.m.s'
# Stations of the training rows whose maximum wind speed is missing.
windmax_nan = nan_analysis(column)
value_analysis(column)

Number of Nan for windMaxSpeed.m.s: (75, 25) of (55875, 25)
Number of Nan for windMaxSpeed.m.s: (0, 25) of (2250, 25)


Unnamed: 0,windMaxSpeed.m.straining,windMaxSpeed.m.stest
count,55800.0,2250.0
mean,11.406855,11.377333
std,8.664456,9.22172
min,0.0,0.0
25%,4.8,4.8
50%,9.7,8.0
75%,16.1,14.5
max,43.5,62.8


In [27]:
# Mean wind speed: print the missing-value shapes and compare the
# training/test summary statistics.
column = 'windMeanSpeed.m.s'
# Stations of the training rows whose mean wind speed is missing.
windmean_nan = nan_analysis(column)
value_analysis(column)

Number of Nan for windMeanSpeed.m.s: (75, 25) of (55875, 25)
Number of Nan for windMeanSpeed.m.s: (0, 25) of (2250, 25)


Unnamed: 0,windMeanSpeed.m.straining,windMeanSpeed.m.stest
count,55800.0,2250.0
mean,4.694489,4.079911
std,4.583093,4.40474
min,0.0,0.0
25%,1.6,1.6
50%,3.2,3.2
75%,6.4,6.4
max,27.4,25.7


In [28]:
# Wind direction: print the missing-value shapes and compare the
# training/test summary statistics.
column = 'windDirection.grades'
# Stations of the training rows whose wind direction is missing; the index of
# this frame is compared with the other weather columns further down.
winddir_nan = nan_analysis(column)
value_analysis(column)

Number of Nan for windDirection.grades: (375, 25) of (55875, 25)
Number of Nan for windDirection.grades: (0, 25) of (2250, 25)


Unnamed: 0,windDirection.gradestraining,windDirection.gradestest
count,55500.0,2250.0
mean,170.227162,181.7168
std,86.912565,67.840312
min,0.0,0.0
25%,135.0,157.5
50%,180.0,180.0
75%,202.5,225.0
max,337.5,337.5


In [29]:
# Relative humidity: print the missing-value shapes and compare the
# training/test summary statistics.
column = 'relHumidity.HR'
# Stations of the training rows whose relative humidity is missing; the index
# of this frame is the reference for the set comparisons further down.
relhum_nan = nan_analysis(column)
value_analysis(column)

Number of Nan for relHumidity.HR: (75, 25) of (55875, 25)
Number of Nan for relHumidity.HR: (0, 25) of (2250, 25)


Unnamed: 0,relHumidity.HRtraining,relHumidity.HRtest
count,55800.0,2250.0
mean,65.943548,64.204889
std,16.724256,17.402368
min,18.0,17.0
25%,54.0,50.0
50%,68.0,65.0
75%,81.0,79.0
max,91.0,97.0


In [30]:
# Air pressure: print the missing-value shapes and compare the
# training/test summary statistics.
column = 'airPressure.mb'
# Stations of the training rows whose air pressure is missing.
airpressure_nan = nan_analysis(column)
value_analysis(column)

Number of Nan for airPressure.mb: (75, 25) of (55875, 25)
Number of Nan for airPressure.mb: (0, 25) of (2250, 25)


Unnamed: 0,airPressure.mbtraining,airPressure.mbtest
count,55800.0,2250.0
mean,1002.257258,1010.362089
std,42.523692,17.065828
min,811.9,896.9
25%,1009.675,1005.7
50%,1015.35,1013.55
75%,1021.5,1021.5
max,1084.9,1033.6


In [38]:
# Every weather measure is missing in 75 training rows (wind direction in 375).
# An empty difference means every row that lacks air pressure also lacks
# relative humidity.
set(airpressure_nan.index) - set(relhum_nan.index)


set()

In [40]:
# Training row indices where wind direction is missing although relative
# humidity is present.
set(winddir_nan.index) - set(relhum_nan.index)

{193,
 218,
 222,
 334,
 938,
 963,
 967,
 1079,
 1683,
 1708,
 1712,
 1824,
 2428,
 2453,
 2457,
 2569,
 3173,
 3198,
 3202,
 3314,
 3918,
 3943,
 3947,
 4059,
 4663,
 4688,
 4692,
 4804,
 5408,
 5433,
 5437,
 5549,
 6153,
 6178,
 6182,
 6294,
 6898,
 6923,
 6927,
 7039,
 7643,
 7668,
 7672,
 7784,
 8388,
 8413,
 8417,
 8529,
 9133,
 9158,
 9162,
 9274,
 9878,
 9903,
 9907,
 10019,
 10623,
 10648,
 10652,
 10764,
 11368,
 11393,
 11397,
 11509,
 12113,
 12138,
 12142,
 12254,
 12858,
 12883,
 12887,
 12999,
 13603,
 13628,
 13632,
 13744,
 14348,
 14373,
 14377,
 14489,
 15093,
 15118,
 15122,
 15234,
 15838,
 15863,
 15867,
 15979,
 16583,
 16608,
 16612,
 16724,
 17328,
 17353,
 17357,
 17469,
 18073,
 18098,
 18102,
 18214,
 18818,
 18843,
 18847,
 18959,
 19563,
 19588,
 19592,
 19704,
 20308,
 20333,
 20337,
 20449,
 21053,
 21078,
 21082,
 21194,
 21798,
 21823,
 21827,
 21939,
 22543,
 22568,
 22572,
 22684,
 23288,
 23313,
 23317,
 23429,
 24033,
 24058,
 24062,
 24174,
 24778