In [183]:
import importlib
import math
import os
import re
import sys

import numpy as np
import pandas as pd
import plotly.express as px
from IPython.core.interactiveshell import InteractiveShell
from IPython.display import display

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'

# The parent of the current working directory is treated as the project root.
project_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
utilities_dir = os.path.join(project_dir, 'src', 'utilities')
# Guard the append so re-running this cell does not pile up duplicate entries
# on sys.path.
if utilities_dir not in sys.path:
    sys.path.append(utilities_dir)
# Must stay below the sys.path change: the module is resolved through it.
import utilities as utils

In [184]:
# Uncomment if viewing the notebook in a browser (widens the cell container)
# from IPython.core.display import HTML
# display(HTML("<style>.container { width:90% !important; }</style>"))

In [185]:
# Run if changes are made to the datasets_config file or utilities.
# Reloads the already-imported `utils` module object in place so the edited
# code is picked up; the reloaded module is echoed as the cell output.
importlib.reload(utils)

<module 'utilities' from 'c:\\Users\\Jonah\\Development\\binary-rainfall-time-series-ml\\src\\utilities\\utilities.py'>

In [186]:
# All notebook I/O lives under <project>/data, split into raw inputs and
# processed outputs.
io_dir = os.path.join(project_dir, 'data')
raw_dir, processed_dir = (
    os.path.join(io_dir, stage) for stage in ('raw', 'processed')
)

In [187]:
# Read the three raw CSV files; the first column of each file becomes the index.
raw_train_df, raw_test_df, raw_sample_submission_df = (
    pd.read_csv(os.path.join(raw_dir, csv_name), index_col=0)
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

In [188]:
# Quick sanity check of the raw inputs: the shape of each frame first, then
# the first rows of each. Every bare expression below is meant to be echoed,
# which relies on ast_node_interactivity being set to 'all'.
raw_train_df.shape
raw_test_df.shape
raw_sample_submission_df.shape
raw_train_df.head()
raw_test_df.head()
raw_sample_submission_df.head()

(2190, 12)

(730, 11)

(730, 1)

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


Unnamed: 0_level_0,rainfall
id,Unnamed: 1_level_1
2190,0
2191,0
2192,0
2193,0
2194,0


### Let's check if the days are consecutive and fall within 365-day years, and if any leap years are present.

In [189]:
# Stack train and test into one frame. The test rows get their `rainfall`
# column from the sample submission file (joined on the shared id index), so
# both halves end up with the same set of columns.
test_with_rainfall_df = raw_test_df.merge(
    raw_sample_submission_df, left_index=True, right_index=True)
combined_df = pd.concat([raw_train_df, test_with_rainfall_df], axis=0)

# Label each row by origin. Index.isin evaluates the membership test for the
# whole index at once instead of once per row through DataFrame.apply.
combined_df['source_data'] = np.where(
    combined_df.index.isin(raw_train_df.index), 'Train', 'Test')

# Sanity checks: ids must not collide between the two sources, and the
# per-source row counts should match the raw frames.
combined_df.index.is_unique
raw_train_df.shape
raw_test_df.shape
raw_sample_submission_df.shape
combined_df.loc[combined_df['source_data'] == 'Train'].shape
combined_df.loc[combined_df['source_data'] == 'Test'].shape
combined_df.head()
combined_df.tail()

True

(2190, 12)

(730, 11)

(730, 1)

(2190, 13)

(730, 13)

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1,Train
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1,Train
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1,Train
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1,Train
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0,Train


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2915,361,1020.8,18.2,17.6,16.1,13.7,96.0,95.0,0.0,20.0,34.3,0,Test
2916,362,1011.7,23.2,18.1,16.0,16.0,78.0,80.0,1.6,40.0,25.2,0,Test
2917,363,1022.7,21.0,18.5,17.0,15.5,92.0,96.0,0.0,50.0,21.9,0,Test
2918,364,1014.4,21.0,20.0,19.7,19.8,94.0,93.0,0.0,50.0,39.5,0,Test
2919,365,1020.9,22.2,18.8,17.0,13.3,79.0,89.0,0.2,60.0,50.6,0,Test


In [190]:
# Number of rows recorded for each `day` value, followed by the largest and
# smallest of those counts.
rows_per_day = combined_df.groupby(['day'])[['rainfall']].count()
rows_per_day
rows_per_day.max()
rows_per_day.min()

Unnamed: 0_level_0,rainfall
day,Unnamed: 1_level_1
1,8
2,8
3,9
4,9
5,9
...,...
361,8
362,8
363,7
364,7


rainfall    9
dtype: int64

rainfall    7
dtype: int64

In [191]:
# Day of year implied by the row id, assuming 365-day years.
# Add 1 since the index starts at 0.
combined_df['expected_day'] = combined_df.index % 365 + 1
# Flag rows whose recorded day disagrees with the implied one. This is a
# column-wise comparison rather than a row-by-row apply.
combined_df['mislabeled_day'] = combined_df['day'] != combined_df['expected_day']
combined_df['elapsed_days'] = np.arange(len(combined_df.index))
# elapsed_days starts at 0, so floor-dividing by 365 and adding 1 gives a
# 1-based year number (the same value as ceil((elapsed_days + 1) / 365)).
combined_df['year'] = combined_df['elapsed_days'] // 365 + 1

# Select rows and column in a single .loc call instead of chained indexing.
is_train = combined_df['source_data'] == 'Train'
is_test = combined_df['source_data'] == 'Test'
combined_df.loc[is_train, 'year'].unique()
combined_df.loc[is_train, 'mislabeled_day'].unique()
combined_df.loc[is_test, 'year'].unique()
combined_df.loc[is_test, 'mislabeled_day'].unique()

array([1, 2, 3, 4, 5, 6])

array([False,  True])

array([7, 8])

array([False])

In [192]:
# For every row whose recorded day disagrees with the expected one, show the
# surrounding rows and record the expected day as the candidate correction.
non_consecutive_day_indexes = combined_df[combined_df['mislabeled_day']].index.to_list()
window_distance = 2  # number of neighbouring rows shown on each side
potential_corrections = []
# Maps each recorded (wrong) day to the list of expected days it was found in place of.
original_and_corrected_days = {}
for i in non_consecutive_day_indexes:
    window = list(range(i - window_distance, i + window_distance + 1))
    # Show the window explicitly: a bare expression inside a loop is only
    # echoed when ast_node_interactivity is 'all'. reindex (rather than .loc)
    # keeps ids that fall outside the frame as all-NaN rows instead of raising.
    display(combined_df.reindex(window))
    day = combined_df.at[i, 'day']
    expected_day = combined_df.at[i, 'expected_day']
    potential_corrections.append(expected_day)
    original_and_corrected_days.setdefault(day, []).append(expected_day)

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1035,306,1016.9,27.5,24.5,23.5,22.4,82.0,88.0,4.5,50.0,24.9,1,Train,306,False,1035,3
1036,307,1014.6,25.2,24.8,23.9,21.2,84.0,70.0,2.1,80.0,33.5,1,Train,307,False,1036,3
1037,3,1015.2,25.0,23.9,23.6,19.9,76.0,61.0,8.8,90.0,14.1,0,Train,308,True,1037,3
1038,309,1017.0,27.7,24.0,22.6,19.6,91.0,78.0,3.3,25.0,16.4,1,Train,309,False,1038,3
1039,310,1018.4,26.4,24.4,23.3,23.3,86.0,66.0,9.7,60.0,8.3,1,Train,310,False,1039,3


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1130,36,1017.0,22.4,20.2,18.5,13.1,76.0,69.0,7.6,40.0,11.3,0,Train,36,False,1130,4
1131,37,1017.5,23.1,19.9,17.4,16.8,80.0,88.0,0.0,50.0,39.5,1,Train,37,False,1131,4
1132,265,1020.6,18.4,17.1,16.9,15.0,75.0,86.0,0.0,40.0,24.9,1,Train,38,True,1132,4
1133,39,1014.9,23.3,21.6,20.9,20.9,87.0,88.0,0.0,60.0,15.1,1,Train,39,False,1133,4
1134,40,1014.0,23.0,21.1,20.6,19.1,80.0,87.0,0.1,70.0,14.5,1,Train,40,False,1134,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1208,114,1013.0,28.8,25.6,23.6,21.6,91.0,90.0,1.3,80.0,24.9,0,Train,114,False,1208,4
1209,115,1012.3,30.8,27.4,26.8,24.5,84.0,81.0,2.3,100.0,14.3,1,Train,115,False,1209,4
1210,80,1011.5,29.8,26.8,25.1,23.2,87.0,81.0,2.5,20.0,12.7,1,Train,116,True,1210,4
1211,117,1011.6,28.4,26.1,24.8,25.3,85.0,83.0,3.7,60.0,14.3,1,Train,117,False,1211,4
1212,118,1013.0,28.6,25.5,24.8,22.2,76.0,74.0,6.0,60.0,9.2,1,Train,118,False,1212,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1249,155,1008.2,34.0,30.2,28.2,26.5,81.0,79.0,6.8,290.0,43.8,1,Train,155,False,1249,4
1250,156,1008.1,34.0,30.8,28.7,25.2,74.0,22.0,11.8,240.0,12.5,0,Train,156,False,1250,4
1251,236,1005.0,35.3,31.0,26.4,26.4,74.0,49.0,10.8,230.0,28.6,1,Train,157,True,1251,4
1252,158,1008.9,33.4,30.4,28.6,25.7,82.0,72.0,7.7,60.0,40.8,0,Train,158,False,1252,4
1253,159,1005.3,32.0,29.2,26.2,26.3,87.0,85.0,1.5,220.0,4.5,1,Train,159,False,1253,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1282,188,1008.3,33.2,30.0,28.8,25.2,81.0,77.0,7.8,220.0,22.5,1,Train,188,False,1282,4
1283,189,1009.5,33.3,30.0,27.8,25.8,79.0,43.0,9.5,220.0,11.2,0,Train,189,False,1283,4
1284,4,1007.9,33.0,30.1,27.8,25.1,78.0,67.0,5.7,220.0,12.5,1,Train,190,True,1284,4
1285,191,1005.7,31.0,29.2,27.8,26.2,83.0,83.0,4.7,60.0,14.8,1,Train,191,False,1285,4
1286,192,1008.0,33.0,30.2,28.4,25.2,78.0,69.0,10.8,220.0,12.0,0,Train,192,False,1286,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1288,194,1006.1,32.3,30.0,28.1,24.7,79.0,57.0,6.1,210.0,20.2,0,Train,194,False,1288,4
1289,195,1007.9,32.3,29.1,28.9,26.4,79.0,78.0,8.2,270.0,25.1,1,Train,195,False,1289,4
1290,16,1008.4,32.3,29.0,26.5,24.4,79.0,40.0,11.2,220.0,12.4,0,Train,196,True,1290,4
1291,197,1008.4,34.4,30.1,28.5,25.6,72.0,41.0,10.4,20.0,4.4,0,Train,197,False,1291,4
1292,198,1008.1,33.1,30.1,28.1,25.8,75.0,46.0,10.2,240.0,4.5,0,Train,198,False,1292,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1310,216,1008.1,31.7,29.1,26.5,26.0,82.0,85.0,0.4,230.0,25.9,1,Train,216,False,1310,4
1311,217,1008.3,29.1,27.9,25.8,23.2,72.0,69.0,7.1,70.0,25.9,1,Train,217,False,1311,4
1312,5,1005.7,31.7,28.5,27.3,24.3,76.0,49.0,7.2,240.0,15.1,1,Train,218,True,1312,4
1313,219,1008.1,32.0,28.1,26.8,25.4,84.0,78.0,4.6,220.0,15.7,1,Train,219,False,1313,4
1314,220,1001.4,32.7,29.7,27.3,25.1,78.0,67.0,5.4,220.0,11.7,1,Train,220,False,1314,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1316,222,1005.7,32.4,29.4,28.2,26.6,81.0,78.0,6.0,220.0,20.5,1,Train,222,False,1316,4
1317,223,1008.5,30.3,27.6,25.1,26.4,89.0,80.0,4.0,70.0,10.3,1,Train,223,False,1317,4
1318,6,1002.5,32.0,28.9,26.4,24.9,84.0,73.0,8.4,20.0,9.5,1,Train,224,True,1318,4
1319,225,1008.9,29.9,26.6,25.9,23.2,85.0,82.0,2.1,120.0,7.9,1,Train,225,False,1319,4
1320,226,1011.5,28.6,26.8,24.5,23.2,80.0,88.0,1.2,60.0,6.6,1,Train,226,False,1320,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1344,250,1010.5,30.8,28.2,26.1,25.2,83.0,81.0,2.1,140.0,25.1,1,Train,250,False,1344,4
1345,251,1011.4,31.7,28.0,26.9,23.7,75.0,54.0,6.0,20.0,9.1,0,Train,251,False,1345,4
1346,151,1008.4,32.8,29.4,27.0,24.3,74.0,27.0,10.6,220.0,12.4,0,Train,252,True,1346,4
1347,253,1010.0,31.5,29.8,27.4,20.4,77.0,60.0,10.2,20.0,20.9,0,Train,253,False,1347,4
1348,254,1013.3,29.4,27.7,25.8,24.5,84.0,77.0,1.8,80.0,34.8,1,Train,254,False,1348,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1350,256,1014.2,30.9,28.1,25.2,25.2,84.0,60.0,11.1,140.0,12.9,0,Train,256,False,1350,4
1351,257,1013.7,31.0,28.1,26.8,24.9,81.0,84.0,1.0,210.0,50.6,1,Train,257,False,1351,4
1352,81,1011.9,31.9,28.0,25.8,25.6,81.0,89.0,1.1,100.0,24.0,1,Train,258,True,1352,4
1353,259,1011.2,29.8,26.4,24.4,23.1,80.0,51.0,6.1,220.0,7.9,1,Train,259,False,1353,4
1354,260,1008.1,28.1,27.9,25.3,22.2,79.0,84.0,2.0,10.0,50.6,1,Train,260,False,1354,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1365,271,1017.0,31.7,28.3,25.3,23.4,84.0,78.0,4.2,100.0,11.7,1,Train,271,False,1365,4
1366,272,1017.0,29.1,27.5,26.0,20.6,65.0,51.0,8.6,80.0,39.3,0,Train,272,False,1366,4
1367,264,1011.2,26.8,24.5,23.2,23.7,77.0,56.0,6.0,80.0,37.7,1,Train,273,True,1367,4
1368,274,1016.8,28.4,24.5,22.3,19.4,74.0,49.0,6.4,10.0,21.9,0,Train,274,False,1368,4
1369,275,1008.8,27.6,25.0,24.5,24.3,62.0,88.0,8.2,230.0,39.5,1,Train,275,False,1369,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1371,277,1016.7,29.2,25.8,23.1,17.8,67.0,17.0,10.2,70.0,15.7,0,Train,277,False,1371,4
1372,278,1011.6,28.2,25.5,22.8,23.6,76.0,47.0,9.4,90.0,19.0,1,Train,278,False,1372,4
1373,74,1011.4,29.1,25.8,23.1,19.3,70.0,32.0,10.6,10.0,9.9,0,Train,279,True,1373,4
1374,280,1010.8,22.7,20.9,19.3,18.0,79.0,88.0,0.6,50.0,18.6,1,Train,280,False,1374,4
1375,281,1011.4,27.5,22.8,22.5,19.4,76.0,85.0,2.5,70.0,32.4,1,Train,281,False,1375,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1378,284,1019.3,27.5,24.5,22.3,16.8,73.0,49.0,9.3,10.0,21.9,0,Train,284,False,1378,4
1379,285,1012.2,29.7,26.9,24.5,23.3,82.0,80.0,3.2,70.0,21.3,1,Train,285,False,1379,4
1380,7,1012.0,30.1,26.8,23.3,20.6,70.0,44.0,10.2,20.0,9.5,0,Train,286,True,1380,4
1381,287,1014.6,25.3,23.7,21.1,21.1,76.0,49.0,6.5,50.0,23.7,0,Train,287,False,1381,4
1382,57,1016.2,25.9,23.7,20.1,19.2,82.0,83.0,3.4,20.0,20.5,1,Train,288,True,1382,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1380,7,1012.0,30.1,26.8,23.3,20.6,70.0,44.0,10.2,20.0,9.5,0,Train,286,True,1380,4
1381,287,1014.6,25.3,23.7,21.1,21.1,76.0,49.0,6.5,50.0,23.7,0,Train,287,False,1381,4
1382,57,1016.2,25.9,23.7,20.1,19.2,82.0,83.0,3.4,20.0,20.5,1,Train,288,True,1382,4
1383,289,1017.1,28.4,25.8,25.5,21.6,82.0,83.0,3.0,80.0,24.4,0,Train,289,False,1383,4
1384,290,1016.0,25.4,23.2,21.5,18.6,70.0,69.0,6.4,50.0,28.8,0,Train,290,False,1384,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1386,292,1014.8,29.2,25.5,23.4,19.9,84.0,70.0,6.3,80.0,13.7,0,Train,292,False,1386,4
1387,293,1016.8,26.2,24.7,22.6,22.8,79.0,84.0,4.6,40.0,21.9,1,Train,293,False,1387,4
1388,75,1014.8,27.1,26.1,24.5,23.1,78.0,61.0,7.6,80.0,13.7,0,Train,294,True,1388,4
1389,295,1016.8,30.7,26.2,24.7,19.9,75.0,68.0,9.2,70.0,16.9,1,Train,295,False,1389,4
1390,296,1015.6,26.3,23.8,21.4,20.9,70.0,72.0,8.5,50.0,35.8,0,Train,296,False,1390,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1393,299,1015.6,28.1,25.5,23.4,21.6,87.0,88.0,0.3,80.0,22.9,1,Train,299,False,1393,4
1394,300,1014.7,29.6,26.5,23.8,19.6,90.0,85.0,0.5,70.0,24.9,1,Train,300,False,1394,4
1395,8,1013.7,26.3,24.4,23.3,19.9,91.0,74.0,3.1,70.0,38.0,0,Train,301,True,1395,4
1396,302,1016.5,29.4,25.8,22.8,16.0,79.0,88.0,0.3,70.0,9.5,0,Train,302,False,1396,4
1397,303,1016.8,26.7,24.8,23.6,19.4,84.0,83.0,0.1,70.0,8.9,1,Train,303,False,1397,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1398,304,1016.5,28.5,26.2,24.4,23.1,79.0,84.0,2.4,80.0,19.0,1,Train,304,False,1398,4
1399,305,1015.2,26.6,24.8,22.3,22.1,87.0,61.0,7.0,40.0,16.4,1,Train,305,False,1399,4
1400,266,1017.1,28.4,25.4,23.7,22.1,79.0,83.0,3.5,80.0,41.3,1,Train,306,True,1400,4
1401,307,1014.0,27.4,25.8,24.8,22.9,81.0,83.0,7.6,80.0,21.8,1,Train,307,False,1401,4
1402,308,1016.7,27.5,25.0,23.1,22.3,76.0,49.0,9.5,40.0,8.5,1,Train,308,False,1402,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1401,307,1014.0,27.4,25.8,24.8,22.9,81.0,83.0,7.6,80.0,21.8,1,Train,307,False,1401,4
1402,308,1016.7,27.5,25.0,23.1,22.3,76.0,49.0,9.5,40.0,8.5,1,Train,308,False,1402,4
1403,76,1016.7,26.7,23.1,22.3,21.1,81.0,83.0,6.9,40.0,16.6,1,Train,309,True,1403,4
1404,22,1014.2,21.3,20.5,20.5,19.9,91.0,88.0,0.0,70.0,46.3,1,Train,310,True,1404,4
1405,311,1019.4,21.3,20.9,20.1,19.0,91.0,78.0,0.0,70.0,43.1,1,Train,311,False,1405,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1402,308,1016.7,27.5,25.0,23.1,22.3,76.0,49.0,9.5,40.0,8.5,1,Train,308,False,1402,4
1403,76,1016.7,26.7,23.1,22.3,21.1,81.0,83.0,6.9,40.0,16.6,1,Train,309,True,1403,4
1404,22,1014.2,21.3,20.5,20.5,19.9,91.0,88.0,0.0,70.0,46.3,1,Train,310,True,1404,4
1405,311,1019.4,21.3,20.9,20.1,19.0,91.0,78.0,0.0,70.0,43.1,1,Train,311,False,1405,4
1406,135,1018.1,21.0,20.8,19.4,19.6,91.0,88.0,0.0,20.0,16.6,1,Train,312,True,1406,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1404,22,1014.2,21.3,20.5,20.5,19.9,91.0,88.0,0.0,70.0,46.3,1,Train,310,True,1404,4
1405,311,1019.4,21.3,20.9,20.1,19.0,91.0,78.0,0.0,70.0,43.1,1,Train,311,False,1405,4
1406,135,1018.1,21.0,20.8,19.4,19.6,91.0,88.0,0.0,20.0,16.6,1,Train,312,True,1406,4
1407,111,1017.8,21.0,20.0,19.1,19.6,78.0,87.0,6.5,50.0,21.3,1,Train,313,True,1407,4
1408,314,1017.1,20.4,18.4,17.4,15.1,79.0,88.0,0.0,40.0,37.5,0,Train,314,False,1408,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1405,311,1019.4,21.3,20.9,20.1,19.0,91.0,78.0,0.0,70.0,43.1,1,Train,311,False,1405,4
1406,135,1018.1,21.0,20.8,19.4,19.6,91.0,88.0,0.0,20.0,16.6,1,Train,312,True,1406,4
1407,111,1017.8,21.0,20.0,19.1,19.6,78.0,87.0,6.5,50.0,21.3,1,Train,313,True,1407,4
1408,314,1017.1,20.4,18.4,17.4,15.1,79.0,88.0,0.0,40.0,37.5,0,Train,314,False,1408,4
1409,112,1019.9,21.9,21.0,20.4,18.0,79.0,84.0,0.0,50.0,25.0,1,Train,315,True,1409,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1407,111,1017.8,21.0,20.0,19.1,19.6,78.0,87.0,6.5,50.0,21.3,1,Train,313,True,1407,4
1408,314,1017.1,20.4,18.4,17.4,15.1,79.0,88.0,0.0,40.0,37.5,0,Train,314,False,1408,4
1409,112,1019.9,21.9,21.0,20.4,18.0,79.0,84.0,0.0,50.0,25.0,1,Train,315,True,1409,4
1410,316,1018.7,24.3,20.8,19.2,16.2,78.0,46.0,7.7,40.0,9.1,1,Train,316,False,1410,4
1411,317,1018.0,21.0,18.4,16.1,15.5,79.0,89.0,0.6,40.0,12.7,1,Train,317,False,1411,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1412,318,1018.9,24.8,22.3,20.7,20.8,91.0,93.0,0.0,40.0,13.7,1,Train,318,False,1412,4
1413,319,1021.3,27.6,23.4,22.4,21.7,75.0,89.0,2.3,70.0,25.5,1,Train,319,False,1413,4
1414,267,1022.5,27.6,25.4,24.3,16.9,75.0,88.0,0.0,10.0,26.3,1,Train,320,True,1414,4
1415,321,1018.4,22.0,17.7,15.5,14.9,74.0,67.0,5.0,10.0,33.2,1,Train,321,False,1415,4
1416,99,1022.5,18.6,17.7,16.9,13.3,75.0,90.0,0.0,60.0,38.0,1,Train,322,True,1416,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1414,267,1022.5,27.6,25.4,24.3,16.9,75.0,88.0,0.0,10.0,26.3,1,Train,320,True,1414,4
1415,321,1018.4,22.0,17.7,15.5,14.9,74.0,67.0,5.0,10.0,33.2,1,Train,321,False,1415,4
1416,99,1022.5,18.6,17.7,16.9,13.3,75.0,90.0,0.0,60.0,38.0,1,Train,322,True,1416,4
1417,323,1018.9,19.3,17.3,16.1,12.9,78.0,95.0,0.0,60.0,37.9,1,Train,323,False,1417,4
1418,324,1019.9,20.1,20.1,18.0,17.5,93.0,88.0,0.0,25.0,15.3,1,Train,324,False,1418,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1418,324,1019.9,20.1,20.1,18.0,17.5,93.0,88.0,0.0,25.0,15.3,1,Train,324,False,1418,4
1419,325,1022.5,21.2,18.2,16.3,12.1,75.0,56.0,0.6,10.0,10.2,1,Train,325,False,1419,4
1420,9,1022.2,24.4,19.7,18.4,16.0,83.0,95.0,1.2,70.0,15.5,1,Train,326,True,1420,4
1421,327,1022.6,26.0,18.8,17.6,15.9,62.0,86.0,3.5,60.0,14.5,1,Train,327,False,1421,4
1422,328,1021.2,19.8,17.9,17.2,14.4,71.0,95.0,0.0,70.0,40.4,1,Train,328,False,1422,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1426,332,1019.5,16.2,15.2,14.6,13.9,90.0,92.0,0.0,70.0,43.1,1,Train,332,False,1426,4
1427,333,1016.8,20.9,19.3,17.6,15.5,95.0,92.0,0.0,60.0,44.7,1,Train,333,False,1427,4
1428,116,1022.6,17.2,16.2,15.2,13.1,79.0,88.0,0.0,70.0,43.1,1,Train,334,True,1428,4
1429,335,1024.3,21.3,19.4,19.4,13.9,91.0,87.0,0.4,50.0,24.4,1,Train,335,False,1429,4
1430,102,1016.8,16.4,15.3,15.3,15.2,86.0,88.0,0.1,50.0,10.9,1,Train,336,True,1430,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1428,116,1022.6,17.2,16.2,15.2,13.1,79.0,88.0,0.0,70.0,43.1,1,Train,334,True,1428,4
1429,335,1024.3,21.3,19.4,19.4,13.9,91.0,87.0,0.4,50.0,24.4,1,Train,335,False,1429,4
1430,102,1016.8,16.4,15.3,15.3,15.2,86.0,88.0,0.1,50.0,10.9,1,Train,336,True,1430,4
1431,337,1016.8,16.7,15.5,14.2,12.9,79.0,94.0,0.0,40.0,19.0,1,Train,337,False,1431,4
1432,338,1020.4,18.4,15.3,14.6,12.9,79.0,88.0,0.0,60.0,38.0,1,Train,338,False,1432,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1436,342,1020.6,20.8,18.4,16.2,7.4,67.0,88.0,0.0,40.0,27.0,1,Train,342,False,1436,4
1437,343,1024.4,17.2,15.2,13.3,9.8,75.0,87.0,2.0,20.0,20.8,1,Train,343,False,1437,4
1438,133,1021.3,17.2,14.9,12.8,11.8,90.0,88.0,0.1,50.0,27.4,1,Train,344,True,1438,4
1439,91,1023.8,21.6,20.4,19.9,17.5,74.0,79.0,1.7,40.0,21.3,1,Train,345,True,1439,4
1440,346,1023.8,19.9,15.5,12.7,8.4,67.0,72.0,7.0,30.0,19.0,0,Train,346,False,1440,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1437,343,1024.4,17.2,15.2,13.3,9.8,75.0,87.0,2.0,20.0,20.8,1,Train,343,False,1437,4
1438,133,1021.3,17.2,14.9,12.8,11.8,90.0,88.0,0.1,50.0,27.4,1,Train,344,True,1438,4
1439,91,1023.8,21.6,20.4,19.9,17.5,74.0,79.0,1.7,40.0,21.3,1,Train,345,True,1439,4
1440,346,1023.8,19.9,15.5,12.7,8.4,67.0,72.0,7.0,30.0,19.0,0,Train,346,False,1440,4
1441,347,1024.3,17.4,17.3,16.3,15.3,89.0,88.0,0.0,40.0,28.8,1,Train,347,False,1441,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1443,349,1020.9,17.8,16.4,15.3,12.8,76.0,91.0,0.0,20.0,42.0,0,Train,349,False,1443,4
1444,350,1022.6,21.6,18.8,16.8,13.1,92.0,82.0,0.1,50.0,23.9,1,Train,350,False,1444,4
1445,15,1020.2,17.3,15.3,14.9,13.1,84.0,88.0,0.0,30.0,28.3,1,Train,351,True,1445,4
1446,352,1019.9,20.7,19.3,17.1,13.3,67.0,86.0,0.6,70.0,33.0,1,Train,352,False,1446,4
1447,353,1018.1,20.6,18.8,17.2,14.7,75.0,82.0,2.4,20.0,24.9,0,Train,353,False,1447,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1450,356,1016.8,21.1,18.8,17.4,15.3,75.0,88.0,0.0,50.0,30.3,1,Train,356,False,1450,4
1451,357,1022.2,21.6,18.2,16.4,14.6,77.0,88.0,0.0,50.0,24.8,1,Train,357,False,1451,4
1452,140,1023.2,20.6,17.8,14.4,13.4,75.0,88.0,0.0,20.0,23.3,1,Train,358,True,1452,4
1453,82,1017.1,18.9,17.5,16.5,16.8,88.0,91.0,0.0,20.0,30.0,1,Train,359,True,1453,4
1454,360,1016.8,18.9,17.7,16.9,16.8,96.0,95.0,0.0,50.0,17.5,1,Train,360,False,1454,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1451,357,1022.2,21.6,18.2,16.4,14.6,77.0,88.0,0.0,50.0,24.8,1,Train,357,False,1451,4
1452,140,1023.2,20.6,17.8,14.4,13.4,75.0,88.0,0.0,20.0,23.3,1,Train,358,True,1452,4
1453,82,1017.1,18.9,17.5,16.5,16.8,88.0,91.0,0.0,20.0,30.0,1,Train,359,True,1453,4
1454,360,1016.8,18.9,17.7,16.9,16.8,96.0,95.0,0.0,50.0,17.5,1,Train,360,False,1454,4
1455,361,1016.4,16.3,16.6,15.9,15.0,91.0,96.0,0.0,70.0,37.0,1,Train,361,False,1455,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1455,361,1016.4,16.3,16.6,15.9,15.0,91.0,96.0,0.0,70.0,37.0,1,Train,361,False,1455,4
1456,362,1013.9,18.9,17.1,15.3,16.9,79.0,94.0,0.0,70.0,24.0,1,Train,362,False,1456,4
1457,144,1013.0,22.2,20.5,19.1,17.9,79.0,81.0,2.2,70.0,28.6,1,Train,363,True,1457,4
1458,17,1014.5,18.7,17.6,17.0,17.4,97.0,95.0,0.0,50.0,25.0,1,Train,364,True,1458,4
1459,78,1016.6,21.1,19.1,17.1,13.0,94.0,97.0,0.0,30.0,50.6,1,Train,365,True,1459,4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1456,362,1013.9,18.9,17.1,15.3,16.9,79.0,94.0,0.0,70.0,24.0,1,Train,362,False,1456,4
1457,144,1013.0,22.2,20.5,19.1,17.9,79.0,81.0,2.2,70.0,28.6,1,Train,363,True,1457,4
1458,17,1014.5,18.7,17.6,17.0,17.4,97.0,95.0,0.0,50.0,25.0,1,Train,364,True,1458,4
1459,78,1016.6,21.1,19.1,17.1,13.0,94.0,97.0,0.0,30.0,50.6,1,Train,365,True,1459,4
1460,1,1021.9,19.3,18.1,17.3,16.3,95.0,100.0,0.0,70.0,26.5,1,Train,1,False,1460,5


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall,source_data,expected_day,mislabeled_day,elapsed_days,year
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1457,144,1013.0,22.2,20.5,19.1,17.9,79.0,81.0,2.2,70.0,28.6,1,Train,363,True,1457,4
1458,17,1014.5,18.7,17.6,17.0,17.4,97.0,95.0,0.0,50.0,25.0,1,Train,364,True,1458,4
1459,78,1016.6,21.1,19.1,17.1,13.0,94.0,97.0,0.0,30.0,50.6,1,Train,365,True,1459,4
1460,1,1021.9,19.3,18.1,17.3,16.3,95.0,100.0,0.0,70.0,26.5,1,Train,1,False,1460,5
1461,2,1015.3,21.9,18.7,17.3,16.8,85.0,95.0,0.0,60.0,21.9,1,Train,2,False,1461,5


### Potentially mislabeled days
 * There is only one row in which the 'expected_day' is in the set of the 'day' values, so the rows with a non-consecutive 'day' are not simply in the wrong location.
 * When looking at 'temparature' (the dataset's spelling of temperature) and 'dewpoint' graphically, the trend is better maintained when 'expected_day' is used instead of 'day' for the rows where 'expected_day' and 'day' do not match.
 * We will use 'expected_day' during training and prediction.

In [193]:
# For every candidate in potential_corrections that is also found in
# original_and_corrected_days, show the candidate next to the value stored for it.
# NOTE(review): both names are defined in an earlier cell; presumably
# original_and_corrected_days is a dict keyed by the recorded 'day' -- confirm.
for pc in potential_corrections:
    if pc in original_and_corrected_days:
        # Bare f-string: shown as cell output, relying on ast_node_interactivity = 'all'
        f'{pc} {original_and_corrected_days[pc]}'
# Keys of original_and_corrected_days that also occur in potential_corrections
set(original_and_corrected_days.keys()).intersection(potential_corrections)

'116 [np.int64(334)]'

{np.int64(116)}

In [194]:
# Plot 'temparature' (the dataset's own spelling) against each candidate time axis.
# The 'day' and 'expected_day' plots are split into one facet per 'year';
# the 'elapsed_days' plot is drawn on a single panel.
for x_axis, facet_kwargs in (
    ('day', {'facet_col': 'year'}),
    ('expected_day', {'facet_col': 'year'}),
    ('elapsed_days', {}),
):
    fig = px.scatter(
        combined_df,
        x=x_axis,
        y='temparature',
        symbol='source_data',
        color='mislabeled_day',
        **facet_kwargs
    )
    fig.show()

In [195]:
# Plot 'dewpoint' against each candidate time axis.
# The 'day' and 'expected_day' plots are split into one facet per 'year';
# the 'elapsed_days' plot is drawn on a single panel.
for x_axis, facet_kwargs in (
    ('day', {'facet_col': 'year'}),
    ('expected_day', {'facet_col': 'year'}),
    ('elapsed_days', {}),
):
    fig = px.scatter(
        combined_df,
        x=x_axis,
        y='dewpoint',
        symbol='source_data',
        color='mislabeled_day',
        **facet_kwargs
    )
    fig.show()

## Train data

In [196]:
# First rows, dtypes / non-null counts, and summary statistics of the raw training data
raw_train_df.head()
raw_train_df.info()
raw_train_df.describe()
# Check for duplicate feature names: selects the columns whose label repeats an earlier one,
# so a shape of (n_rows, 0) means there are none
raw_train_df.loc[:, raw_train_df.columns.duplicated()].shape
# Check for missing values: number of NaN entries per column
raw_train_df.isna().sum()

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


<class 'pandas.core.frame.DataFrame'>
Index: 2190 entries, 0 to 2189
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   day            2190 non-null   int64  
 1   pressure       2190 non-null   float64
 2   maxtemp        2190 non-null   float64
 3   temparature    2190 non-null   float64
 4   mintemp        2190 non-null   float64
 5   dewpoint       2190 non-null   float64
 6   humidity       2190 non-null   float64
 7   cloud          2190 non-null   float64
 8   sunshine       2190 non-null   float64
 9   winddirection  2190 non-null   float64
 10  windspeed      2190 non-null   float64
 11  rainfall       2190 non-null   int64  
dtypes: float64(10), int64(2)
memory usage: 287.0 KB


Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
count,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0,2190.0
mean,179.948402,1013.602146,26.365799,23.953059,22.170091,20.454566,82.03653,75.721918,3.744429,104.863151,21.804703,0.753425
std,105.203592,5.655366,5.65433,5.22241,5.05912,5.288406,7.800654,18.026498,3.626327,80.002416,9.898659,0.431116
min,1.0,999.0,10.4,7.4,4.0,-0.3,39.0,2.0,0.0,10.0,4.4,0.0
25%,89.0,1008.6,21.3,19.3,17.7,16.8,77.0,69.0,0.4,40.0,14.125,1.0
50%,178.5,1013.0,27.8,25.5,23.85,22.15,82.0,83.0,2.4,70.0,20.5,1.0
75%,270.0,1017.775,31.2,28.4,26.4,25.0,88.0,88.0,6.8,200.0,27.9,1.0
max,365.0,1034.6,36.0,31.5,29.8,26.7,98.0,100.0,12.1,300.0,59.5,1.0


(2190, 0)

day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    0
windspeed        0
rainfall         0
dtype: int64

### Basic cleanup
 * Make sure the index has only unique values
 * Remove any leading and trailing whitespace in the column names
 * Check for duplicated rows and columns
 * Check for:
     * rows and columns that contain all null/missing values
     * features (columns) that are invariable

In [197]:
# Build the working frame from the raw data with leading and trailing whitespace
# removed from the column names. rename returns a new DataFrame, so raw_train_df
# itself is left untouched.
train_df = raw_train_df.rename(columns=str.strip)
# The index should contain only unique ids
train_df.index.is_unique
train_df.head()
train_df.tail()

True

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,1,1017.4,21.2,20.6,19.9,19.4,87.0,88.0,1.1,60.0,17.2,1
1,2,1019.5,16.2,16.9,15.8,15.4,95.0,91.0,0.0,50.0,21.9,1
2,3,1024.1,19.4,16.1,14.6,9.3,75.0,47.0,8.3,70.0,18.1,1
3,4,1013.4,18.1,17.8,16.9,16.8,95.0,95.0,0.0,60.0,35.6,1
4,5,1021.8,21.3,18.4,15.2,9.6,52.0,45.0,3.6,40.0,24.8,0


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed,rainfall
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2185,361,1014.6,23.2,20.6,19.1,19.9,97.0,88.0,0.1,40.0,22.1,1
2186,362,1012.4,17.2,17.3,16.3,15.3,91.0,88.0,0.0,50.0,35.3,1
2187,363,1013.3,19.0,16.3,14.3,12.6,79.0,79.0,5.0,40.0,32.9,1
2188,364,1022.3,16.4,15.2,13.8,14.7,92.0,93.0,0.1,40.0,18.0,1
2189,365,1013.8,21.2,19.1,18.0,18.0,89.0,88.0,1.0,70.0,48.0,1


In [198]:
# Shape before and after dropping rows and columns that are entirely null;
# identical shapes mean no such rows or columns exist
train_df.shape
train_df.dropna(axis='index', how='all').dropna(axis='columns', how='all').shape
# Are there fully duplicated rows? Are there repeated column names?
train_df.duplicated().any()
train_df.columns.duplicated().any()
# Show the name of every invariable column, i.e. one holding a single distinct non-null value
for col in train_df.columns:
    if train_df[col].nunique() == 1:
        col

(2190, 12)

(2190, 12)

np.False_

np.False_

## Test data
#### The same data cleaning and transforming done to the train data will be done to the test data

In [199]:
# First rows, dtypes / non-null counts, and summary statistics of the raw test data
raw_test_df.head()
raw_test_df.info()
raw_test_df.describe()
# Check for duplicate feature names: selects the columns whose label repeats an earlier one,
# so a shape of (n_rows, 0) means there are none
raw_test_df.loc[:, raw_test_df.columns.duplicated()].shape
# Check for missing values: number of NaN entries per column
raw_test_df.isna().sum()

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


<class 'pandas.core.frame.DataFrame'>
Index: 730 entries, 2190 to 2919
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   day            730 non-null    int64  
 1   pressure       730 non-null    float64
 2   maxtemp        730 non-null    float64
 3   temparature    730 non-null    float64
 4   mintemp        730 non-null    float64
 5   dewpoint       730 non-null    float64
 6   humidity       730 non-null    float64
 7   cloud          730 non-null    float64
 8   sunshine       730 non-null    float64
 9   winddirection  729 non-null    float64
 10  windspeed      730 non-null    float64
dtypes: float64(10), int64(1)
memory usage: 68.4 KB


Unnamed: 0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
count,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,730.0,729.0,730.0
mean,183.0,1013.503014,26.372466,23.963288,22.110274,20.460137,82.669863,76.360274,3.664384,103.923182,22.484247
std,105.438271,5.505871,5.672521,5.278098,5.170744,5.391169,7.818714,17.934121,3.639272,81.695458,9.954779
min,1.0,1000.0,7.4,5.9,4.2,-0.0,39.0,0.0,0.0,10.0,4.5
25%,92.0,1008.725,21.6,19.825,17.825,16.8,77.25,69.0,0.325,40.0,14.5
50%,183.0,1012.7,27.8,25.65,23.9,22.3,82.0,83.0,2.2,70.0,21.3
75%,274.0,1017.6,31.0,28.375,26.4,25.0,89.0,88.0,6.675,200.0,28.4
max,365.0,1032.2,35.8,31.8,29.1,26.7,98.0,100.0,11.8,300.0,59.5


(730, 0)

day              0
pressure         0
maxtemp          0
temparature      0
mintemp          0
dewpoint         0
humidity         0
cloud            0
sunshine         0
winddirection    1
windspeed        0
dtype: int64

In [200]:
# Build the working frame from the raw data with leading and trailing whitespace
# removed from the column names. rename returns a new DataFrame, so raw_test_df
# itself is left untouched.
test_df = raw_test_df.rename(columns=str.strip)
# The index should contain only unique ids
test_df.index.is_unique
test_df.head()
test_df.tail()
# Shape before and after dropping rows and columns that are entirely null;
# identical shapes mean no such rows or columns exist
test_df.shape
test_df.dropna(axis='index', how='all').dropna(axis='columns', how='all').shape
# Are there fully duplicated rows? Are there repeated column names?
test_df.duplicated().any()
test_df.columns.duplicated().any()
# Show the name of every invariable column, i.e. one holding a single distinct non-null value
for col in test_df.columns:
    if test_df[col].nunique() == 1:
        col

True

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2190,1,1019.5,17.5,15.8,12.7,14.9,96.0,99.0,0.0,50.0,24.3
2191,2,1016.5,17.5,16.5,15.8,15.1,97.0,99.0,0.0,50.0,35.3
2192,3,1023.9,11.2,10.4,9.4,8.9,86.0,96.0,0.0,40.0,16.9
2193,4,1022.9,20.6,17.3,15.2,9.5,75.0,45.0,7.1,20.0,50.6
2194,5,1022.2,16.1,13.8,6.4,4.3,68.0,49.0,9.2,20.0,19.4


Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2915,361,1020.8,18.2,17.6,16.1,13.7,96.0,95.0,0.0,20.0,34.3
2916,362,1011.7,23.2,18.1,16.0,16.0,78.0,80.0,1.6,40.0,25.2
2917,363,1022.7,21.0,18.5,17.0,15.5,92.0,96.0,0.0,50.0,21.9
2918,364,1014.4,21.0,20.0,19.7,19.8,94.0,93.0,0.0,50.0,39.5
2919,365,1020.9,22.2,18.8,17.0,13.3,79.0,89.0,0.2,60.0,50.6


(730, 11)

(730, 11)

np.False_

np.False_

In [201]:
# Show the rows surrounding every test row whose 'winddirection' is missing, so the
# gap can be judged against its neighbours.
test_nan_indexes = test_df.loc[test_df['winddirection'].isna()].index.tolist()
# Number of neighbouring ids to show on each side of a missing value
window_distance = 2
for i in test_nan_indexes:
    window = list(range(i - window_distance, i + window_distance + 1))
    # Select only ids that actually exist in the index. reindex would fabricate
    # all-NaN rows (and upcast integer columns to float) for ids outside the index
    # whenever the missing value sits near the first or last id.
    test_df.loc[test_df.index.isin(window)]

Unnamed: 0_level_0,day,pressure,maxtemp,temparature,mintemp,dewpoint,humidity,cloud,sunshine,winddirection,windspeed
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2705,151,1008.4,34.4,30.0,27.8,24.8,72.0,22.0,11.1,230.0,25.2
2706,152,1007.6,33.1,30.4,28.9,25.9,81.0,87.0,1.6,100.0,35.3
2707,153,1007.8,32.9,30.6,28.9,22.0,65.0,75.0,8.2,,17.2
2708,154,1008.6,33.7,30.5,28.2,25.0,89.0,83.0,3.9,220.0,11.7
2709,155,1009.5,33.9,30.2,27.5,25.8,74.0,30.0,10.8,220.0,15.8


## Sample submission

### Basic cleanup
#### The same data cleaning done to the train data will be done to the sample submission data

In [202]:
# Build the working frame from the raw data with leading and trailing whitespace
# removed from the column names. rename returns a new DataFrame, so
# raw_sample_submission_df itself is left untouched.
sample_submission_df = raw_sample_submission_df.rename(columns=str.strip)
# The index should contain only unique ids
sample_submission_df.index.is_unique
sample_submission_df.head()
# Number of missing values per column
sample_submission_df.isna().sum()

True

Unnamed: 0_level_0,rainfall
id,Unnamed: 1_level_1
2190,0
2191,0
2192,0
2193,0
2194,0


rainfall    0
dtype: int64

In [203]:
# Persist the cleaned frames. Create the output directory first: to_csv does not
# create missing parent directories, so a fresh checkout without data/processed
# would otherwise fail here with an OSError.
os.makedirs(processed_dir, exist_ok=True)
train_df.to_csv(os.path.join(processed_dir, 'train.csv'))
test_df.to_csv(os.path.join(processed_dir, 'test.csv'))
sample_submission_df.to_csv(os.path.join(processed_dir, 'sample_submission.csv'))

### Feature engineering thoughts for machine learning pipelines
 * Derived and engineered features will be generated in the machine learning pipelines.
    * Simple derived features, such as 'expected_day', 'elapsed_days', and 'year', will be generated by passing simple functions to apply.
    * More complicated engineered features, like 'season', will also be generated in the pipeline using something like K-means clustering.
 * Experiment with univariate, multivariate and/or KNN imputation for missing values.
    * There is only one missing value in the test data, so the method used for imputation will probably be inconsequential.
 * Experiment with standard scaling and min-max scaling for linear models.
 * Handle as much feature encoding as possible within the scikit-learn machine learning pipelines:
     * Nominal features: OneHotEncoder
     * Ordinal features: OrdinalEncoder