In [2]:
import os
import pandas as pd
import numpy as np

import warnings

In [3]:
# !pip install folium
import folium as fm

In [4]:
warnings.filterwarnings("ignore")

In [5]:
airmq_folder = r'D:\good_stuff\meteo_minsk\data_airmq\unedit_data'
rp5_folder = r'D:\good_stuff\meteo_minsk\data_rp5\unedit_data'
belhydromet_folder = r'D:\good_stuff\meteo_minsk\data_belhydromet\unedit_data'

sensors_location_path = r'D:\good_stuff\meteo_minsk\sensor_info\all_sensors_locations.csv' 

## Reading AirMQ data

In [9]:
def join_airmq_csv_data(pattern, data_folder, time_column='time', sensor_type='AirMQ'):
    directory = os.listdir(path=fr'{data_folder}')
    
    memory_size = 0
    
    for i in range (len(directory)):
        if pattern in directory[i]:
            print(directory[i])
            final_df = pd.read_csv(f'{data_folder}\\{directory[i]}', encoding='utf-8')
            
            first_index = i
            files_quantity = 1
            break
     
    
    for file in directory[first_index+1:]:
        if pattern in file:
            print(file)
            temp_df = pd.read_csv(f'{data_folder}\\{file}', encoding='utf-8')

                
            final_df = final_df.append(temp_df)
            files_quantity += 1
            memory_size += os.path.getsize(f'{data_folder}\\{file}')
            
    else:
        if sensor_type != '':
                final_df['sensor_type'] = sensor_type
                                                  
        final_df[time_column] = pd.to_datetime(final_df[time_column], errors='coerce')                                                                     
        final_df['year'] = final_df[time_column].dt.year
        final_df['month'] = final_df[time_column].dt.month
        final_df['day'] = final_df[time_column].dt.day 
        final_df = final_df.reset_index()
        final_df.drop(['index', 'city', 'name'], axis=1, inplace=True)
        
        print()
        print(f'Quantity of using files - {files_quantity}.')
        print(f'Total memory of all files - {memory_size/1000} KB.')
        print(f'Shape of final dataframe: {final_df.shape[0]} rows, {final_df.shape[1]} columns.')
        return final_df
        

In [10]:
%%time
# AirMQ_PM
airmq_pm_sensors = join_airmq_csv_data(pattern='_PM_', data_folder = airmq_folder, sensor_type='AirMQ')
airmq_pm_sensors

AirMQ_PM_2020-07.csv
AirMQ_PM_2020-08.csv
AirMQ_PM_2020-09.csv
AirMQ_PM_2020-10.csv
AirMQ_PM_2020-11.csv
AirMQ_PM_2020-12.csv
AirMQ_PM_2021-01.csv
AirMQ_PM_2021-02.csv
AirMQ_PM_2021-03.csv
AirMQ_PM_2021-04.csv
AirMQ_PM_2021-05.csv
AirMQ_PM_2021-06.csv
AirMQ_PM_2021-07.csv
AirMQ_PM_2021-08.csv
AirMQ_PM_2021-09.csv
AirMQ_PM_2021-10.csv
AirMQ_PM_2021-11.csv
AirMQ_PM_2021-12.csv
AirMQ_PM_2022-01.csv
AirMQ_PM_2022-02.csv
AirMQ_PM_2022-03.csv
AirMQ_PM_2022-04.csv
AirMQ_PM_2022-05.csv
AirMQ_PM_2022-06.csv

Quantity of using files - 24.
Total memory of all files - 452600.621 KB.
Shape of final dataframe: 4684094 rows, 11 columns.
CPU times: total: 29.5 s
Wall time: 29.9 s


Unnamed: 0,_id,latitude,longitude,time,PMS1,PMS25,PMS10,sensor_type,year,month,day
0,BY050010003,53.925516,27.489385,2020-06-30 21:00:00+00:00,0.0,0.0,0.0,AirMQ,2020,6,30
1,BY050010003,53.925516,27.489385,2020-06-30 21:10:00+00:00,0.0,0.0,0.0,AirMQ,2020,6,30
2,BY050010003,53.925516,27.489385,2020-06-30 21:20:00+00:00,0.0,0.0,0.0,AirMQ,2020,6,30
3,BY050010003,53.925516,27.489385,2020-06-30 21:30:00+00:00,0.0,0.0,0.0,AirMQ,2020,6,30
4,BY050010003,53.925516,27.489385,2020-06-30 21:40:00+00:00,0.0,0.0,0.0,AirMQ,2020,6,30
...,...,...,...,...,...,...,...,...,...,...,...
4684089,BY050360141,53.855469,27.701027,2022-06-30 20:20:00+00:00,0.0,0.0,0.0,AirMQ,2022,6,30
4684090,BY050360141,53.855469,27.701027,2022-06-30 20:30:00+00:00,0.0,0.0,0.0,AirMQ,2022,6,30
4684091,BY050360141,53.855469,27.701027,2022-06-30 20:40:00+00:00,0.0,0.0,0.0,AirMQ,2022,6,30
4684092,BY050360141,53.855469,27.701027,2022-06-30 20:50:00+00:00,0.0,0.0,0.0,AirMQ,2022,6,30


In [11]:
for column in airmq_pm_sensors.columns[4:7]:
    print(f'Min of {column} - {airmq_pm_sensors[airmq_pm_sensors[column] != 0][column].min()}')

Min of PMS1 - 0.01
Min of PMS25 - 0.01
Min of PMS10 - 0.01


In [12]:
%%time
# AirMQ_Rad
airmq_rad_sensors = join_airmq_csv_data(pattern='_Rad_',
                                        sensor_type='AirMQ',
                                        data_folder= airmq_folder)
airmq_rad_sensors

AirMQ_Rad_2022-01.csv
AirMQ_Rad_2022-02.csv
AirMQ_Rad_2022-03.csv
AirMQ_Rad_2022-04.csv
AirMQ_Rad_2022-05.csv
AirMQ_Rad_2022-06.csv

Quantity of using files - 6.
Total memory of all files - 87464.008 KB.
Shape of final dataframe: 1160331 rows, 9 columns.
CPU times: total: 6.47 s
Wall time: 6.51 s


Unnamed: 0,_id,latitude,longitude,time,Count,sensor_type,year,month,day
0,BY050010000,53.847298,27.497185,2021-12-31 21:00:00+00:00,0.0,AirMQ,2021,12,31
1,BY050010000,53.847298,27.497185,2021-12-31 21:10:00+00:00,0.0,AirMQ,2021,12,31
2,BY050010000,53.847298,27.497185,2021-12-31 21:20:00+00:00,0.0,AirMQ,2021,12,31
3,BY050010000,53.847298,27.497185,2021-12-31 21:30:00+00:00,0.0,AirMQ,2021,12,31
4,BY050010000,53.847298,27.497185,2021-12-31 21:40:00+00:00,0.0,AirMQ,2021,12,31
...,...,...,...,...,...,...,...,...,...
1160326,BY050360141,53.855469,27.701027,2022-06-30 20:20:00+00:00,0.0,AirMQ,2022,6,30
1160327,BY050360141,53.855469,27.701027,2022-06-30 20:30:00+00:00,0.0,AirMQ,2022,6,30
1160328,BY050360141,53.855469,27.701027,2022-06-30 20:40:00+00:00,0.0,AirMQ,2022,6,30
1160329,BY050360141,53.855469,27.701027,2022-06-30 20:50:00+00:00,0.0,AirMQ,2022,6,30


In [13]:
count_min = airmq_rad_sensors[airmq_rad_sensors['Count'] != 0]['Count'].min()
print(f"Min of AirMQ Count which don't equal zero - {count_min}")

Min of AirMQ Count which don't equal zero - 7.62


In [14]:
%%time
# AirMQ_T
airmq_t_sensors = join_airmq_csv_data(pattern='_T_', 
                                      sensor_type='AirMQ',
                                      data_folder=airmq_folder)
airmq_t_sensors.shape

AirMQ_T_2020-07.csv
AirMQ_T_2020-08.csv
AirMQ_T_2020-09.csv
AirMQ_T_2020-10.csv
AirMQ_T_2020-11.csv
AirMQ_T_2020-12.csv
AirMQ_T_2021-01.csv
AirMQ_T_2021-02.csv
AirMQ_T_2021-03.csv
AirMQ_T_2021-04.csv
AirMQ_T_2021-05.csv
AirMQ_T_2021-06.csv
AirMQ_T_2021-07.csv
AirMQ_T_2021-08.csv
AirMQ_T_2021-09.csv
AirMQ_T_2021-10.csv
AirMQ_T_2021-11.csv
AirMQ_T_2021-12.csv
AirMQ_T_2022-01.csv
AirMQ_T_2022-02.csv
AirMQ_T_2022-03.csv
AirMQ_T_2022-04.csv
AirMQ_T_2022-05.csv
AirMQ_T_2022-06.csv

Quantity of using files - 24.
Total memory of all files - 414189.303 KB.
Shape of final dataframe: 4618127 rows, 9 columns.
CPU times: total: 27.2 s
Wall time: 27.6 s


(4618127, 9)

In [15]:
min_temp = airmq_t_sensors[(airmq_t_sensors['Temp'] != 0)]['Temp'].min()
print(f'Min of AirMQ "Temp" - {min_temp}')

Min of AirMQ "Temp" - -211.73


## Reading rp5_data

In [16]:
def threshold_Nans(df, nans_treshold=0.99):
    valid_columns = []
    for col in df.columns:
        per_of_NaN = sum(df[col].isna()) / len(df[col])
        if per_of_NaN <= nans_treshold:
            valid_columns.append(col)
            
    return df[valid_columns]


def join_rp5_csv_data(pattern, 
                      sensors_location_file, 
                      data_folder, 
                      time_column='DateTime', 
                      sep=',',
                      threshold_Nans=None):
    
    sensors_info = pd.read_csv(sensors_location_file, encoding='utf-8')
    
    directory = os.listdir(path=fr'{data_folder}')
    
    memory_size = 0
    
    for i in range (len(directory)):
        if pattern in directory[i]:
            final_df = pd.read_csv(f'{data_folder}\\{directory[i]}', encoding='utf-8', sep=sep)
            final_df['_id'] = directory[i][-14:-7]
            
            first_index = i
            files_quantity = 1
            break
     
    
    for file in directory[first_index+1:]:
        if pattern in file:
            print(file)
            temp_df = pd.read_csv(f'{data_folder}\\{file}', encoding='utf-8', sep=sep)
            temp_df['_id'] = file[-14:-7]

                
            final_df = final_df.append(temp_df)
            files_quantity += 1
            memory_size += os.path.getsize(f'{data_folder}\\{file}')
            
            
    else:
        final_df = final_df[final_df['DateTime'] != 'DateTime']
        
        final_df = final_df.merge(sensors_info, 
                                  how='inner', 
                                  left_on='_id', 
                                  right_on='ID')
        
        if threshold_Nans != None:
            final_df = valid_columns(final_df)
                                                  
        final_df[time_column] = pd.to_datetime(final_df[time_column], errors='coerce')                                                                     
        final_df['year'] = final_df[time_column].dt.year
        final_df['month'] = final_df[time_column].dt.month
        final_df['day'] = final_df[time_column].dt.day 
        
        
        final_df = final_df.rename(columns={'DateTime':'time',
                                            'Latitude':'latitude',
                                            'Longitude':'longitude', 
                                            'Sensor_type':'sensor_type'})
    
        
        final_df = final_df.reset_index()
        final_df.drop(['index', 'ID', 'Address'], axis=1, inplace=True)
        
        print()
        print(f'Quantity of using files - {files_quantity}.')
        print(f'Total memory of all files - {memory_size/1000} KB.')
        print(f'Shape of final dataframe: {final_df.shape[0]} rows, {final_df.shape[1]} columns.')
        return final_df
        

In [17]:
%%time
rp5_th = join_rp5_csv_data(pattern='rp5_th_', 
                           data_folder=rp5_folder,
                           sensors_location_file=sensors_location_path,
                           sep=';')
rp5_th

rp5_th_data_268493u_v1.csv
rp5_th_data_268494u_v1.csv
rp5_th_data_268498u_v1.csv
rp5_th_data_268499u_v1.csv
rp5_th_data_268503u_v1.csv
rp5_th_data_268505u_v1.csv
rp5_th_data_268506u_v1.csv
rp5_th_data_268507u_v1.csv
rp5_th_data_268508u_v1.csv
rp5_th_data_268509u_v1.csv
rp5_th_data_268511u_v1.csv
rp5_th_data_268517u_v1.csv
rp5_th_data_268520u_v1.csv
rp5_th_data_268523u_v1.csv
rp5_th_data_268524u_v1.csv
rp5_th_data_268529u_v1.csv
rp5_th_data_268530u_v1.csv
rp5_th_data_268531u_v1.csv
rp5_th_data_268534u_v1.csv
rp5_th_data_268537u_v1.csv
rp5_th_data_268538u_v1.csv
rp5_th_data_268539u_v1.csv
rp5_th_data_268545u_v1.csv
rp5_th_data_268546u_v1.csv
rp5_th_data_268547u_v1.csv
rp5_th_data_268549u_v1.csv
rp5_th_data_268550u_v1.csv

Quantity of using files - 28.
Total memory of all files - 41248.266 KB.
Shape of final dataframe: 908508 rows, 12 columns.
CPU times: total: 5.78 s
Wall time: 5.84 s


Unnamed: 0,time,Temperature,RelHumidity,IsDaytime,SunElevationAngle,_id,latitude,longitude,sensor_type,year,month,day
0,2020-11-10 15:06:00,7.0,89,1,13.344760,268492u,53.8713,27.5525,rp5,2020,11,10
1,2020-11-10 15:16:00,6.9,89,1,12.529220,268492u,53.8713,27.5525,rp5,2020,11,10
2,2020-11-10 15:26:00,6.9,89,1,11.665290,268492u,53.8713,27.5525,rp5,2020,11,10
3,2020-11-10 15:36:00,7.0,89,1,10.754874,268492u,53.8713,27.5525,rp5,2020,11,10
4,2020-11-10 15:46:00,6.9,89,1,9.799903,268492u,53.8713,27.5525,rp5,2020,11,10
...,...,...,...,...,...,...,...,...,...,...,...,...
908503,2022-06-19 23:07:00,23.9,64,0,-8.168258,268550u,53.9487,27.7492,rp5,2022,6,19
908504,2022-06-19 23:18:00,23.8,64,0,-8.907200,268550u,53.9487,27.7492,rp5,2022,6,19
908505,2022-06-19 23:28:00,23.6,64,0,-9.525013,268550u,53.9487,27.7492,rp5,2022,6,19
908506,2022-06-19 23:38:00,23.5,65,0,-10.090017,268550u,53.9487,27.7492,rp5,2022,6,19


## Reading_belhydromet data

In [27]:
def filter_by_Nans(df, nans_treshold=0.99):
    valid_columns = []
    for col in df.columns:
        per_of_NaN = sum(df[col].isna()) / len(df[col])
        if per_of_NaN <= nans_treshold:
            valid_columns.append(col)
            
    return df[valid_columns]


def join_belhydromet_csv_data(pattern, 
                      sensors_location_file, 
                      data_folder, 
                      time_column='DateTime', 
                      sep=',',
                      threshold_Nans=None):
    
    sensors_info = pd.read_csv(sensors_location_file, encoding='utf-8')
    
    directory = os.listdir(path=fr'{data_folder}')
    
    memory_size = 0
    
    for i in range (len(directory)):
        if pattern in directory[i]:
            final_df = pd.read_csv(f'{data_folder}\\{directory[i]}', encoding='utf-8', sep=sep)
            final_df['_id'] = directory[i][-12:-7]
            
            first_index = i
            files_quantity = 1
            break
     
    
    for file in directory[first_index+1:]:
        if pattern in file:
            print(file)
            temp_df = pd.read_csv(f'{data_folder}\\{file}', encoding='utf-8', sep=sep)
            temp_df['_id'] = file[-12:-7]

                
            final_df = final_df.append(temp_df)
            files_quantity += 1
            memory_size += os.path.getsize(f'{data_folder}\\{file}')
            
            
    else:
        final_df = final_df[final_df['DateTime'] != 'DateTime']
        
        final_df = final_df.merge(sensors_info, 
                                  how='inner', 
                                  left_on='_id', 
                                  right_on='ID')
        
        if threshold_Nans != None:
            final_df = filter_by_Nans(final_df)
                                                  
        final_df[time_column] = pd.to_datetime(final_df[time_column], errors='coerce')                                                                     
        final_df['year'] = final_df[time_column].dt.year
        final_df['month'] = final_df[time_column].dt.month
        final_df['day'] = final_df[time_column].dt.day 
        
        
        final_df = final_df.rename(columns={'DateTime':'time',
                                            'Latitude':'latitude',
                                            'Longitude':'longitude', 
                                            'Sensor_type':'sensor_type'})
    
        
        final_df = final_df.reset_index()
        final_df.drop(['index', 'ID', 'Address'], axis=1, inplace=True)
        
        print()
        print(f'Quantity of using files - {files_quantity}.')
        print(f'Total memory of all files - {memory_size/1000} KB.')
        print(f'Shape of final dataframe: {final_df.shape[0]} rows, {final_df.shape[1]} columns.')
        return final_df
        

In [28]:
%%time
hydromet_misc = join_belhydromet_csv_data(pattern='hydromet_misc',
                                          sensors_location_file=sensors_location_path,
                                          data_folder=belhydromet_folder,
                                          sep=';' 
                                         )
hydromet_misc

hydromet_misc_data_26843_v1.csv
hydromet_misc_data_26850_v1.csv
hydromet_misc_data_26851_v1.csv
hydromet_misc_data_26852_v1.csv
hydromet_misc_data_26854_v1.csv
hydromet_misc_data_26856_v1.csv
hydromet_misc_data_26857_v1.csv
hydromet_misc_data_26858_v1.csv

Quantity of using files - 9.
Total memory of all files - 34673.702 KB.
Shape of final dataframe: 509618 rows, 33 columns.
CPU times: total: 8.09 s
Wall time: 8.08 s


Unnamed: 0,time,SnowHeight,SnowHeightInstant,Visibility1,Visibility10,CloudsHeight,CloudsHeightMin,Precip,Precip10,IntenPrecip,...,SolarRad,SolarRad10,SolarRad60,_id,latitude,longitude,sensor_type,year,month,day
0,2021-02-01 00:00:00,,,,,,,0.0,0.0,0.0,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
1,2021-02-01 00:10:00,,,,,,,0.0,0.0,0.0,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
2,2021-02-01 00:20:00,,,,,,,0.0,0.0,0.0,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
3,2021-02-01 00:30:00,,,,,,,0.0,0.0,0.0,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
4,2021-02-01 00:40:00,,,,,,,0.0,0.0,0.0,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509613,2022-04-07 06:10:00,,,,,,,0.1,0.0,0.0,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7
509614,2022-04-07 06:20:00,,,,,,,0.1,0.0,0.0,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7
509615,2022-04-07 06:30:00,,,,,,,0.1,0.0,0.0,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7
509616,2022-04-07 06:40:00,,,,,,,0.1,0.0,0.0,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7


In [29]:
hydromet_misc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509618 entries, 0 to 509617
Data columns (total 33 columns):
 #   Column             Non-Null Count   Dtype         
---  ------             --------------   -----         
 0   time               509618 non-null  datetime64[ns]
 1   SnowHeight         58011 non-null   object        
 2   SnowHeightInstant  54306 non-null   object        
 3   Visibility1        131138 non-null  object        
 4   Visibility10       131131 non-null  object        
 5   CloudsHeight       0 non-null       object        
 6   CloudsHeightMin    0 non-null       object        
 7   Precip             503718 non-null  object        
 8   Precip10           500844 non-null  object        
 9   IntenPrecip        377248 non-null  object        
 10  TempSoil3          83398 non-null   object        
 11  TempSoil3Min       0 non-null       object        
 12  TempSoil5          131573 non-null  object        
 13  TempSoil10         131630 non-null  object  

##### Usually, removing data (even NaN-data) isn't good practice. But this example with 5 absolutely empty columns seducts us to drop columns where percent of information < 0.01. This dataframe won't be used, it's only for example.

In [30]:
%%time
hydromet_misc_edit = join_belhydromet_csv_data(pattern='hydromet_misc',
                                          sensors_location_file=sensors_location_path,
                                          data_folder=belhydromet_folder,
                                          sep=';', threshold_Nans=0.0001 
                                         )
hydromet_misc_edit
# hydromet_misc_edit contains 28 instead of 33 

hydromet_misc_data_26843_v1.csv
hydromet_misc_data_26850_v1.csv
hydromet_misc_data_26851_v1.csv
hydromet_misc_data_26852_v1.csv
hydromet_misc_data_26854_v1.csv
hydromet_misc_data_26856_v1.csv
hydromet_misc_data_26857_v1.csv
hydromet_misc_data_26858_v1.csv

Quantity of using files - 9.
Total memory of all files - 34673.702 KB.
Shape of final dataframe: 509618 rows, 28 columns.
CPU times: total: 11.4 s
Wall time: 11.5 s


Unnamed: 0,time,SnowHeight,SnowHeightInstant,Visibility1,Visibility10,Precip,Precip10,IntenPrecip,TempSoil3,TempSoil5,...,SolarRad,SolarRad10,SolarRad60,_id,latitude,longitude,sensor_type,year,month,day
0,2021-02-01 00:00:00,,,,,0.0,0.0,0.0,,,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
1,2021-02-01 00:10:00,,,,,0.0,0.0,0.0,,,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
2,2021-02-01 00:20:00,,,,,0.0,0.0,0.0,,,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
3,2021-02-01 00:30:00,,,,,0.0,0.0,0.0,,,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
4,2021-02-01 00:40:00,,,,,0.0,0.0,0.0,,,...,,,,26842,53.884,27.442,Belhydromet,2021,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509613,2022-04-07 06:10:00,,,,,0.1,0.0,0.0,,,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7
509614,2022-04-07 06:20:00,,,,,0.1,0.0,0.0,,,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7
509615,2022-04-07 06:30:00,,,,,0.1,0.0,0.0,,,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7
509616,2022-04-07 06:40:00,,,,,0.1,0.0,0.0,,,...,,,,26858,53.896,27.644,Belhydromet,2022,4,7


In [31]:
%%time
hydromet_pth = join_belhydromet_csv_data(pattern='hydromet_pth',
                                          sensors_location_file=sensors_location_path,
                                          data_folder=belhydromet_folder,
                                          sep=';' 
                                         )


hydromet_pth_data_26843_v1.csv
hydromet_pth_data_26850_v1.csv
hydromet_pth_data_26851_v1.csv
hydromet_pth_data_26852_v1.csv
hydromet_pth_data_26854_v1.csv
hydromet_pth_data_26856_v1.csv
hydromet_pth_data_26857_v1.csv
hydromet_pth_data_26858_v1.csv

Quantity of using files - 9.
Total memory of all files - 43059.538 KB.
Shape of final dataframe: 509618 rows, 19 columns.
CPU times: total: 5.69 s
Wall time: 5.74 s


In [32]:
%%time
hydromet_wind = join_belhydromet_csv_data(pattern='hydromet_wind',
                                          sensors_location_file=sensors_location_path,
                                          data_folder=belhydromet_folder,
                                          sep=';' 
                                         )
hydromet_wind

hydromet_wind_data_26843_v1.csv
hydromet_wind_data_26850_v1.csv
hydromet_wind_data_26851_v1.csv
hydromet_wind_data_26852_v1.csv
hydromet_wind_data_26854_v1.csv
hydromet_wind_data_26856_v1.csv
hydromet_wind_data_26857_v1.csv
hydromet_wind_data_26858_v1.csv

Quantity of using files - 9.
Total memory of all files - 34519.471 KB.
Shape of final dataframe: 509618 rows, 16 columns.
CPU times: total: 5.02 s
Wall time: 5.08 s


Unnamed: 0,time,DirectW,DirectW2,SpeedW,SpeedW2,SpeedWMax,SpeedWInstant,IsDaytime,SunElevationAngle,_id,latitude,longitude,sensor_type,year,month,day
0,2021-02-01 00:00:00,210.0,190.0,2.0,1.9,4.9,0.5,0,-48.75916322790047,26842,53.884,27.442,Belhydromet,2021,2,1
1,2021-02-01 00:10:00,220.0,220.0,2.3,2.6,5.0,1.9,0,-47.85227479180017,26842,53.884,27.442,Belhydromet,2021,2,1
2,2021-02-01 00:20:00,225.0,225.0,2.2,2.2,4.3,1.6,0,-46.88050864084015,26842,53.884,27.442,Belhydromet,2021,2,1
3,2021-02-01 00:30:00,215.0,205.0,1.8,1.4,4.3,1.1,0,-45.84886943481753,26842,53.884,27.442,Belhydromet,2021,2,1
4,2021-02-01 00:40:00,205.0,200.0,1.5,1.8,3.4,2.2,0,-44.76222872645329,26842,53.884,27.442,Belhydromet,2021,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
509613,2022-04-07 06:10:00,135.0,135.0,2.5,2.4,4.9,2.4,1,22.646927003673134,26858,53.896,27.644,Belhydromet,2022,4,7
509614,2022-04-07 06:20:00,135.0,130.0,2.5,2.3,5.0,2.1,1,24.016997633251734,26858,53.896,27.644,Belhydromet,2022,4,7
509615,2022-04-07 06:30:00,130.0,125.0,2.6,3.1,4.3,2.8,1,25.364276095157795,26858,53.896,27.644,Belhydromet,2022,4,7
509616,2022-04-07 06:40:00,130.0,120.0,3.0,3.3,5.8,3.4,1,26.68616440273956,26858,53.896,27.644,Belhydromet,2022,4,7


## All Data map

In [33]:
def select_by_date(df, year, month, day):
    select_df= df.loc[(df['year'] == year) &
                      (df['month'] == month) &
                      (df['day'] == day)]
    return select_df


def exist_data(df):
   
    not_item_column = ['_id', 'latitude', 'longitude',
                       'time', 'sensor_type', 'year', 'month', 'day']
    
    if sum(df['sensor_type'] == 'AirMQ') > 0:
        df.loc[:, ~df.columns.isin(not_item_column)].replace(0, np.nan, inplace=True)
    
    nan_data_sum = df.loc[:, ~df.columns.isin(not_item_column)].isna().sum(axis = 1)  
    all_items = df.loc[:, ~df.columns.isin(not_item_column)].shape[1]
    df['per_row_nans'] = nan_data_sum / all_items
    valid_df = df[df['per_row_nans'] != 1]
    valid_df.drop('per_row_nans', axis = 1, inplace=True)
    
    return valid_df

def sensors_map_by_date(dataframes, year, month, day, zoom_center=[53.9009, 27.5577]):
    loc_center = zoom_center
    map1 = fm.Map(location = loc_center, 
                  tiles='Openstreetmap', 
                  zoom_start = 8, 
                  control_scale=True)
      
    for df in dataframes:
        df_by_date = select_by_date(df, year, month, day)
            
        exist_data_df = exist_data(df_by_date)

        for index, loc in exist_data_df.iterrows():
            if loc['sensor_type'] == 'Belhydromet':
                color = '#2EE80C'
            elif loc['sensor_type'] == 'rp5':
                color = '#FFF100'
            else:
                color = '#FFAB24'
            fm.CircleMarker([loc['latitude'], loc['longitude']],
                                radius=2, weight=5, 
                                popup= f"{loc['sensor_type']}",
                                color = color).add_to(map1)
    return map1

In [None]:
%%time
df_list = [airmq_pm_sensors, airmq_rad_sensors, airmq_t_sensors, rp5_th, hydromet_misc, hydromet_pth, hydromet_wind]
sensors_map_by_date(df_list, 2021, 2, 1)            

CPU times: total: 7.73 s
Wall time: 7.78 s


In [None]:
print ('Yes, this is dev branch')