# Temperature Data Preprocessing

In [1]:
import pandas as pd
import os
import numpy as np

## Extract ids and files for german stations

Let's extract station ids for german stations

In [391]:
df_stations = pd.read_csv('./data/ECA_blend_tg/info/stations.txt', skiprows=16)
not_german_stations_indices = df_stations[df_stations['CN']!='DE'].index
df_german_stations = df_stations.drop(index=not_german_stations_indices)

In [392]:
def strip_and_lower(string):
    return string.strip().lower()

In [393]:
df_german_stations.rename(strip_and_lower, axis='columns',inplace=True)
df_german_stations.columns

Index(['staid', 'staname', 'cn', 'lat', 'lon', 'hght'], dtype='object')

In [394]:
df_german_stations.rename(columns={'staid':'st_id', 'staname':'st_name'}, inplace=True)

In [395]:
df_german_stations['st_name'] = df_german_stations['st_name'].apply(strip_and_lower)
df_german_stations.drop(columns= ['cn','hght'], inplace=True)

In [396]:
def convert_id_to_filename(st_id):
    st_id = "{0:0=6d}".format(st_id)
    st_id = str(st_id)
    file_name = f'TG_STAID{st_id}.txt'
    return file_name

In [397]:
def degree_to_rad(x):
    l = x.split(':')
    x = int(l[0][1:])
    y = int(l[1])/60
    z = int(l[2])/3600
    output = (x + y + z) *np.pi / 180
    return output

In [398]:
df_german_stations['file'] = df_german_stations['st_id'].apply(convert_id_to_filename)
df_german_stations['lat_rad'] = df_german_stations['lat'].apply(degree_to_rad)
df_german_stations['lon_rad'] = df_german_stations['lon'].apply(degree_to_rad)
df_german_stations.tail()

Unnamed: 0,st_id,st_name,lat,lon,file,lat_rad,lon_rad
4111,11780,ulm-maehringen,+48:26:30,+009:55:17,TG_STAID011780.txt,0.845467,0.173161
4112,11781,hamburg (deutsche seewarte),+53:32:48,+009:58:17,TG_STAID011781.txt,0.934566,0.174034
4113,11782,kaufbeuren-oberbeuren,+47:52:32,+010:35:04,TG_STAID011782.txt,0.835586,0.184733
4114,11783,fronberg (schwandorf),+49:20:26,+012:07:58,TG_STAID011783.txt,0.861155,0.211757
4115,11784,pirmasens (klaeranlage),+49:12:42,+007:35:17,TG_STAID011784.txt,0.858906,0.132437


## Extract station ids for each capital of the 16 Bundesländer and find the best station (with the lowest nok measurements - sum of measurements proportion) for each one 

In [399]:
capitals = ['stuttgart', 'muenchen','berlin', 'potsdam','bremen', 'hamburg', 'wiesbaden', 'schwerin', 'hannover', 'dusseldorf',
            'mainz', 'saarbrucken', 'dresden', 'magdeburg', 'kiel', 'erfurt']

In [400]:
def clean(df) :    
    df.rename(strip_and_lower, axis='columns',inplace=True)
    df['date'] = pd.to_datetime(df['date'],format='%Y%m%d')
    df['tg'] = df['tg']*0.1
    return df

In [401]:
# extract and clean temperature data for a given station
def extract_and_clean(st_id):
    filename = convert_id_to_filename(st_id)
    file_path = f'./data/ECA_blend_tg/{filename}'
    df_station = pd.read_csv(file_path,skiprows=19)  
    df_station_cleaned = clean(df_station)
    df_station_cleaned.set_index('date',inplace=True)
    return df_station_cleaned

In [402]:
def return_number_of_measurement(st_id) :
    output = len(extract_and_clean(st_id))
    return output

In [403]:
def return_number_of_nok_measurements(st_id) :
    df_station = extract_and_clean(st_id)
    return (df_station['q_tg'] != 0).values.sum()

In [404]:
def get_all_stations_information(capital) :
    df_capital = df_german_stations[df_german_stations['st_name'].str.contains(capital)]
    df_capital['sum_measurements'] = df_capital['st_id'].apply(return_number_of_measurement)
    df_capital['nok_measurements'] = df_capital['st_id'].apply(return_number_of_nok_measurements)
    df_capital['nok_sum_proportion'] = df_capital['nok_measurements'] / df_capital['sum_measurements']
    df_capital.sort_values(by=['nok_sum_proportion'],inplace=True)    
    return df_capital

In [405]:
def get_best_station(capital) : 
    df_town = get_all_stations_information(capital)
    return df_town.iloc[0]['st_id']

In [406]:
capital_vs_best_station_id = {}
for capital in capitals:
    df_town = get_all_stations_information(capital)
    print(df_town.head())
    print('\n')
    capital_vs_best_station_id[capital] = df_town.iloc[0]['st_id']
capital_vs_best_station_id

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id                  st_name        lat         lon  \
980    2763   stuttgart/echterdingen  +48:41:21  +009:13:31   
1228   3506  stuttgart-schnarrenberg  +48:49:45  +009:12:03   
49       56                stuttgart  +48:46:13  +009:10:57   
2019   4671      stuttgart-hohenheim  +48:42:52  +009:12:37   
2017   4669    stuttgart (neckartal)  +48:47:26  +009:13:04   

                    file   lat_rad   lon_rad  sum_measurements  \
980   TG_STAID002763.txt  0.849786  0.161011             24623   
1228  TG_STAID003506.txt  0.852230  0.160585             22797   
49    TG_STAID000056.txt  0.851202  0.160265             43981   
2019  TG_STAID004671.txt  0.850227  0.160750             43981   
2017  TG_STAID004669.txt  0.851556  0.160881             27180   

      nok_measurements  nok_sum_proportion  
980                  0            0.000000  
1228                10            0.000439  
49                7518            0.170937  
2019              7518            0.170937  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id                   st_name        lat         lon  \
2050   4704       uberlingen/bodensee  +47:46:19  +009:09:25   
1923   4575            berlin-spandau  +52:34:05  +013:10:15   
1929   4581  berlin-tegeler fliesstal  +52:36:20  +013:17:48   
1904   4556        berlin-lichtenrade  +52:24:29  +013:24:51   
1375   4005              berlin-tegel  +52:33:56  +013:18:38   

                    file   lat_rad   lon_rad  sum_measurements  \
2050  TG_STAID004704.txt  0.833778  0.159819             27180   
1923  TG_STAID004575.txt  0.917486  0.229874             20971   
1929  TG_STAID004581.txt  0.918140  0.232071             23528   
1904  TG_STAID004556.txt  0.914693  0.234121             52747   
1375  TG_STAID004005.txt  0.917442  0.232313             52747   

      nok_measurements  nok_sum_proportion  
2050                 1            0.000037  
1923                 1            0.000048  
1929                 2            0.000085  
1904               195            0.00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id                st_name        lat         lon                file  \
35       42                 bremen  +53:02:47  +008:47:57  TG_STAID000042.txt   
2226   4884  bremen-seefahrtschule  +53:06:06  +008:47:03  TG_STAID004884.txt   

       lat_rad   lon_rad  sum_measurements  nok_measurements  \
35    0.925834  0.153574             47633               338   
2226  0.926799  0.153313             47633               338   

      nok_sum_proportion  
35              0.007096  
2226            0.007096  




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id                      st_name        lat         lon  \
40       47         hamburg fuhlsbuettel  +53:38:06  +009:59:24   
1542   4180   hamburg-botanischer garten  +53:33:41  +009:59:18   
1548   4186             hamburg-wandsbek  +53:35:12  +010:07:50   
4112  11781  hamburg (deutsche seewarte)  +53:32:48  +009:58:17   
1546   4184          hamburg-sankt pauli  +53:32:53  +009:58:13   

                    file   lat_rad   lon_rad  sum_measurements  \
40    TG_STAID000047.txt  0.936107  0.174358             47268   
1542  TG_STAID004180.txt  0.934823  0.174329             47268   
1548  TG_STAID004186.txt  0.935264  0.176812             47268   
4112  TG_STAID011781.txt  0.934566  0.174034             47268   
1546  TG_STAID004184.txt  0.934590  0.174014             30833   

      nok_measurements  nok_sum_proportion  
40                   2            0.000042  
1542                 2            0.000042  
1548                 2            0.000042  
4112                

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/i

    st_id   st_name        lat         lon                file   lat_rad  \
48     55  schwerin  +53:38:39  +011:23:18  TG_STAID000055.txt  0.936267   

     lon_rad  sum_measurements  nok_measurements  nok_sum_proportion  
48  0.198764             47633              2253            0.047299  


     st_id   st_name        lat         lon                file   lat_rad  \
343    476  hannover  +52:27:56  +009:40:46  TG_STAID000476.txt  0.915697   

      lon_rad  sum_measurements  nok_measurements  nok_sum_proportion  
343  0.168938             30833              1096            0.035546  




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id                 st_name        lat         lon  \
346     479              dusseldorf  +51:17:49  +006:46:12   
1399   4029  dusseldorf-sudfriedhof  +51:12:03  +006:45:29   

                    file   lat_rad   lon_rad  sum_measurements  \
346   TG_STAID000479.txt  0.895301  0.118159             31198   
1399  TG_STAID004029.txt  0.893623  0.117950             31198   

      nok_measurements  nok_sum_proportion  
346               3106            0.099558  
1399              3106            0.099558  


      st_id                  st_name        lat         lon  \
1735   4381  mainz-lerchenberg (zdf)  +49:58:05  +008:12:42   

                    file   lat_rad   lon_rad  sum_measurements  \
1735  TG_STAID004381.txt  0.872107  0.143321             26450   

      nok_measurements  nok_sum_proportion  
1735                40            0.001512  




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id                   st_name        lat         lon  \
1932   4584  saarbrucken-sankt johann  +49:13:27  +007:01:03   
2192   4850       saarbrucken-burbach  +49:14:30  +006:56:06   
358     491       saarbrucken/ensheim  +49:12:51  +007:06:29   

                    file   lat_rad   lon_rad  sum_measurements  \
1932  TG_STAID004584.txt  0.859124  0.122478             26450   
2192  TG_STAID004850.txt  0.859429  0.121039             26450   
358   TG_STAID000491.txt  0.858949  0.124059             25354   

      nok_measurements  nok_sum_proportion  
1932                 1            0.000038  
2192                 1            0.000038  
358                  1            0.000039  




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id             st_name        lat         lon                file  \
36       43   dresden wahnsdorf  +51:07:00  +013:40:59  TG_STAID000043.txt   
350     483   dresden-klotzsche  +51:07:45  +013:45:21  TG_STAID000483.txt   
1393   4023    dresden-strehlen  +51:01:31  +013:46:32  TG_STAID004023.txt   
1392   4022  dresden-hosterwitz  +51:01:24  +013:50:58  TG_STAID004022.txt   
4077  11744     dresden (mitte)  +51:03:20  +013:43:37  TG_STAID011744.txt   

       lat_rad   lon_rad  sum_measurements  nok_measurements  \
36    0.892154  0.238814             37772                 1   
350   0.892372  0.240085             37772                 1   
1393  0.890559  0.240429             30833              1429   
1392  0.890525  0.241718             30833              1430   
4077  0.891088  0.239580             70279             48872   

      nok_sum_proportion  
36              0.000026  
350             0.000026  
1393            0.046346  
1392            0.046379  
4077       

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

      st_id          st_name        lat         lon                file  \
1365   3995    kiel-holtenau  +54:22:32  +010:08:38  TG_STAID003995.txt   
1637   4281  kiel-kronshagen  +54:20:21  +010:05:39  TG_STAID004281.txt   
1368   3998  leuchtturm kiel  +54:30:01  +010:16:28  TG_STAID003998.txt   

       lat_rad   lon_rad  sum_measurements  nok_measurements  \
1365  0.949032  0.177044             29372               574   
1637  0.948397  0.176176             29372               574   
1368  0.951209  0.179323              8187               432   

      nok_sum_proportion  
1365            0.019542  
1637            0.019542  
1368            0.052767  


      st_id                     st_name        lat         lon  \
354     487         erfurt-bindersleben  +50:59:04  +010:57:47   
1884   4535  querfurt-muhle lodersleben  +51:23:26  +011:32:34   

                    file   lat_rad   lon_rad  sum_measurements  \
354   TG_STAID000487.txt  0.889846  0.191341             25354   
1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats 

{'stuttgart': 2763,
 'muenchen': 52,
 'berlin': 4704,
 'potsdam': 54,
 'bremen': 42,
 'hamburg': 47,
 'wiesbaden': 4764,
 'schwerin': 55,
 'hannover': 476,
 'dusseldorf': 479,
 'mainz': 4381,
 'saarbrucken': 4584,
 'dresden': 43,
 'magdeburg': 477,
 'kiel': 3995,
 'erfurt': 487}

In [407]:
df_german_stations_2 = df_german_stations.set_index('st_id') #set station_id as index to facilitate indexing with loc

In [408]:
df_german_stations_2.tail()

Unnamed: 0_level_0,st_name,lat,lon,file,lat_rad,lon_rad
st_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
11780,ulm-maehringen,+48:26:30,+009:55:17,TG_STAID011780.txt,0.845467,0.173161
11781,hamburg (deutsche seewarte),+53:32:48,+009:58:17,TG_STAID011781.txt,0.934566,0.174034
11782,kaufbeuren-oberbeuren,+47:52:32,+010:35:04,TG_STAID011782.txt,0.835586,0.184733
11783,fronberg (schwandorf),+49:20:26,+012:07:58,TG_STAID011783.txt,0.861155,0.211757
11784,pirmasens (klaeranlage),+49:12:42,+007:35:17,TG_STAID011784.txt,0.858906,0.132437


In [409]:
capital_best_st_ids = capital_vs_best_station_id.values()
df_land_capitals_best_stations = df_german_stations_2[df_german_stations_2.index.isin(list(capital_best_st_ids))]
df_land_capitals_best_stations

Unnamed: 0_level_0,st_name,lat,lon,file,lat_rad,lon_rad
st_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
42,bremen,+53:02:47,+008:47:57,TG_STAID000042.txt,0.925834,0.153574
43,dresden wahnsdorf,+51:07:00,+013:40:59,TG_STAID000043.txt,0.892154,0.238814
47,hamburg fuhlsbuettel,+53:38:06,+009:59:24,TG_STAID000047.txt,0.936107,0.174358
52,muenchen,+48:09:51,+011:32:39,TG_STAID000052.txt,0.840623,0.201484
54,potsdam,+52:23:00,+013:03:50,TG_STAID000054.txt,0.914262,0.228008
55,schwerin,+53:38:39,+011:23:18,TG_STAID000055.txt,0.936267,0.198764
476,hannover,+52:27:56,+009:40:46,TG_STAID000476.txt,0.915697,0.168938
477,magdeburg,+52:06:11,+011:35:03,TG_STAID000477.txt,0.90937,0.202182
479,dusseldorf,+51:17:49,+006:46:12,TG_STAID000479.txt,0.895301,0.118159
487,erfurt-bindersleben,+50:59:04,+010:57:47,TG_STAID000487.txt,0.889846,0.191341


In [410]:
capitals_files = df_land_capitals_best_stations['file'].tolist()

Extract and clean temperature data for a given station and for the last 20 years

In [412]:
def extract_and_clean(filename):
    file_path = f'./data/ECA_blend_tg/{filename}'
    df_station = pd.read_csv(file_path,skiprows=19)
    df_station.rename(strip_and_lower, axis='columns',inplace=True)
    df_station['date'] = pd.to_datetime(df_station['date'],format='%Y%m%d')
    df_station['tg'] = df_station['tg']*0.1
    df_station.set_index('date',inplace=True)
    df_station = df_station.loc['2000-06-01':]
    return df_station

Let's explore how many nok measurements we have for each station of the capitals. Quality label for nok measurements 'q_tg' is 9, and for ok measurements 0

In [413]:
for file in capitals_files:
    try: 
        print('station',extract_and_clean(file)['staid'][0],extract_and_clean(file)['q_tg'].value_counts()[9],'NOK')
    except KeyError :
        print ('station',extract_and_clean(file)['staid'][0],'all OK')
        pass

station 42 1 NOK
station 43 1 NOK
station 47 1 NOK
station 52 1 NOK
station 54 1 NOK
station 55 all OK
station 476 1 NOK
station 477 1 NOK
station 479 all OK
station 487 1 NOK
station 2763 all OK
station 3995 145 NOK
station 4381 32 NOK
station 4584 1 NOK
station 4704 1 NOK
station 4764 1 NOK


## Impute nok measurements for a single station with measurements from the next station

Let's define the Haversine calculation (great-circle distance between two positions on the earth)

In [322]:
def get_distance(lat1, lon1, lat2, lon2) :
    R = 6371
    output = 2*R*np.arcsin(np.sqrt((np.sin((lat2 -lat1)/2))**2 + np.cos(lat1)*np.cos(lat2)*(np.sin((lon2 -lon1)/2))**2))
    return output

In [365]:
def find_next_station(STAID,df_search) :
    
    next_staid = df_search.index[0]    
    
    lat1 = df_german_stations_2.loc[STAID,'lat_rad']
    lon1 = df_german_stations_2.loc[STAID,'lon_rad']
    
    lat2 = df_search.loc[next_staid,'lat_rad']
    lon2 = df_search.loc[next_staid,'lon_rad']
    
    minimal_distance = get_distance(lat1, lon1,lat2, lon2)
    
    for search_index in df_search.index[1:] :
        
        lat2 = df_search.loc[search_index,'lat_rad']
        lon2 = df_search.loc[search_index,'lon_rad']
        
        distance = get_distance(lat1, lon1,lat2, lon2)
        
        if distance < minimal_distance :
            minimal_distance = distance
            next_staid = search_index
            
    return next_staid       

In [379]:
def find_next_available_measurement(staid,date):
    
    df_search = df_german_stations_2.drop(index=staid)
    
    next_staid = find_next_station(staid,df_search)
    
    next_staid_file = df_german_stations_2.loc[next_staid,'file']
    next_staid_df = extract_and_clean(next_staid_file)
    next_staid_q_tg = next_staid_df.loc[date,'q_tg']
    
    while next_staid_q_tg == 9:
        
        df_search = df_search.drop(index=next_staid)
        
        next_staid = find_next_station(next_staid,df_search)

        
        next_staid_file = df_german_stations_2.loc[next_staid,'file']
        next_staid_df = extract_and_clean(next_staid_file)
        next_staid_q_tg = next_staid_df.loc[date,'q_tg']

    return next_staid_df.loc[date,'tg']

In [384]:
def impute_nok_temperatures(df):
    
    staid = df['staid'][0]
    nok_indices = df[df['q_tg'] == 9].index
    
    for i in nok_indices:
        df.loc[i,'tg'] = find_next_available_measurement(staid,i)
        df.loc[i,'imputed'] = 1
        
    return df

Let's test the imputation function on the worst case station 3995 which has 145 nok measurements

In [390]:
test_df = extract_and_clean(capitals_files[11])
(test_df['q_tg']==9).sum()

145

In [389]:
test_df_imputed = impute_nok_temperatures(test_df)
test_df_imputed['imputed'].sum()

145.0

## Another method to impute nok measurements: impute with shifted values

In [53]:
def impute_nok_temperatures_2(df) : 
    period_values = [1, 7, 365, 730, 1095, 1460, 1825 ]
    p = 0
    nok_indices = df[df['q_tg'] == 9].index
    nok_sum = (df.loc[nok_indices,'q_tg'] ==9).sum()    
    while nok_sum >0 :        
        period_value = period_values[p]
        df.loc[nok_indices,'tg'] = df.shift(periods=period_value).loc[nok_indices,'tg']
        df.loc[nok_indices,'q_tg'] = df.shift(periods=period_value).loc[nok_indices,'q_tg']
        nok_indices = df[df['q_tg'] == 9].index
        nok_sum = (df.loc[nok_indices,'q_tg'] ==9).sum()
        p += 1
    return df