# Interpolating missing values for certain variables

In [10]:
# essential imports for the notebook
import os
import numpy as np
import pandas as pd
import xarray as xr

from cartopy import config
import cartopy.feature as cfeature
import cartopy.crs as ccrs
from cartopy.mpl.geoaxes import GeoAxes
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import AxesGrid
from mpl_toolkits.axes_grid1 import ImageGrid
import plotly.express as px

from cartopy.mpl.ticker import LongitudeFormatter, LatitudeFormatter
import matplotlib.pyplot as plt
import utils
# import scipy.cluster.hierarchy as sch

In [2]:
# Locations of the training CSV files, relative to the notebook's working directory.
path_X_station_train = "./data/Train/X_station_train.csv"
# Fixed: this was "data./Train/Y_train.csv" (misplaced dot). That spelling names a
# directory called "data." and only resolves on platforms that strip trailing dots
# from path components, so the notebook failed to load Y_train elsewhere.
path_Y_train = "./data/Train/Y_train.csv"

# Station observations (features) and the ground-truth targets.
X_station_train = pd.read_csv(path_X_station_train, sep=",")
Y_train = pd.read_csv(path_Y_train, sep=",")

In [3]:
# Print the (rows, columns) shape of the station observations, then let the
# notebook render the leading rows as the cell's output.
print(X_station_train.shape)
X_station_train.head()

(4409474, 9)


Unnamed: 0,number_sta,date,ff,t,td,hu,dd,precip,Id
0,14066001,2016-01-01 00:00:00,3.05,279.28,277.97,91.4,200.0,0.0,14066001_0_0
1,14066001,2016-01-01 01:00:00,2.57,278.76,277.45,91.4,190.0,0.0,14066001_0_1
2,14066001,2016-01-01 02:00:00,2.26,278.27,277.02,91.7,181.0,0.0,14066001_0_2
3,14066001,2016-01-01 03:00:00,2.62,277.98,276.95,93.0,159.0,0.0,14066001_0_3
4,14066001,2016-01-01 04:00:00,2.99,277.32,276.72,95.9,171.0,0.0,14066001_0_4


In [4]:
# how many different station identifiers appear in the training observations
number_of_stations = X_station_train["number_sta"].nunique()

In [5]:
# Print the (rows, columns) shape of the target table, then let the notebook
# render the leading rows as the cell's output.
print(Y_train.shape)
Y_train.head()

(183747, 4)


Unnamed: 0,date,number_sta,Ground_truth,Id
0,2016-01-02,14066001,3.4,14066001_0
1,2016-01-02,14126001,0.5,14126001_0
2,2016-01-02,14137001,3.4,14137001_0
3,2016-01-02,14216001,4.0,14216001_0
4,2016-01-02,14296001,13.3,14296001_0


In [6]:
# extract every column of the station table as a standalone array of values
variable_list = X_station_train.columns.values
m = len(variable_list)
arrays = [X_station_train[column_name].values for column_name in variable_list]

# one array per variable, unpacked in the dataframe's column order
number_sta, date, ff, t, td, hu, dd, precip, Id = arrays

In [7]:
# load the per-station coordinate table and preview its leading rows
station_coords = pd.read_csv(filepath_or_buffer='./data/Other/stations_coordinates.csv')
station_coords.head(n=5)

Unnamed: 0,number_sta,lat,lon,height_sta
0,86118001,46.477,0.985,120.0
1,86149001,46.917,0.025,60.0
2,56081003,48.05,-3.66,165.0
3,53215001,47.79,-0.71,63.0
4,22135001,48.55,-3.38,148.0


In [8]:
# draw the station locations on a map
def plot_stations(station_coords):
    """Build a geographic scatter plot with one marker per station.

    Args:
        station_coords (data_frame): table providing the 'lat' and 'lon'
            columns used to place the markers and the 'number_sta' column
            used as the hover label.

    Returns:
        The figure created by ``px.scatter_geo``, titled 'Stations'. It is
        returned to the caller, not displayed by this function.
    """
    scatter_options = {
        'lat': 'lat',
        'lon': 'lon',
        'hover_name': 'number_sta',
    }
    stations_map = px.scatter_geo(station_coords, **scatter_options)
    stations_map.update_layout(title='Stations')
    return stations_map

In [11]:
# Build the station map; the figure is only assigned here, so this cell
# renders nothing by itself.
fig = plot_stations(station_coords)

In [12]:
def k_closest_stations(number_station, k, coords=None):
    """Return the ids of the k stations closest to the given station.

    Distances are plain Euclidean distances computed directly on the
    latitude/longitude values, so they are expressed in degrees and are not
    great-circle distances.
    NOTE(review): one degree of longitude covers less ground than one degree
    of latitude away from the equator; confirm this approximation is
    acceptable for the interpolation use case.

    Args:
        number_station (integer): number of the reference station
        k (integer): number of closest stations to return
        coords (dataframe, optional): table with 'number_sta', 'lat' and
            'lon' columns. Defaults to the module-level ``station_coords``.
    Returns:
        closest_stations (numpy array of int): ids of the closest stations,
        nearest first. Holds fewer than k ids when fewer other stations
        exist, and is empty when there are none.
    Raises:
        IndexError: if ``number_station`` is not present in the table.
    """
    if coords is None:
        # fall back to the coordinate table loaded earlier in the notebook
        coords = station_coords

    station_ids = coords['number_sta'].to_numpy()
    lats = coords['lat'].to_numpy()
    lons = coords['lon'].to_numpy()

    # locate the reference station; every row carrying its id is excluded
    # from the candidates below
    is_reference = station_ids == number_station
    if not is_reference.any():
        raise IndexError(
            f"station {number_station} not found in the station coordinates"
        )
    station_lat = lats[is_reference][0]
    station_lon = lons[is_reference][0]

    # distance from the reference station to every other station, computed
    # in one vectorised pass instead of a row-by-row Python loop
    is_candidate = ~is_reference
    distances = np.sqrt(
        (station_lat - lats[is_candidate]) ** 2
        + (station_lon - lons[is_candidate]) ** 2
    )

    # a stable sort keeps equidistant stations in table order, which makes
    # the result deterministic; an empty candidate set yields an empty array
    nearest_first = np.argsort(distances, kind="stable")
    closest_stations = station_ids[is_candidate][nearest_first[:k]].astype(int)
    return closest_stations

In [13]:
# HAS BEEN TESTED, WORKS AS INTENDED
def get_values_for_station(station_number, year, month, day, data=None):
    """Get the rows of a station's observations that match a given date.

    The table must already contain integer-comparable 'year', 'month' and
    'day' columns alongside 'number_sta'; create them before calling.

    Args:
        station_number (integer): number of the station
        year, month, day (integer): year, month and day of the date
        data (dataframe, optional): table to filter. Defaults to the
            module-level ``X_train``.
            NOTE(review): the cells shown in this notebook load the data as
            ``X_station_train``; confirm that ``X_train`` is defined (with the
            date columns added) before relying on the default.
    Returns:
        df (dataframe): the rows of ``data`` for that station on that date,
        with all their columns; empty when nothing matches
    """
    if data is None:
        # fall back to the notebook-level table, as the function always did
        data = X_train

    matches_request = (
        (data['number_sta'] == station_number)
        & (data['year'] == year)
        & (data['month'] == month)
        & (data['day'] == day)
    )
    df = data[matches_request]
    return df

## TODO:
- [ ] create a function that checks for a given station the values of the variables over one day, returns a boolean that determines whether there are any missing values (can use the previous function's return value)
- [ ] create a function that interpolates the missing values for a given station using the k nearest neighbors (k_closest_stations)
- [ ] create a function that interpolates the missing values for all stations using the k nearest neighbors (k_closest_stations)
