In [40]:
import plotly.express as px
import numpy as np
import pandas as pd

### Analyse préléminaire

1. Importer les données
2. Afficher une description des données (nombre de lignes, nombre de colonnes, type des colonnes, ...)

In [41]:
measurements = pd.read_csv("../data/original_data/station_measurements.csv")
measurements.drop(
    columns=["Unnamed: 0", "dir"], inplace=True
)  # drop index and dir columns
measurements.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,296,297,298,299,300,301,vague,Point,lat,lon
0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-96.0,-100.0,-100.0,-82.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
1,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
2,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
3,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
4,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338


In [42]:
measurements.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,295,296,297,298,299,300,301,vague,lat,lon
count,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,...,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0,11050.0
mean,-99.934932,-99.930769,-99.968597,-100.0,-99.998281,-99.940905,-98.799729,-99.34724,-99.878371,-98.271131,...,-100.0,-100.0,-100.0,-99.976109,-100.0,-100.0,-100.0,0.058824,48.844563,2.373412
std,0.961764,0.954008,0.593514,0.0,0.136201,0.967216,4.537031,3.333414,1.307211,5.610778,...,0.0,0.0,0.0,0.512288,0.0,0.0,0.0,0.235305,0.000113,0.000173
min,-100.0,-101.0,-100.0,-100.0,-100.0,-100.0,-101.0,-100.0,-101.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.0,48.84434,2.37309
25%,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.0,48.84447,2.37326
50%,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.0,48.84456,2.37342
75%,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0.0,48.84465,2.37356
max,-68.0,-65.0,-76.0,-100.0,-87.0,-68.0,-54.0,-52.0,-65.0,-52.0,...,-100.0,-100.0,-100.0,-77.0,-100.0,-100.0,-100.0,1.0,48.84476,2.37374


In [43]:
data_description = measurements.describe()

In [44]:
data_description.iloc[1][1:-3]

1      -99.930769
2      -99.968597
3     -100.000000
4      -99.998281
5      -99.940905
          ...    
297   -100.000000
298    -99.976109
299   -100.000000
300   -100.000000
301   -100.000000
Name: mean, Length: 301, dtype: float64

### Analyse des données

Regardons premièrement la moyenne des RSSI par beacon afin d'éliminer les beacons qui ne sont pas assez proches de la zone d'intérêt, ou qui ne transmettent aucune information (RSSI = -100.0).

In [45]:
fig = px.line(data_description.iloc[1][1:-3],
              title="Mean values of measurements")
fig.show()


Afin de pouvoir éliminer les beacons qui ne consitituent que du bruit, nous utilisons la standard deviation des RSSI par beacon.

In [46]:
fig = px.line(
    data_description.iloc[2][1:-3], title="Standard deviation of measurements"
)
fig.show()


Élaguons le dataset :

In [47]:
std_values = data_description.iloc[2][1:-3]
measurements.drop(columns=std_values[std_values == 0.0].index, inplace=True)
measurements.shape  # reduced from 301 to 124 columns -> 120 sensors left.


(11050, 124)

In [48]:
fig = px.line(
    measurements.describe().iloc[1][1:-3], title="Mean values of measurements"
)
fig.show()


Maintenant, nous pouvons regarder la distribution spatiale des beacons. Pour ce faire, importons les données relatives aux beacons, et une petite transformations nous permet de récupérer les longitudes et latitudes des beacons.

In [49]:
# import beacons positions from the xlsx file
beacons_raw = pd.read_excel("../data/original_data/beacons_pos.xlsx", index_col=0)
beacons_raw = beacons_raw.rename(columns={"Unnamed: 1": "data"})

beacon_values = []
for line in beacons_raw["data"]:
    # remove the first and last curly brackets
    line = line[1:-2]
    line = line.replace('"', "")
    lon, lat, alt, label, mac_address = line.split(",")
    lon = float(lon[3:-1])
    lat = float(lat.split(":")[1])
    alt = float(alt.split(":")[1])
    label = str(label.split(":")[1])

    beacon_values += [[lon, lat, alt, label]]

beacons = pd.DataFrame(beacon_values, columns=["lon", "lat", "alt", "label"])
beacons.shape


(302, 4)

In [50]:
beacons.head()

Unnamed: 0,lon,lat,alt,label
0,2.375836,48.843473,0.0,MD7UHV2
1,2.376041,48.843578,0.0,2MNSKM1
2,2.376325,48.843404,0.0,NPG0662
3,2.37602,48.844217,0.0,VQEBGW1
4,2.375881,48.844107,0.0,J60TU70


In [51]:
# Selet one point, and link it to the beacons that have established a connection with it.
# The point is selected by its index in the measurements dataframe.
point_index = 0
point = measurements.iloc[point_index]
point

0          -100.0
1          -100.0
2          -100.0
4          -100.0
5          -100.0
           ...   
298        -100.0
vague           0
Point          R6
lat      48.84473
lon       2.37338
Name: 0, Length: 124, dtype: object

Nous pouvons déjà séléctionner une mesure afin de regarder la distribution spatiale des beacons par rapport à cette mesure.

In [52]:
point = point[point != -100.0]
beacons_connected = point.index[:-4]
beacons_connected

Index(['6', '9', '10', '61', '62', '63', '66', '68', '69', '70', '71', '96',
       '121', '124', '127', '128', '130', '131', '140', '144', '150', '155',
       '203', '205', '206', '231', '250', '271'],
      dtype='object')

In [53]:
beacons_connected_pos = beacons.iloc[beacons_connected][["lon", "lat"]]


In [54]:
import plotly.express as px

# make the plot square
fig = px.scatter(
    measurements[["lat", "lon"]],
    x="lon",
    y="lat",
    title="Beacons and point positions, with connections obtained in the first measurement",
)
fig.add_scatter(
    x=beacons["lon"],
    y=beacons["lat"],
    mode="markers",
    marker=dict(color="red", size=5, symbol="x"),
    name="Beacons",
    hovertext=beacons["label"],
)
fig.add_scatter(
    x=beacons_connected_pos["lon"],
    y=beacons_connected_pos["lat"],
    mode="markers",
    marker=dict(color="green", size=5, symbol="x"),
    name="Beacons connected",
    hovertext=beacons_connected_pos.index,
)
fig.add_scatter(
    x=[point["lon"]],
    y=[point["lat"]],
    mode="markers",
    marker=dict(color="blue", size=5, symbol="x"),
    name="Point",
    hovertext=point_index,
)
# add a line between the point and the beacons connected to it
# try to set all the lines to one object, to make it easier to remove them
for beacon in beacons_connected_pos.index:
    fig.add_scatter(
        x=[point["lon"], beacons_connected_pos.loc[beacon]["lon"]],
        y=[point["lat"], beacons_connected_pos.loc[beacon]["lat"]],
        mode="lines",
        line=dict(color="black", width=1),
        name="Connection",
    )

fig.update_layout(width=800 * 1.34, height=800)
fig.show()


En utilisant la formule de harvesine, nous pouvons calculer la distance entre chaque beacon connecté et la mesure. Nous pouvons ensuite regarder la distribution des distances par beacon, ce qui nous permet d'étudier une possible corrélation entre la distance et le RSSI.

In [55]:
# using the harvesine formula to calculate the distance between two points, given their coordinates,
# plot the distance between the point and the beacons connected to it
import math

def haversine(lon1, lat1, lon2, lat2):
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(math.radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = math.sin(dlat / 2) ** 2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2) ** 2
    c = 2 * math.asin(math.sqrt(a))

    # Radius of earth in kilometers is 6371
    m = 6371 * c * 1000
    return m

distances = []
for beacon in beacons_connected_pos.index:
    distances += [
        haversine(
            point["lon"],
            point["lat"],
            beacons_connected_pos.loc[beacon]["lon"],
            beacons_connected_pos.loc[beacon]["lat"],
        )
    ]

distances[:5]


[9.05928776107142,
 14.876060906977049,
 45.876760165110426,
 16.655408491925517,
 25.713041793852618,
 31.630550894950282,
 10.610722571213774,
 23.795376012803093,
 13.957031742417277,
 35.43679826364865,
 11.751455178941896,
 60.34833332787999,
 56.05181729167326,
 52.66411183390688,
 31.951268302907053,
 26.236190030123808,
 26.74107358057184,
 19.61268732447374,
 178.29293270782122,
 95.31884146029739,
 45.109867651885864,
 127.71830367049097,
 33.05876625925295,
 37.56070175012503,
 41.90366686453676,
 78.38433787492826,
 51.30890166865782,
 24.53864075025448]

In [56]:
# get the RSSI values of the beacons connected to the point in the first measurement (point_index)
rssi_values = point[beacons_connected]

# plot the rssi_values against the distances
fig = px.scatter(
    x=distances,
    y=rssi_values,
    title="RSSI values of the beacons connected to the point in the first measurement",
    labels={"x": "Distance (m)", "y": "RSSI value (dBm)"},
)
fig.show()


In [57]:
# for each measurement in the dataset
for index, point in measurements.iterrows():
    # filter out -100.0 RSSI values (no signal) and get only the connected beacons
    point = point[point != -100.0]

    # convert the beacon labels to integers
    beacons_connected = [
        int(label) for label in point.index[:-4]
    ] # the last 4 columns are not beacons

    # get positions of connected beacons
    beacons_connected_pos = beacons.loc[beacons_connected][["lon", "lat"]]

    # calculate distances and get RSSI values
    distances, rssi_values = [], []
    for b, beacon in enumerate(beacons_connected_pos.index):
        distances.append(
            haversine(
                point["lon"],
                point["lat"],
                beacons_connected_pos.loc[beacon]["lon"],
                beacons_connected_pos.loc[beacon]["lat"],
            )
        )

        # add the RSSI value of the beacon to the list
        rssi_values.append(point[b])

In [58]:
import pandas as pd
import numpy as np

all_distances, all_rssi_values = [], []
# for each measurement in the dataset
for index, point in measurements.iterrows():
    # filter out -100.0 RSSI values (no signal) and get only the connected beacons
    point = point[point != -100.0]

    # convert the beacon labels to integers
    beacons_connected = [
        int(label) for label in point.index[:-4]
    ]  # the last 4 columns are not beacons

    # get positions of connected beacons
    beacons_connected_pos = beacons.loc[beacons_connected][["lon", "lat"]]

    # calculate distances and get RSSI values using the haversine formula
    for b, beacon in enumerate(beacons_connected_pos.index):
        all_distances.append(
            haversine(
                point["lon"],
                point["lat"],
                beacons_connected_pos.loc[beacon]["lon"],
                beacons_connected_pos.loc[beacon]["lat"],
            )
        )
        
        # add the RSSI value of the beacon to the list
        all_rssi_values.append(point[b])

# create a DataFrame from the collected data
df = pd.DataFrame(
    {
        "Distance": np.floor(
            all_distances
        ),  # round down distances to the nearest integer to group them by distance intervals
        "RSSI": all_rssi_values,
    }
)

# group RSSI values by distance intervals and calculate the mean
df_mean = df.groupby("Distance")["RSSI"].mean().reset_index()

# plot the mean RSSI values per distance interval
fig = px.scatter(
    df_mean,
    x="Distance",
    y="RSSI",
    title="Mean RSSI values per distance interval",
    labels={"Distance": "Distance (m)", "RSSI": "Mean RSSI value (dBm)"},
    trendline="ols", # linear regression trendline
)

fig.show()

In [59]:
# group RSSI values by distance intervals and calculate the mean and the number of samples
df_grouped = (
    df.groupby("Distance").agg({"RSSI": ["mean"], "Distance": ["count"]}).reset_index()
)
df_grouped.columns = ["Distance", "RSSI", "Count"]

# plot the mean RSSI values per distance interval and the number of samples
fig = px.scatter(
    df_grouped,
    x="Distance",
    y="RSSI",
    size="Count",
    title="Mean RSSI values per distance interval",
    labels={
        "Distance": "Distance (m)",
        "Mean RSSI": "Mean RSSI value (dBm)",
        "Count": "Number of samples",
    },
    trendline="ols",
     trendline_options=dict(log_x=True) # logarithmic regression trendline (dBm is a logarithmic unit)
)
fig.show()

In [60]:
df.head()


Unnamed: 0,Distance,RSSI
0,9.0,-96.0
1,14.0,-82.0
2,45.0,-76.0
3,16.0,-92.0
4,25.0,-81.0


In [61]:
measurements.head()


Unnamed: 0,0,1,2,4,5,6,7,8,9,10,...,279,280,281,291,292,298,vague,Point,lat,lon
0,-100.0,-100.0,-100.0,-100.0,-100.0,-96.0,-100.0,-100.0,-82.0,-76.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
1,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
2,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
3,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338
4,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-89.0,...,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,0,R6,48.84473,2.37338


In [62]:
beacons.head()


Unnamed: 0,lon,lat,alt,label
0,2.375836,48.843473,0.0,MD7UHV2
1,2.376041,48.843578,0.0,2MNSKM1
2,2.376325,48.843404,0.0,NPG0662
3,2.37602,48.844217,0.0,VQEBGW1
4,2.375881,48.844107,0.0,J60TU70


In [63]:
# for each beacon, calculate distance from the measurement to the beacon and add it as a new column
for col in measurements.columns[:-4]:
    beacon_lat = beacons.loc[int(col), "lat"]
    beacon_lon = beacons.loc[int(col), "lon"]
    measurements[f"distance_to_beacon_{int(col)}"] = measurements.apply(
        lambda row: haversine(row["lat"], row["lon"], beacon_lat, beacon_lon), axis=1
    )

measurements.head()


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Unnamed: 0,0,1,2,4,5,6,7,8,9,10,...,distance_to_beacon_266,distance_to_beacon_267,distance_to_beacon_271,distance_to_beacon_278,distance_to_beacon_279,distance_to_beacon_280,distance_to_beacon_281,distance_to_beacon_291,distance_to_beacon_292,distance_to_beacon_298
0,-100.0,-100.0,-100.0,-100.0,-100.0,-96.0,-100.0,-100.0,-82.0,-76.0,...,185.230023,206.672711,27.862484,167.462752,218.843917,375.826658,341.547482,106.542882,162.846616,111.162804
1,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,185.230023,206.672711,27.862484,167.462752,218.843917,375.826658,341.547482,106.542882,162.846616,111.162804
2,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,185.230023,206.672711,27.862484,167.462752,218.843917,375.826658,341.547482,106.542882,162.846616,111.162804
3,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,185.230023,206.672711,27.862484,167.462752,218.843917,375.826658,341.547482,106.542882,162.846616,111.162804
4,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-89.0,...,185.230023,206.672711,27.862484,167.462752,218.843917,375.826658,341.547482,106.542882,162.846616,111.162804


In [64]:
# split the dataset into train and test sets, and drop unnecessary columns
train_classic_split = measurements.sample(frac=0.8, random_state=0).drop(columns=['vague', 'Point'])
test_classic_split = measurements.drop(train_classic_split.index).drop(columns=['vague', 'Point'])

In [2]:
# using the log-distance path loss model, calculate the distance from the measurement to the beacon
def estimate_distance(rssi, tx_power, n):
    return 10 ** ((tx_power - rssi) / (10 * n))

# just for test
rssi_1 = -100.0
rssi_2 = -50.0
tx_power = 73.9115
n = 7.18812
estimate_distance(rssi_1, tx_power, n), estimate_distance(rssi_2, tx_power, n)

(262.68154305376646, 52.946515928783874)

In [66]:
# for the test set, estimate the distance from the measurement to the beacon
for col in test_classic_split.columns:
    if 'distance' in col:
        # get the beacon number from the column name
        beacon = int(col.split('_')[-1])
        # apply the log-distance path loss model to the RSSI values of the beacon
        test_classic_split[col] = test_classic_split[str(beacon)].apply(lambda rssi: estimate_distance(rssi, tx_power, n))

test_classic_split.head()

Unnamed: 0,0,1,2,4,5,6,7,8,9,10,...,distance_to_beacon_266,distance_to_beacon_267,distance_to_beacon_271,distance_to_beacon_278,distance_to_beacon_279,distance_to_beacon_280,distance_to_beacon_281,distance_to_beacon_291,distance_to_beacon_292,distance_to_beacon_298
0,-100.0,-100.0,-100.0,-100.0,-100.0,-96.0,-100.0,-100.0,-82.0,-76.0,...,262.681543,262.681543,216.749697,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543
13,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543
21,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543
25,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543
26,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,-100.0,...,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543,262.681543


In [67]:
# creating the train and test sets for with a classic split
# train is everything except the lon/lat, test is lon/lat
x_train_classic_split = train_classic_split.drop(columns=["lat", "lon"])
y_train_classic_split = train_classic_split[["lat", "lon"]]
x_test_classic_split = test_classic_split.drop(columns=["lat", "lon"])
y_test_classic_split = test_classic_split[["lat", "lon"]]

# save the train and test sets to csv files
x_train_classic_split = pd.DataFrame(x_train_classic_split)
y_train_classic_split = pd.DataFrame(y_train_classic_split)
x_test_classic_split = pd.DataFrame(x_test_classic_split)
y_test_classic_split = pd.DataFrame(y_test_classic_split)

x_train_classic_split.to_csv("../data/classic_split/train/x_train.csv")
y_train_classic_split.to_csv("../data/classic_split/train/y_train.csv")
x_test_classic_split.to_csv("../data/classic_split/test/x_test.csv")
y_test_classic_split.to_csv("../data/classic_split/test/y_test.csv")

In [68]:
# To make the train and tests set, select 10 unique values from the Point column of the measurements dataframe.
# These 10 values will be the test set, and the rest will be the train set.
measurement_points = measurements["Point"].unique()

train_point_split = measurements[~measurements["Point"].isin(measurement_points[:10])].drop(columns=["vague", "Point"])
test_point_split = measurements[measurements["Point"].isin(measurement_points[:10])].drop(columns=["vague", "Point"])

x_train = train_point_split.drop(columns=["lat", "lon"])
y_train = train_point_split[["lat", "lon"]]

x_test = test_point_split.drop(columns=["lat", "lon"])
y_test = test_point_split[["lat", "lon"]]

# Save the train and test sets to csv files after converting them to pandas dataframes
x_train = pd.DataFrame(x_train)
y_train = pd.DataFrame(y_train)
x_test = pd.DataFrame(x_test)
y_test = pd.DataFrame(y_test)

x_train.to_csv("../data/point_split/train/x_train.csv")
y_train.to_csv("../data/point_split//train/y_train.csv")
x_test.to_csv("../data/point_split/test/x_test.csv")
y_test.to_csv("../data/point_split/test/y_test.csv")

In [69]:
wave_0 = measurements[measurements["vague"] == 0]
wave_1 = measurements[measurements["vague"] == 1]

train_wave_split = wave_0.drop(columns=["vague", "Point"])
test_wave_split = wave_1.drop(columns=["vague", "Point"])

x_train_wave_split = train_wave_split.drop(columns=["lat", "lon"])
y_train_wave_split = train_wave_split[["lat", "lon"]]
x_test_wave_split = test_wave_split.drop(columns=["lat", "lon"])
y_test_wave_split = test_wave_split[["lat", "lon"]]

# Save the train and test sets to csv files after converting them to pandas dataframes
x_train_wave_split = pd.DataFrame(x_train_wave_split)
y_train_wave_split = pd.DataFrame(y_train_wave_split)
x_test_wave_split = pd.DataFrame(x_test_wave_split)
y_test_wave_split = pd.DataFrame(y_test_wave_split)

x_train_wave_split.to_csv("../data/wave_split/train/x_train.csv")
y_train_wave_split.to_csv("../data/wave_split/train/y_train.csv")
x_test_wave_split.to_csv("../data/wave_split/test/x_test.csv")
y_test_wave_split.to_csv("../data/wave_split/test/y_test_.csv")

In [70]:
train_classic_split = measurements.sample(frac=0.8, random_state=0).drop(columns=['vague', 'Point'])
test_classic_split = measurements.drop(train_classic_split.index).drop(columns=['vague', 'Point'])

x_train_classic_split = train_classic_split.drop(columns=["lat", "lon"])
y_train_classic_split = train_classic_split[["lat", "lon"]]
x_test_classic_split = test_classic_split.drop(columns=["lat", "lon"])
y_test_classic_split = test_classic_split[["lat", "lon"]]

# Save the train and test sets to csv files after converting them to pandas dataframes
x_train_classic_split = pd.DataFrame(x_train_classic_split)
y_train_classic_split = pd.DataFrame(y_train_classic_split)
x_test_classic_split = pd.DataFrame(x_test_classic_split)
y_test_classic_split = pd.DataFrame(y_test_classic_split)

x_train_classic_split.to_csv("../data/classic_split/train/x_train.csv")
y_train_classic_split.to_csv("../data/classic_split/train/y_train.csv")
x_test_classic_split.to_csv("../data/classic_split/test/x_test.csv")
y_test_classic_split.to_csv("../data/classic_split/test/y_test.csv")