# 🎬Getting familiar with data

In [2]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import warnings
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides pandas warnings (e.g. dtype or
# chained-assignment warnings) that may matter for the cells below — confirm
# whether a narrower filter was intended.
warnings.filterwarnings('ignore')

In [3]:
# Separator (a run of '=' followed by a newline) that later cells prepend to
# their printed summaries.
print_filler = '=================================\n'

## 🔹Analyse and transform dataset 

#### 💡Load the full dataset, describe it, select features based on relevance, transform columns, and explore the chosen data with visualizations

In [4]:
# Read the raw CSV from the local `data` directory into a DataFrame.
file_name = 'Eastern flyway spring migration of adult white storks (data from Rotics et al. 2018).csv'
csv_path = os.path.join('data', file_name)
print(csv_path)
data = pd.read_csv(csv_path)

# Sanity check of what was loaded: (rows, columns) and the column labels.
print(data.shape)
print(data.columns)

data\Eastern flyway spring migration of adult white storks (data from Rotics et al. 2018).csv
(2467077, 21)
Index(['event-id', 'visible', 'timestamp', 'location-long', 'location-lat',
       'comments', 'eobs:acceleration-axes',
       'eobs:acceleration-sampling-frequency-per-axis',
       'eobs:accelerations-raw', 'eobs:status', 'eobs:type-of-fix',
       'ground-speed', 'heading', 'height-above-ellipsoid', 'start-timestamp',
       'study-specific-measurement', 'sensor-type',
       'individual-taxon-canonical-name', 'tag-local-identifier',
       'individual-local-identifier', 'study-name'],
      dtype='object')


In [5]:
# Show the first rows of the raw frame to get a feel for the columns.
head_preview = data.head()
display(head_preview)

Unnamed: 0,event-id,visible,timestamp,location-long,location-lat,comments,eobs:acceleration-axes,eobs:acceleration-sampling-frequency-per-axis,eobs:accelerations-raw,eobs:status,...,ground-speed,heading,height-above-ellipsoid,start-timestamp,study-specific-measurement,sensor-type,individual-taxon-canonical-name,tag-local-identifier,individual-local-identifier,study-name
0,7335935935,True,2012-02-14 02:00:44.000,36.701858,3.985513,,,,,A,...,0.02,358.48,670.0,,,gps,Ciconia ciconia,1787,1787/HH582,Eastern flyway spring migration of adult white...
1,7335935936,True,2012-02-14 02:05:10.000,36.70185,3.985472,,,,,A,...,0.06,349.31,667.9,,,gps,Ciconia ciconia,1787,1787/HH582,Eastern flyway spring migration of adult white...
2,7335935937,True,2012-02-14 02:10:12.000,36.701815,3.985495,,,,,A,...,0.08,0.0,668.8,,,gps,Ciconia ciconia,1787,1787/HH582,Eastern flyway spring migration of adult white...
3,7335935938,True,2012-02-14 02:15:11.000,36.701855,3.985515,,,,,A,...,0.17,9.83,666.7,,,gps,Ciconia ciconia,1787,1787/HH582,Eastern flyway spring migration of adult white...
4,7335935939,True,2012-02-14 02:20:10.000,36.701822,3.985484,,,,,A,...,0.11,336.86,667.6,,,gps,Ciconia ciconia,1787,1787/HH582,Eastern flyway spring migration of adult white...


In [6]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
numeric_summary = data.describe()
display(numeric_summary)

Unnamed: 0,event-id,location-long,location-lat,comments,eobs:acceleration-sampling-frequency-per-axis,eobs:type-of-fix,ground-speed,heading,height-above-ellipsoid,study-specific-measurement,tag-local-identifier
count,2467077.0,1207130.0,1207130.0,1259947.0,1259947.0,1207130.0,1207130.0,1207130.0,1207130.0,2345373.0,2467077.0
mean,7251066000.0,27.06471,27.20805,1.362054,10.54,3.0,3.863259,181.1902,615.8689,36535520.0,2300.97
std,83663750.0,7.560671,18.31143,1.806803,3.07603e-10,0.0,5.991217,152.2742,560.2873,20128130.0,253.6245
min,7168545000.0,10.41956,-30.58608,0.1276,10.54,3.0,0.0,0.0,-2564.8,3546255.0,1787.0
25%,7169162000.0,22.8951,12.63261,0.2808095,10.54,3.0,0.13,15.07,202.4,10644740.0,2306.0
50%,7169779000.0,28.9799,27.06366,0.764474,10.54,3.0,0.3,186.12,442.1,38204030.0,2322.0
75%,7336526000.0,33.01696,43.30978,1.81035,10.54,3.0,7.98,344.72,901.7,55272320.0,2341.0
max,7337143000.0,40.1279,54.1172,27.4271,10.54,3.0,361.62,359.79,59780.0,62384970.0,2859.0


In [7]:
# Missing values per column of the raw frame.
null_counts = data.isna().sum()
display(null_counts)

# Number of rows that have a value in every column.
rows_without_null = len(data.dropna())
print(f"{print_filler}Number of rows without null values:", rows_without_null)

event-id                                               0
visible                                                0
timestamp                                              0
location-long                                    1259947
location-lat                                     1259947
comments                                         1207130
eobs:acceleration-axes                           1207130
eobs:acceleration-sampling-frequency-per-axis    1207130
eobs:accelerations-raw                           1207130
eobs:status                                      1259947
eobs:type-of-fix                                 1259947
ground-speed                                     1259947
heading                                          1259947
height-above-ellipsoid                           1259947
start-timestamp                                  1207130
study-specific-measurement                        121704
sensor-type                                            0
individual-taxon-canonical-name

Number of rows without null values: 0


In [8]:
# Keep only the tag identifier, timestamp and position columns,
# then repeat the missing-value check on that subset.
relevant_columns = ['tag-local-identifier', 'timestamp', 'location-long', 'location-lat']
processed_data = data[relevant_columns]

null_counts = processed_data.isna().sum()
display(null_counts)

rows_without_null = len(processed_data.dropna())
print(f"{print_filler}Number of rows without null values:", rows_without_null)

tag-local-identifier          0
timestamp                     0
location-long           1259947
location-lat            1259947
dtype: int64

Number of rows without null values: 1207130


⚠️ The relevant columns we decide to keep are: `['tag-local-identifier','timestamp','location-long','location-lat']`

⚠️ We observe 1259947 records without latitude and longitude information. Since these missing values cannot be approximated, and the dataset is still large after removing them (1207130 records remain), we drop those records.

In [41]:
# Remove every row that contains a missing value in any of the kept columns
# (in this subset only the longitude/latitude columns have nulls).
processed_data = processed_data.dropna(axis=0, how='any')
print(processed_data.shape)

(1207130, 4)


In [42]:
# Inspect the column dtypes; at this point `timestamp` is still an object
# column and has not been parsed to datetimes yet.
processed_data.dtypes

tag-local-identifier      int64
timestamp                object
location-long           float64
location-lat            float64
dtype: object

In [44]:
# Peek at one raw timestamp value to see its string format before parsing.
# `.iloc[0]` reads the first element directly instead of copying the whole
# column into a Python list just to index position 0.
print(processed_data['timestamp'].iloc[0])

2012-02-14 02:00:44.000


In [45]:
# Parse the timestamp strings into datetimes and store them back in the same
# column, then show the dtypes to confirm the conversion.
parsed_timestamps = pd.to_datetime(processed_data['timestamp'])
processed_data['timestamp'] = parsed_timestamps
processed_data.dtypes

tag-local-identifier             int64
timestamp               datetime64[ns]
location-long                  float64
location-lat                   float64
dtype: object

In [55]:
# Persist the cleaned subset next to the raw file.
# The row index is written as the first CSV column (pandas default).
output_path = os.path.join('data', 'StorkMigration.csv')
processed_data.to_csv(output_path)

In [58]:
# Take the tag id of the first record and isolate all rows for that tag.
# `.iloc[0]` avoids materialising the whole column as a list to read a single
# element; int(...) yields a plain Python int rather than a NumPy scalar.
selected_id = int(processed_data['tag-local-identifier'].iloc[0])
selected_data_id = processed_data[processed_data['tag-local-identifier'] == selected_id]

# Size of that tag's track and its timestamps.
print(selected_data_id.shape)
print(selected_data_id['timestamp'])

(19172, 4)
0       2012-02-14 02:00:44
1       2012-02-14 02:05:10
2       2012-02-14 02:10:12
3       2012-02-14 02:15:11
4       2012-02-14 02:20:10
                ...        
19167   2013-04-21 19:40:16
19168   2013-04-21 20:00:13
19169   2013-04-21 20:20:31
19170   2013-04-21 20:40:32
19171   2013-04-21 21:00:12
Name: timestamp, Length: 19172, dtype: datetime64[ns]


In [76]:
# For every tag, count the number of records per calendar year.
# NOTE(review): the loop rebinds `selected_data_id`, which another cell also
# assigns; after the loop it holds the rows of the last tag iterated — confirm
# that later cells do not rely on the earlier value.
for stork in set(processed_data['tag-local-identifier']):
    selected_data_id = processed_data[processed_data['tag-local-identifier'] == stork]
    result = selected_data_id.groupby(selected_data_id['timestamp'].dt.year).count()

    # Label each table; without it the displayed tables cannot be attributed
    # to a tag, and set iteration order is not sorted.
    print(f"tag-local-identifier: {stork}")
    display(result)

Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,17498,17498,17498,17498
2013,14010,14010,14010,14010
2014,7917,7917,7917,7917
2015,4613,4613,4613,4613


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,11988,11988,11988,11988
2013,9733,9733,9733,9733
2014,12437,12437,12437,12437


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,10042,10042,10042,10042
2014,10527,10527,10527,10527


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,16172,16172,16172,16172
2013,11179,11179,11179,11179
2014,9956,9956,9956,9956


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,11826,11826,11826,11826
2014,10669,10669,10669,10669
2015,11803,11803,11803,11803


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,13635,13635,13635,13635
2014,12165,12165,12165,12165
2015,13824,13824,13824,13824
2016,12974,12974,12974,12974


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,11294,11294,11294,11294
2014,10781,10781,10781,10781
2015,16328,16328,16328,16328
2016,9578,9578,9578,9578


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,13537,13537,13537,13537
2014,10924,10924,10924,10924
2015,14553,14553,14553,14553


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,12478,12478,12478,12478


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,16372,16372,16372,16372
2014,13449,13449,13449,13449
2015,18790,18790,18790,18790
2016,13023,13023,13023,13023


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,12508,12508,12508,12508
2014,13290,13290,13290,13290
2016,11939,11939,11939,11939


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,16066,16066,16066,16066
2014,22321,22321,22321,22321
2015,13454,13454,13454,13454
2016,15892,15892,15892,15892


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,16761,16761,16761,16761


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,15643,15643,15643,15643
2014,14265,14265,14265,14265


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,15465,15465,15465,15465


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,11701,11701,11701,11701
2015,14598,14598,14598,14598


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,11704,11704,11704,11704
2014,16962,16962,16962,16962
2015,20267,20267,20267,20267
2016,16265,16265,16265,16265


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,14148,14148,14148,14148


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,12475,12475,12475,12475
2014,13034,13034,13034,13034
2015,13002,13002,13002,13002


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,9464,9464,9464,9464


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,12903,12903,12903,12903
2014,12976,12976,12976,12976
2015,13892,13892,13892,13892
2016,12807,12807,12807,12807


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,13248,13248,13248,13248
2014,13459,13459,13459,13459


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,11696,11696,11696,11696
2014,11967,11967,11967,11967
2015,13301,13301,13301,13301


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,14966,14966,14966,14966
2014,10791,10791,10791,10791
2015,15181,15181,15181,15181
2016,16974,16974,16974,16974


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,13569,13569,13569,13569


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,11283,11283,11283,11283
2015,10233,10233,10233,10233


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,14909,14909,14909,14909
2015,13760,13760,13760,13760
2016,14726,14726,14726,14726


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,15874,15874,15874,15874
2013,15386,15386,15386,15386
2014,11718,11718,11718,11718
2015,13254,13254,13254,13254


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,9660,9660,9660,9660


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,11301,11301,11301,11301


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,13947,13947,13947,13947
2015,12487,12487,12487,12487
2016,9210,9210,9210,9210


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2014,19003,19003,19003,19003
2015,13370,13370,13370,13370
2016,11931,11931,11931,11931


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2012,10410,10410,10410,10410
2013,8762,8762,8762,8762


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,15285,15285,15285,15285
2014,12891,12891,12891,12891
2015,15432,15432,15432,15432
2016,16674,16674,16674,16674


Unnamed: 0_level_0,tag-local-identifier,timestamp,location-long,location-lat
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2013,12595,12595,12595,12595


In [70]:
# Collect the distinct tag identifiers present in the cleaned data.
# NOTE(review): `selected_id` is bound to a single tag id in another cell;
# here it becomes a set of ids — confirm which meaning later cells expect.
tag_column = processed_data['tag-local-identifier']
selected_id = set(tag_column)
print(selected_id)

{1793, 1794, 2305, 1796, 1797, 2306, 2307, 2313, 2317, 2319, 2320, 2322, 2323, 2327, 2840, 2841, 2330, 2331, 2332, 2842, 2334, 2336, 2337, 2341, 2342, 2346, 2859, 1791, 2491, 2492, 2499, 2421, 1787, 2301, 2303}
