### This notebook attempted to generate 500 millions simulated weather balloon data

In [1]:
from datetime import datetime, timedelta
import pandas as pd
import numpy as np
import random
from tqdm import tqdm
from tqdm.notebook import tqdm_notebook

import logging

import matplotlib.pyplot as plt


#### Next, we define some basic settings for each field we want to generated, this would effect our output data:
#### Set number of samples N_SAMPLES=500,000,000
#### Set temperature field: in range(-273, 100), and variance of temperature
#### Set location (latitude, longitude): latitude in range(-90, 90), longitude in range(-180, 180), and setup variance in each of them
#### Set observatory site: country code for 245 countries https://developers.google.cn/public-data/docs/canonical/countries_csv

In [2]:
%%time

# Each field are generate independently

N_SAMPLES=500000000

# Timestamp configuration
current_time="1902-01-01 00:00:00"
current_time = datetime.strptime(current_time, "%Y-%m-%d %H:%M:%S")
print("Initial starting timestamp defined.")

# Temperature configuration
init_tmp = random.randint(-273, 300)
variance_temp = 20

# Location configuration
long_max, long_min = 180, -180
lat_max, lat_min = 90, -90
variance_long = 10
variance_lat = 5
init_long = random.random()*( long_min + (long_max - long_min) )
init_lat = random.random()* ( lat_min + (lat_max - lat_min) )
long_lat_current = [init_long, init_lat] # Initial location

print("Location settings defined.")

# Observatory configuration
code = pd.read_csv("country_code.csv") # Contains 245 country codes
code = code["code"].dropna()
code.replace("GB", "UK", inplace = True)
print("Country code defined.")


Initial starting timestamp defined.
Location settings defined.
Country code defined.
CPU times: user 12.7 ms, sys: 1.22 ms, total: 13.9 ms
Wall time: 335 ms


#### Setup missing probability for timestamp
#### Generate probability of missing for timestamp by random(), and create the expected generated samples by N_SAMPLES*(1 + missing_probability)

In [4]:
missing_proba = round(random.uniform(0, 0.3), 2)

n_timestamp = N_SAMPLES*(1 + missing_proba)
n_timestamp = int(n_timestamp)

print("Missing probability for timestamp: ", missing_proba)
print("Total number of timestamp we generate:", n_timestamp)

Missing probability for timestamp:  0.25
Total number of timestamp we generate: 625000000


In [3]:
import gc
gc.collect()


0

In [7]:

timestamp_samples = [current_time + timedelta(minutes=mi) for mi in range(n_timestamp)]

print("Generate {} timestamp finished.".format(n_timestamp))

Generate 625000000 timestamp finished.


In [9]:

with open("timestamp_sample.txt", "w") as file:
    for row in timestamp_samples:
        sample = row.strftime("%Y-%m-%d %H:%M:%S")
        file.write(sample + "\n")

print("Write timestamp data to txt file.")

Write timestamp data to txt file.


In [None]:
%%time

with open("./generated_data/timestamp_sample.txt", "r") as file:
    timestamp_samples = file.read()
    
print("Check timestamp data generated.")

timestamp_samples[0:5]

#### Convert timestamp data to list, and random select 500,000,000 samples from it

In [17]:
%%time
timestamp_samples = list(timestamp_samples)

CPU times: user 39min 38s, sys: 2min 8s, total: 41min 47s
Wall time: 43min 8s


In [None]:
%%time

timestamp_data = random.choices(timestamp_samples, k=N_SAMPLES)

print(timestamp_data[0:5])

#### Generate around 2 millions of samples for each observatory site, save as country_list.csv

In [3]:
#%%time 

country_df = pd.read_csv("country_list.csv")

country_df.shape

(500000000, 1)

In [8]:
country_df.head()

Unnamed: 0,country_code
0,BG
1,TT
2,KR
3,TP
4,SE


#### Display number of samples for each observatory site

In [4]:
%%time

country_cnt = country_df.groupby(['country_code']).size()
country_cnt

CPU times: user 25.2 s, sys: 14.7 s, total: 39.9 s
Wall time: 46.2 s


country_code
AD    2041896
AE    2042104
AF    2041586
AG    2039300
AI    2039408
       ...   
YE    2040929
YT    2042763
ZA    2040002
ZM    2040939
ZW    2040766
Length: 245, dtype: int64

#### Generate simulated data: temperature & location
#### We simulate data in each observatory site, the weather balloon scan around an area (latitude, longitude) and detects temperature, thus we generate random value with an initial value and a random variant
#### First, generate temperature around each site

In [43]:
%%time

temp_variance = 3

temp_values = {}

for country in list(country_cnt.index):
    init_temp = random.randint(-273, 100)
    country_temp = np.random.normal(init_temp, 0.1, country_cnt[country])

    temp_values[country] = country_temp
    gc.collect()



CPU times: user 23.3 s, sys: 964 ms, total: 24.3 s
Wall time: 24.3 s


In [50]:
x = temp_values['AD'].reshape(len(temp_values['AD']), 1)
x.shape

(2041896, 1)

#### Save temperature data to composite dataframe

In [91]:
%%time


country_df["temp"] = None

for i in tqdm(range(len(country_cnt))):
    country = country_cnt.index[i]

    country_df.loc[country_df["country_code"] == country, "temp"] = temp_values[country]
    

gc.collect()

100%|██████████████████████████████████████████████████████████| 245/245 [1:36:27<00:00, 23.62s/it]

CPU times: user 1h 33min 21s, sys: 2min 21s, total: 1h 35min 43s
Wall time: 1h 36min 38s





23

In [92]:
%%time

from tqdm import tqdm
# Save data to file first after a period

country_df.to_csv("country_df_temp.csv", index = False)

CPU times: user 15min 1s, sys: 1min 6s, total: 16min 8s
Wall time: 17min 8s


In [3]:
country_df = pd.read_csv("country_df_temp.csv")
country_df = country_df["country_code"]

#### Next, generate longitude for each observatory site
#### For each site, generate random initial longitude coordinate, add random variant to create simulated samples

In [7]:
%%time

# location

country_location_long = {}

for country in tqdm(list(country_cnt.index)):
    # Random initial location for each country
    init_long = random.random()*( long_min + (long_max - long_min) )
    samples_long = np.random.normal(init_long, variance_long, country_cnt[country])

    
    country_location_long[country] = samples_long
    gc.collect()
    

100%|████████████████████████████████████████████████████████████| 245/245 [00:23<00:00, 10.58it/s]

CPU times: user 22.3 s, sys: 867 ms, total: 23.2 s
Wall time: 23.2 s





#### Next, generate latitude data for each site

In [9]:
country_location_lat = {}

for country in tqdm(list(country_cnt.index)):
    # Random initial location for each country
    init_lat = random.random()*( lat_min + (lat_max - lat_min) )
    samples_lat = np.random.normal(init_lat, variance_lat, country_cnt[country])

    
    country_location_lat[country] = samples_lat
    gc.collect()

100%|████████████████████████████████████████████████████████████| 245/245 [00:22<00:00, 10.71it/s]


#### After generate the simulated location, save data to composite dataframe
#### Save longitude data to dataframe

In [11]:
%%time

country_df["longitude"] = None

for i in tqdm(range(len(country_cnt))):
    country = country_cnt.index[i]
    country_df.loc[country_df["country_code"] == country, "longitude"] = country_location_long[country]
    
    gc.collect()

100%|██████████████████████████████████████████████████████████| 245/245 [1:55:38<00:00, 28.32s/it]

CPU times: user 1h 36min 27s, sys: 12min 26s, total: 1h 48min 54s
Wall time: 1h 55min 43s





In [12]:
country_df.head()

Unnamed: 0,country_code,longitude
0,BG,130.494
1,TT,181.89
2,KR,29.1778
3,TP,108.327
4,SE,119.197


In [13]:
%%time

country_df["longitude"].to_csv("country_df_longitude.csv", index = False)



CPU times: user 14min 34s, sys: 2min 54s, total: 17min 28s
Wall time: 19min 46s


#### Save latitude data to dataframe

In [15]:
%%time

country_df["latitude"] = None

for i in tqdm(range(len(country_cnt))):
    country = country_cnt.index[i]
    country_df.loc[country_df["country_code"] == country, "latitude"] = country_location_lat[country]
    
    gc.collect()
    
    

100%|██████████████████████████████████████████████████████████| 245/245 [1:36:30<00:00, 23.63s/it]

CPU times: user 1h 29min 57s, sys: 4min 16s, total: 1h 34min 13s
Wall time: 1h 36min 33s





In [16]:
country_df.head()

Unnamed: 0,country_code,longitude,latitude
0,BG,130.494,28.0908
1,TT,181.89,30.8252
2,KR,29.1778,76.6717
3,TP,108.327,51.6136
4,SE,119.197,12.0121


In [17]:
%%time

country_df["latitude"].to_csv("country_df_latitude.csv", index = False)


CPU times: user 14min 9s, sys: 2min 18s, total: 16min 27s
Wall time: 18min 5s


In [1]:
%%time

import pandas as pd
import random

# Get N_samples from timestamp

timestamp = pd.read_csv("timestamp_sample.txt", header=None)

timestamp.head()



CPU times: user 4min 41s, sys: 5min 12s, total: 9min 54s
Wall time: 13min 27s


Unnamed: 0,0
0,1902-01-01 00:00:00
1,1902-01-01 00:01:00
2,1902-01-01 00:02:00
3,1902-01-01 00:03:00
4,1902-01-01 00:04:00


### Finally we get the generated data: 
#### timestamp_sample.txt
#### country_df_latitude.csv
#### country_df_longitude.csv
#### country_df_temp.csv (with country code and temperature)
### We would randomly insert NULL values into fields other then timestamp to simulate the reliability of data collection, which would be done in database server