## 1. Convert .txt data to .csv

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Column names of the converted table, in the order the fields are parsed
# from each record of the text file.
columns = [
    'obs_time',
    'id',
    'type',
    'obs_umbral_area',
    'obs_whole_area',
    'corr_umbral_area',
    'corr_whole_area',
    'center_dist',
    'pos_angle',
    'carrington_lon',
    'lat',
    'center_meridian_dist',
]

In [3]:
# Parse the fixed-width records of the text file into rows and build the table
# in a single step. Rows are gathered in a plain list because growing a
# DataFrame one row at a time copies the whole frame on every iteration, and
# DataFrame.append is not available in pandas >= 2.0.
records = []
with open('datafiles/rgofull.txt', 'r') as txt_data:
    # First pass only counts the lines so the progress bar can show a total.
    num_lines = sum(1 for _ in txt_data)
    txt_data.seek(0)

    for line in tqdm(txt_data, total=num_lines):
        # Skip blank lines and records whose id field (characters 12-21) is zero.
        if not line.strip() or int(line[12:22]) == 0:
            continue

        # Characters 8-11 hold the time of observation as a fraction of a day;
        # convert it to whole seconds and split into hour / minute / second.
        daytime = float(line[8:12])
        seconds = int(daytime * 3600 * 24)
        hour, seconds = divmod(seconds, 3600)
        minute, seconds = divmod(seconds, 60)

        obs_time = pd.Timestamp(year=int(line[0:4]),
                                month=int(line[4:6]),
                                day=int(line[6:8]),
                                hour=hour,
                                minute=minute,
                                second=seconds)

        # Values are listed in the same order as the names in `columns`.
        features = [obs_time,
                    int(line[12:22]),
                    int(line[22:24]),
                    float(line[25:29]),
                    float(line[30:34]),
                    float(line[35:39]),
                    float(line[40:44]),
                    float(line[45:50]),
                    float(line[51:56]),
                    float(line[57:62]),
                    float(line[63:68]),
                    float(line[69:74])]
        records.append(features)

# An empty `records` list still yields an empty frame with the right columns.
table = pd.DataFrame(records, columns=columns)

HBox(children=(IntProgress(value=0, max=161413), HTML(value='')))




In [7]:
# Persist the converted table; the positional row index is not written.
table.to_csv(path_or_buf='datafiles/rgofull.csv', index=False)

## 2. Add missing observations as NaNs

In [None]:
# Reload the converted table and turn the obs_time column into datetime values.
data = pd.read_csv('datafiles/rgofull.csv').assign(
    obs_time=lambda frame: pd.to_datetime(frame['obs_time'])
)

In [None]:
# One group of rows per distinct id value.
grouped = data.groupby(by='id')

In [None]:
# Find the ids whose observations do not cover every calendar day between
# their first and last observation. All comparisons are done at day
# resolution: normalize() drops the time of day but keeps a datetime dtype,
# so the subtraction below yields proper timedeltas.
first_day = grouped.obs_time.min().dt.normalize()
last_day = grouped.obs_time.max().dt.normalize()
lifetimes = last_day - first_day

# Count distinct observed days rather than rows: several rows falling on the
# same day would otherwise inflate the count and could hide a real gap.
observed_days = grouped.obs_time.agg(lambda times: times.dt.normalize().nunique())

# A gap-free id has exactly (last - first).days + 1 observed days.
mask = lifetimes.dt.days + 1 != observed_days
hidden_spots = mask[mask].index.to_numpy()

In [None]:
# For every id with gaps, create filler rows (only obs_time and id set) for
# the days without an observation, then add them all to `data` at once.
# The frames are collected in a list and concatenated a single time:
# DataFrame.append is not available in pandas >= 2.0, and appending inside
# the loop would copy the whole table on every iteration.
filler_frames = []
for spot_id in tqdm(hidden_spots):
    # Observation days of this id, with the time of day dropped.
    dates = grouped.get_group(spot_id).obs_time.dt.normalize()
    dates_range = pd.date_range(dates.min(), dates.max())
    missed_dates = dates_range[~dates_range.isin(dates)]
    if len(missed_dates) == 0:
        # Nothing to fill for this id.
        continue
    filler_frames.append(pd.DataFrame({'obs_time': missed_dates, 'id': spot_id}))

# Columns that the filler frames do not have are filled with NaN by concat.
# Guard the call so an empty list leaves `data` untouched.
if filler_frames:
    data = pd.concat([data, *filler_frames], ignore_index=True)

In [None]:
# Order rows by observation time, then by id within the same time.
data = data.sort_values(by=['obs_time', 'id'])

In [None]:
# Save the table including the NaN filler rows; the row index is not written.
data.to_csv(path_or_buf='datafiles/rgofull_nan.csv', index=False)