In [None]:
# REBS: the leaked public messages.
# (Kept as a comment: a bare `REBS` name raises NameError on a fresh kernel.)
# Each line is parsed with a regex into (t, msg_type, reb, msg_content) and
# loaded straight into a DataFrame.

import pandas as pd

# NOTE(review): absolute, machine-specific path -- consider a configurable data dir.
REBS_PUBLIC_PATH = 'C:/Users/Johan/opg_RR/out/0001_public.txt'
# regex from rebel_decode.py; raw string so \d and \w are not treated as
# (invalid) string escapes.
REBS_LINE_PATTERN = r't=(\d+), (\w+), (\w+), (.*)'

# Using the pattern as the separator makes the python engine split every line
# on it; the four capture groups become columns and the empty pieces around
# the match become all-NaN columns, which dropna removes.
rebs_df2 = pd.read_csv(
    REBS_PUBLIC_PATH,
    header=None,
    engine='python',
    sep=REBS_LINE_PATTERN,
).dropna(how='all', axis=1)
rebs_df2.columns = ['t', 'msg_type', 'reb', 'msg_content']

# reset_index returns a new frame, so the result has to be assigned.
rebs_df2 = rebs_df2.sort_values('reb').reset_index(drop=True)

# what type of leaks are most common?
# can be driven by two factors:
# travellers leaking cotraveller also leak more often, or
# there are more travellers that leak cotraveller than travellers leaking other msg_types
msg_type_counts = rebs_df2.value_counts('msg_type')

# Restructure the messages from long to wide: one row per (t, reb).
is_loc = rebs_df2['msg_type'] == 'LOC'

# COT / NEA contents become the columns msg_content_COT / msg_content_NEA.
# `values` is explicit so that any extra columns on rebs_df2 (x, y, z are
# added just below) can never leak into the pivot.
rebs_df3 = (
    rebs_df2.loc[rebs_df2['msg_type'].isin(['COT', 'NEA'])]
    .pivot(index=['t', 'reb'], columns='msg_type', values='msg_content')
    .add_prefix('msg_content_')
    .rename_axis(columns=None)
    .reset_index()
)

# LOC contents carry the coordinates in parentheses and cannot be pivoted:
# strip the parentheses, then split on ',' into numeric x, y, z columns.
rebs_df2.loc[is_loc, 'msg_content'] = (
    rebs_df2.loc[is_loc, 'msg_content'].str.replace(r'[()]', '', regex=True)
)
rebs_df2[['x', 'y', 'z']] = (
    rebs_df2.loc[is_loc, 'msg_content'].str.split(',', expand=True).astype(float)
)

# .copy() so that adding columns to rebs_df4 later does not raise
# SettingWithCopyWarning.
rebs_df4 = rebs_df2.loc[is_loc, ['t', 'reb', 'x', 'y', 'z']].copy()

# Outer-join instead of stacking the two frames: a rebel that leaks both a
# LOC and a COT/NEA message at the same t then ends up on ONE row, which keeps
# (t, reb) usable as a key when the frame is re-indexed later on.
rebs_df5 = (
    rebs_df3.merge(rebs_df4, on=['t', 'reb'], how='outer')
    .sort_values(['t', 'reb'])
    .reset_index(drop=True)
)



# data types
# NOTE(review): a bare `rebs_df5.convert_dtypes()` returns a new frame and leaves
# rebs_df5 untouched, so it was a no-op and is dropped here. Assign its result
# explicitly if nullable dtypes are actually wanted.
rebs_df5[['x', 'y', 'z']] = rebs_df5[['x', 'y', 'z']].astype(float)

# expand df size to mission statement of 1000*reb rows
N_TIME_STEPS = 1000

# Build the full (reb, t) grid and reindex onto it. Unlike groupby().apply(),
# this does not depend on whether pandas passes the grouping column into the
# applied function (deprecated in pandas 2.2). Rebels are sorted to keep the
# row order a groupby would produce; (reb, t) pairs with no leak become NaN
# rows and t values outside 1..N_TIME_STEPS are dropped.
all_rebs = sorted(rebs_df5['reb'].dropna().unique())
full_index = pd.MultiIndex.from_product(
    [all_rebs, range(1, N_TIME_STEPS + 1)], names=['reb', 't']
)
rebs_df6 = rebs_df5.set_index(['reb', 't']).reindex(full_index).reset_index()

# inspect
rebs_df6.describe(include='all')



# From mission statement we know with certainty where some rebels are. 
# We also know with certainty that some rebels travel together at all times.
# We can tie the rebels together using the names and the leaked cotraveller
# names (including who leaked the names). This may enable us to impute values. 

import networkx as nx

# Only rows that carry a leaked cotraveller name go into the graph
# (NaNs are unwanted in the network).
rebs_df6_sub_no_na = rebs_df6[rebs_df6['msg_content_COT'].notna()]
relations = nx.from_pandas_edgelist(rebs_df6_sub_no_na, 'reb', 'msg_content_COT')

# Each connected component with more than one member is treated as one ship;
# ships are numbered from 1 in the order the components are produced, and
# every name in a component is mapped to that number.
multi_rebel_groups = [
    group for group in nx.connected_components(relations) if len(group) > 1
]
ships = {
    rebel: ship_no
    for ship_no, group in enumerate(multi_rebel_groups, start=1)
    for rebel in group
}

pd.Series(ships)  # list all passengers of each ship in the graded assignment

# Look up every rebel's ship number and store it on the frame.
rebs_df6['ship'] = rebs_df6['reb'].map(ships)

# Quick inspection: overall summary, rows per ship, and the number of rows
# without a ship (the author's note: all rebels are aboard a ship).
rebs_df6.describe(include='all')
rebs_df6.value_counts('ship')
rebs_df6['ship'].isnull().sum()

# Because we now know which ship each rebel is aboard, msg_content_COT holds
# no additional information: drop it.
rebs_df7 = rebs_df6.drop(columns='msg_content_COT')

# Knowing the ships of all rebels, we can impute known x,y,z coordinates
# of some rebels at a given time t to cotravellers on the same ship,
# i.e. join on time and ship number to get LOC with certainty.

# assign() builds a new frame instead of writing into what may be a slice of
# rebs_df2, which avoids SettingWithCopyWarning.
rebs_df4 = rebs_df4.assign(ship=rebs_df4['reb'].map(ships))  # add ships to rebs with LOC leaks

# pandas matches NaN merge keys to each other, so LOC rows without a ship are
# left out of the join; otherwise every ship-less rebel would be paired with
# every other ship-less rebel at the same t.
loc_with_ship = rebs_df4.dropna(subset=['ship'])

rebs_df8 = rebs_df7[['reb', 't', 'ship', 'msg_content_NEA']].merge(
    loc_with_ship, on=['t', 'ship'], how='left', suffixes=('_left', '_right')
)
rebs_df8[['x', 'y', 'z']] = rebs_df8[['x', 'y', 'z']].astype(float)

# The join is on (t, ship), not on the rebel, so a rebel gets one row per
# shipmate that leaked a LOC at that t. `test` holds every such repeat except
# the first occurrence, `test2` every repeat except the last
# (author's note: 143 dups because merge not on rebels).
dup_keys = ['t', 'reb_left', 'ship']
test = rebs_df8[rebs_df8.duplicated(subset=dup_keys, keep='first')]
test2 = rebs_df8[rebs_df8.duplicated(subset=dup_keys, keep='last')]
# rebs_df8.describe(include='all') # 143 dups
# rebs_df8[rebs_df8.duplicated(subset=['t','reb_right','ship'], keep='first')]


rebs_df7[rebs_df7.duplicated()]

# # fundamentally wrong wrong approach--to drop dups on merge right df--because 
# rebs_df8 = rebs_df7.merge(rebs_df4_2.drop_duplicates(subset=['t','ship']), on=['t','ship'], how='left', suffixes=('_left','_right')) 
# # drop dups afterwards: warning: are we dropping the correct rows?
# rebs_df8 = rebs_df7.merge(rebs_df4_2, on=['t','ship'], how='left', suffixes=('_left','_right')).drop_duplicates(subset=['t','reb','ship'])




In [None]:
# edit

# Every rebel's ship is known, so a position leaked by one rebel at time t
# can be attached to everybody aboard the same ship at that t:
# left-join the leaked LOCs on (t, ship).
rebs_df = pd.merge(rebs_df, LOC, how='left', on=['t', 'ship'])
rebs_df.describe(include='all')

# Flag repeated (t, messenger_x, ship) rows produced by the join.
rebs_df = rebs_df.assign(
    duplicate=rebs_df.duplicated(subset=['t', 'messenger_x', 'ship'])
)

# It turns out that, at least in 0001_public and 0001_truth, rebels on the
# same ship do not always leak identical positions for a given time.
# E.g. Yolanda and Steve are on the same ship, and yet at T443 their public
# positions differ somewhat, while their true positions do not.
# The leaked signals are therefore either false (biased) or erroneous.

# We already know some coordinates with certainty
LOC.describe(include='all')

# approach to calculating the analytical function

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Load the dataset into a pandas DataFrame
data = pd.read_csv('mydata.csv')

# Columns that define a group (time plus the dimensions that vary with time)
group_cols = ['t', 'dimension1', 'dimension2']

# 'column_of_interest' has to be kept: it is the column whose non-NaN share is
# measured below, and selecting it after it was filtered out raises KeyError.
cols_to_keep = ['t', 'dimension1', 'dimension2', 'rate', 'column_of_interest']

# Drop only rows whose grouping keys are missing. A blanket dropna() would
# remove exactly the NaNs that are being counted and pin every rate at 1.
data = data[cols_to_keep].dropna(subset=group_cols)

# Group the data by time and the other dimensions that vary with time
grouped_data = data.groupby(group_cols)

# Number of non-NaN values per group
non_nan_counts = grouped_data['column_of_interest'].count()

# Total number of rows (including NaN) per group
total_counts = grouped_data.size()

# Rate of non-NaN values = non-NaN count / total count, per group
rates = non_nan_counts / total_counts

# Compute the rates of observations for each group
# rates = grouped_data['rate'].mean()

# Visualize the rates of observations over time. `rates` is indexed by
# (t, dimension1, dimension2); average over the two dimensions so the x axis
# really is time, as the axis label says.
rates_over_time = rates.groupby(level='t').mean()
plt.plot(rates_over_time.index, rates_over_time.to_numpy())
plt.xlabel('Time')
plt.ylabel('Rate')
plt.show()

# Fit a suitable analytical function to the rates of observations using regression analysis
from scipy.optimize import curve_fit

def analytical_function(t, a, b, c):
    return a *




In [None]:

# Parked draft, kept as an inert string literal: the notes and the code inside
# it are never executed.
# NOTE(review): if this is revived, `rebs_df['x','y','z']` passes one tuple as
# a single key; selecting several columns needs a list, i.e.
# `rebs_df[['x','y','z']]` (same for the `_avg` columns) -- confirm against the
# actual columns of rebs_df before re-enabling.
"""_summary_

It turns out that, at least in 0001_public and 0001_truth, some rebels do not
leak identical information about their positions at a given time; resolution varies.
E.g. Yolanda and Steve are in the same ship, and yet at T443 they are located
somewhat differently according to public information, but not according to truth.
The leaked signals are either false (biased) or erronous.

# impute averages of leaked coordinates per ship at a given time on the rest
# LOC['ship'] = LOC['messenger'].map(ships)
# LOC.describe(include='all') # do we have some coordinates of all ships?
ship_LOC_avg = LOC.groupby(['t','ship'], as_index=False).mean(numeric_only=True)[['t','ship','x','y','z']]
ship_LOC_avg.columns=['t', 'ship', 'x_avg', 'y_avg','z_avg']
rebs_df = rebs_df.merge(ship_LOC_avg, how='left',on=['ship','t'])
rebs_df['x','y','z'] = rebs_df['x','y','z'].fillna(rebs_df['x_avg','y_avg','z_avg'])
rebs_df
rebs_df.describe(include='all')
# perhaps forward fill NaN with some regression between values
"""


In [None]:
# Can we treat leaked positions (and perhaps NEA) as samples of the truth?
# df_rebs['x','x_truth'].hist(by=df['msg_type']) # or perhaps t

# If we may, then perhaps we can 
# Does the distribution of the sample differ from the truth? => then not missing completely at random



    # if true ship movements are really not completely random
    # then it informs how we can handle (impute) missing values
    # of columns predicting about movements.
    # mis complete rand: mean, median, mode, etc
    # mis at rand: multiple imput, regress imput
    # miss not at rand: pattern substitution, maximum likelihood estimation




# df_rebs['x','x_truth'].hist(by=df['msg_type']) # or perhaps t


# add leaked coordinates
# rebs_df = pd.merge(rebs_df,LOC[['messenger','t','x','y','z']], how='left',on=['messenger','t'])
    # must related to coordinates of all shipmembers, but we dont know how
    # bias and error: what function maps signals from truth?
    # If the missingness of the data can be explained by confounders/variables we observe
    # then may assume the leaked coordinates are missing at random.
    # If the distribution of xyz is likely to be similar for 
    # 1) rebel and t and 2) msg_type(!)


# # https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.interpolate.html
# plot timeseries ts['t','x'] 
# ts.plot()
# ts.interpolate().plot() # https://pandas.pydata.org/docs/user_guide/missing_data.html

# # test different methods
# methods = ["linear", "quadratic", "cubic"] # interpolation; Akima for smoothness?
# df = pd.DataFrame({m: ser.interpolate(method=m) for m in methods})
# df.plot()

In [None]:
# Compare distributions to infer missing at random or completely at random .
# We might compare distributions of positions at NEA leaks--which have no error--
# and where position is equal to true position, with distributions of all positions;
# because that might show that 

# messages = truth.get_messages()
# messages['ship'] = messages['shipid'].apply(lambda shipid_x: int(shipid_x.split('_')[1])) # split, take the last item, to int
# messages.rename({'x': 'x_truth', 'y': 'y_truth', 'z': 'z_truth'}, axis=1, inplace=True)
# messages = messages[['t','x_truth','y_truth','z_truth','ship','msg']]
# # sample_true = messages.loc[messages['msg'] == 'NEA']
# rebs_df_wtruth.describe()
# # messages



In [None]:
# compare distribution of leaked positions by ship with true positions by ship, at t
# to understand the error/bias a bit better, perhaps so as to impute on an educated guess
# (and by implication, without multiple imputation)


In [None]:

# We may also get a sense of the distributions, and therefore
# the probabilities, of the respective dimensions,
# unconditional on the factors necessary to define rebel movement.
# `entropy` is no longer used here; it stays imported in case later cells use it.
from scipy.stats import differential_entropy, entropy, shapiro

truth_cols = ['x_truth', 'y_truth', 'z_truth']

# Histograms of the true coordinates: first one per Series, then the
# per-column panel drawn by DataFrame.hist.
for col in ('z_truth', 'x_truth', 'y_truth'):
    ship_movements[col].hist()
ship_movements[truth_cols].hist()

# Shape of each marginal distribution
for axis_name in ('z', 'x', 'y'):
    print(f'{axis_name}_skew', ship_movements[f'{axis_name}_truth'].skew())
for axis_name in ('z', 'x', 'y'):
    print(f'{axis_name}_kurt', ship_movements[f'{axis_name}_truth'].kurt())

# Shapiro-Wilk is a univariate test. Passing the three columns at once pools
# all coordinates into one sample, which is not a test of the joint
# distribution, so each coordinate is tested on its own.
# NOTE(review): SciPy warns that the p-value may be inaccurate for N > 5000.
for col in truth_cols:
    test_stat, p_value = shapiro(ship_movements[col].dropna())
    print(col, 'Shapiro-Wilk test statistic:', test_stat, ', ', 'p-value:', p_value)

# scipy.stats.entropy expects (possibly unnormalised) probabilities, so feeding
# it raw coordinates is not meaningful (negative values give -inf/nan).
# differential_entropy estimates the entropy of a continuous distribution
# directly from the sample.
for col in truth_cols:
    print(differential_entropy(ship_movements[col].dropna().to_numpy()))