In [None]:
import lzma
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the xz-compressed profile table, drop the last three characters of every
# sequenceID, and keep only the first row for each (sequenceID, position) pair.
with lzma.open("profiles.csv.xz", 'rt') as file:
    signal_df = (
        pd.read_csv(file)
        .assign(sequenceID=lambda frame: frame['sequenceID'].str[:-3])
        .drop_duplicates(subset=['sequenceID', 'position'])
    )

# Read the label table with the same sequenceID trimming and add the span
# (end - start) of every label as `label_length`.
labels_df = (
    pd.read_csv("labels.csv")
    .assign(
        sequenceID=lambda frame: frame['sequenceID'].str[:-3],
        label_length=lambda frame: frame['end'] - frame['start'],
    )
)

In [None]:
# Distinct sequence IDs found in the profile data, in ascending order.
list_seqID = list(signal_df['sequenceID'].unique())
list_seqID.sort()

In [None]:
# One row per sequence: `length` is the number of distinct positions it has.
stat_df = (
    signal_df
    .groupby('sequenceID')['position']
    .nunique()
    .reset_index(name='length')
)

In [None]:
# Per-sequence mean label length for labels with changes == 0, as a Series
# indexed by sequenceID and named for the later merge.
neg_mean_label_length = (
    labels_df.loc[labels_df['changes'] == 0]
    .groupby('sequenceID')['label_length']
    .mean()
    .rename('avg_neg_label_length')
)

# Same aggregation for labels with changes == 1.
pos_mean_label_length = (
    labels_df.loc[labels_df['changes'] == 1]
    .groupby('sequenceID')['label_length']
    .mean()
    .rename('avg_pos_label_length')
)

In [None]:
# Put the negative and positive per-sequence means side by side. pd.merge
# defaults to an inner join, so only sequences that have BOTH a changes == 0
# and a changes == 1 label are kept.
mean_label_length_df = pd.merge(neg_mean_label_length, pos_mean_label_length, on='sequenceID')

# Attach the means to the per-sequence lengths. Merge from the two base columns
# only: `stat_df` is reassigned here, so merging the full frame would, on a
# re-run of this cell, collide with the columns added the first time and yield
# `avg_*_label_length_x` / `_y` instead of the names the plot expects.
stat_df = pd.merge(stat_df[['sequenceID', 'length']], mean_label_length_df, on='sequenceID')

In [None]:
# Scatter the per-sequence mean label length against sequence length, both on a
# log10 scale. The two label kinds get distinct colours and legend entries so
# they can be told apart in the saved figure.
log_length = np.log10(stat_df['length'])
plt.scatter(log_length, np.log10(stat_df['avg_neg_label_length']),
            color='black', label='Negative labels (changes = 0)')
plt.scatter(log_length, np.log10(stat_df['avg_pos_label_length']),
            color='red', label='Positive labels (changes = 1)')

# Add labels, title and legend
plt.xlabel('Log10 of Length')
plt.ylabel('Log10 of Label Length')
plt.title('Length vs. Label Length')
plt.legend()

# savefig does not create missing directories, so make sure the target exists.
figure_path = "figures/analyze/length_vs_label_length.png"
os.makedirs(os.path.dirname(figure_path), exist_ok=True)
plt.savefig(figure_path)