In [2]:
import pandas as pd

In [3]:
data = pd.read_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/input_data_raw.csv', sep = ',')
print(f'Length of raw dataset: {len(data)}')
print(f'Columns in the raw dataset: {data.columns}')

Length of raw dataset: 28800
Columns in the raw dataset: Index(['Rep', 'Id', 'Pos1', 'Pos2', 'Pos3', 'G-', 'G-_pi', 'G-_sigma',
       'G-_std', 'G-_pi_std', 'G-_sigma_std', 'G+', 'G+_pi', 'G+_sigma',
       'G+_std', 'G+_pi_std', 'G+_sigma_std', 'fragment_ids', 'sequence'],
      dtype='object')


# Create Merged Data

In [4]:
# Step 1: Filter out samples where 'G+_std' or 'G-_std' is greater than 0.1
filtered_data = data[(data['G+_std'] <= 0.1) & (data['G-_std'] <= 0.1)]
print(len(filtered_data))
# Step 2: Group by unique sequence ('sequence' and 'fragment_ids' as identifiers) and aggregate
# Columns to aggregate: 'G-', 'G+', 'G+_std', 'G-_std'
# Columns to keep: 'Pos1', 'Pos2', 'Pos3', 'sequence', 'fragment_ids'

# Define the aggregation logic for each column
aggregation = {
    'G-': 'mean', 
    'G+': 'mean', 
    'G+_std': 'mean', 
    'G-_std': 'mean', 
    'Pos1': 'first', 
    'Pos2': 'first', 
    'Pos3': 'first',
    'sequence': 'first', 
    'fragment_ids': 'first'
}

# Perform the aggregation
unique_sequences_df = filtered_data.groupby(['sequence', 'fragment_ids'], as_index=False).agg(aggregation)

# Step 3: Calculate 'GFP' by subtracting 'G-' from 'G+' for each unique sequence
unique_sequences_df['GFP'] = unique_sequences_df['G+'] - unique_sequences_df['G-']

print(f'Length of filtered dataset: {len(unique_sequences_df)}')
print(f'Columns in the filtered dataset: {unique_sequences_df.columns}')

28292
Length of filtered dataset: 14301
Columns in the filtered dataset: Index(['G-', 'G+', 'G+_std', 'G-_std', 'Pos1', 'Pos2', 'Pos3', 'sequence',
       'fragment_ids', 'GFP'],
      dtype='object')


In [5]:
# Display the resulting DataFrame
unique_sequences_df.head()

Unnamed: 0,G-,G+,G+_std,G-_std,Pos1,Pos2,Pos3,sequence,fragment_ids,GFP
0,8.43692,8.720525,0.028755,0.025539,10,10,10,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...,10_10_10,0.283605
1,8.4732,8.817645,0.034078,0.042597,10,10,8,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...,10_10_08,0.344445
2,8.142875,8.5089,0.0278,0.031862,10,10,22,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...,10_10_22,0.366025
3,8.381665,8.46513,0.028409,0.027395,10,10,19,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...,10_10_19,0.083465
4,8.644885,8.768005,0.036832,0.030498,10,10,5,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...,10_10_05,0.12312


In [6]:
# Manully check if the calculation of average mean and std are correct
data[data['fragment_ids'] == '10_10_10']

Unnamed: 0,Rep,Id,Pos1,Pos2,Pos3,G-,G-_pi,G-_sigma,G-_std,G-_pi_std,G-_sigma_std,G+,G+_pi,G+_sigma,G+_std,G+_pi_std,G+_sigma_std,fragment_ids,sequence
6260,0,6510,10,10,10,8.33074,9.6e-05,1.09881,0.026609,2e-06,0.022896,8.83325,0.000125,1.26817,0.026927,3e-06,0.023331,10_10_10,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...
20660,1,6510,10,10,10,8.5431,9.9e-05,0.982973,0.024469,2e-06,0.019877,8.6078,9.1e-05,1.30493,0.030583,2e-06,0.028527,10_10_10,AAAAAAAATCTCTCATATCCTACACATCCTCAGAAGAGCTTCTATG...


In [7]:
# Get IDs of filtered samples
filtered_ids = set(data['fragment_ids']) - set(unique_sequences_df['fragment_ids'])
print(filtered_ids)

# Randomly pick a few to manually double check if the filtering is correct
data[data['fragment_ids'] == '05_11_11']

{'02_01_02', '02_01_24', '24_01_13', '20_02_06', '02_01_15', '17_05_17', '02_01_09', '16_04_08', '04_02_13', '10_12_20', '24_24_06', '08_24_02', '20_02_02', '20_01_23', '08_02_08', '13_02_04', '20_01_20', '07_02_02', '05_11_17', '04_04_05', '05_01_10', '05_24_24', '05_11_01', '13_24_13', '24_02_13', '02_04_06', '16_02_05', '24_24_12', '12_24_02', '05_01_08', '05_11_08', '20_04_04', '13_02_07', '16_09_04', '05_11_11', '16_24_10', '17_02_02', '20_01_05', '24_24_24', '05_04_05', '24_11_02', '02_24_04', '22_01_24', '05_02_17', '05_11_18', '16_02_17', '02_02_08', '05_04_08', '24_01_04', '20_02_24', '17_11_17', '05_01_07', '10_02_05', '24_23_02', '02_02_06', '04_11_04', '08_02_13', '02_24_24', '24_02_20', '24_12_13', '04_02_01', '04_02_20', '20_02_10', '20_11_04', '24_24_11', '05_04_02', '04_02_04', '24_01_23', '02_11_05', '06_04_06', '23_02_23', '12_02_13', '08_04_02', '13_11_17', '24_24_07', '05_09_04', '04_04_04', '02_01_17', '08_24_08', '08_01_08', '04_02_15', '24_24_10', '20_02_13', '05

Unnamed: 0,Rep,Id,Pos1,Pos2,Pos3,G-,G-_pi,G-_sigma,G-_std,G-_pi_std,G-_sigma_std,G+,G+_pi,G+_sigma,G+_std,G+_pi_std,G+_sigma_std,fragment_ids,sequence
6880,0,7155,5,11,11,8.4377,4e-06,0.882101,0.113136,4.77901e-07,0.087392,8.29348,5e-06,1.37431,0.135029,5.04779e-07,0.127579,05_11_11,CCGATTGCTAAGCTGCGGACAATGAGGGAAATGTAGACAAATGTCC...
21280,1,7155,5,11,11,8.92382,4e-06,1.02819,0.129185,4.07121e-07,0.116079,7.70014,2e-06,0.741133,0.13144,2.73638e-07,0.132911,05_11_11,CCGATTGCTAAGCTGCGGACAATGAGGGAAATGTAGACAAATGTCC...


In [8]:
# Calculate descriptive statistics for 'G-', 'G+', and 'GFP'
scale_info = unique_sequences_df[['G-', 'G+', 'GFP']].agg(['mean', 'std', 'min', 'max'])
print(scale_info)

            G-        G+       GFP
mean  8.450228  8.543775  0.093547
std   0.192106  0.244138  0.273139
min   7.096070  7.572620 -1.519000
max   9.452100  9.758660  1.652550


In [None]:
# Save the data
unique_sequences_df.to_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/filtered_merged_data.csv', index = False)

# Create Data For Two Replicates

In [4]:
# Step 1: Filter out samples where 'G+_std' or 'G-_std' is greater than 0.1
filtered_data = data[(data['G+_std'] <= 0.1) & (data['G-_std'] <= 0.1)]
rep1_data = filtered_data[filtered_data['Rep'] == 0]
rep1_data['GFP'] = rep1_data['G+'] - rep1_data['G-']
rep2_data = filtered_data[filtered_data['Rep'] == 1]
rep2_data['GFP'] = rep2_data['G+'] - rep2_data['G-']

print(f'Raw data have {len(data)} samples')
print(f'Confident data have {len(filtered_data)} samples')
print(f'Rep1 data have {len(rep1_data)} samples')
print(f'Rep1 data have {len(rep2_data)} samples')

Raw data have 28800 samples
Confident data have 28292 samples
Rep1 data have 14175 samples
Rep1 data have 14117 samples


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rep1_data['GFP'] = rep1_data['G+'] - rep1_data['G-']
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rep2_data['GFP'] = rep2_data['G+'] - rep2_data['G-']


In [7]:
rep1_data.to_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/filtered_rep1_data.csv', index = False)
rep2_data.to_csv('/pmglocal/ty2514/Enhancer/Enhancer/data/filtered_rep2_data.csv', index = False)