-
Notifications
You must be signed in to change notification settings - Fork 0
/
proc-clustag-01.py
41 lines (30 loc) · 930 Bytes
/
proc-clustag-01.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
"""
Post-process the output file (*.members.txt) created by Clustag software.
Get sizes of clusters.
Author: Gennady Khvorykh, info@inzilico.com
Created: June 18, 2024
"""
import sys
import functions
import pandas as pd
from matplotlib import pyplot as plt
# Get command line arguments. Exit with a usage hint rather than an
# IndexError when a positional argument is missing; extra arguments are
# still ignored, as before.
if len(sys.argv) < 3:
    sys.exit("Usage: " + sys.argv[0] + " <input_file> <output_prefix>")
input_file = sys.argv[1]
output_prefix = sys.argv[2]

# Check input
functions.check_files([input_file])

# Load input: tab-separated table that must contain a "Cluster" column
d1 = pd.read_csv(input_file, sep="\t")

# Get sizes of clusters: number of rows per distinct "Cluster" value,
# stored as a one-column data frame indexed by cluster
out = d1.groupby("Cluster").size().to_frame("size")
print("# of clusters:", out.shape[0])

# Save the table of cluster sizes
out.to_csv(output_prefix + ".tab", sep="\t", index_label="cluster")

# Save quantiles (the printed pandas Series with the 0.25, 0.5, 0.75 quantiles)
with open(output_prefix + ".q", "w") as f:
    print(out["size"].quantile(q=[0.25, 0.5, 0.75]), file=f)

# Plot the boxplot of cluster sizes
plt.boxplot(out["size"])
plt.savefig(output_prefix + ".sizes.png")