In [None]:
#general packages
import pandas as pd
import numpy as np
from collections import Counter
import tifffile as tf
from skimage.measure import regionprops
#plotting packages
import matplotlib.pyplot as plt
import seaborn as sns
#custom function
from post_analysis import *
%config InlineBackend.figure_format='retina'

In [None]:
# Load the decoded output table for the across-channel run.
# The first CSV column is used as the row index.
mtx = pd.read_csv(
    "/groups/CaiLab/personal/Lex/raw/250113_mb_BSpeg_xtra_potentialTriton/pyfish_tools/output/genebycell/final_1.01.41.4_seed33_heg_svm_p20.0_diff0_fdr5.0/genebycell_1.csv",
    index_col=0,
)

In [None]:
# Show mtx as this cell's output (a bare last expression is rendered by Jupyter).
# NOTE(review): the file name suggests a gene-by-cell count table — confirm which
# axis holds genes before relying on the row-wise mean taken further down.
mtx

In [None]:
# Load the full barcode codebook; the first CSV column becomes the index.
codebook_all = pd.read_csv("/groups/CaiLab/personal/Lex/raw/250113_mb_BSpeg_xtra_potentialTriton/barcode_key/codebook_string_488.csv", index_col=0)

# Split once with a boolean mask so `codebook` never temporarily holds fake rows:
#   fakebook -> entries whose index label starts with "fake"
#   codebook -> every remaining entry
is_fake = codebook_all.index.str.startswith("fake")
fakebook = codebook_all[is_fake]
codebook = codebook_all[~is_fake]

In [None]:
# Score the decoded matrix against the real and fake codebooks.
# NOTE(review): percent_false_positive is provided by the post_analysis star import;
# `fp` is used below as a table with "FP raw", "FDR", "total_real" and
# "total_counts" columns, and `fake` is not used again in this cell.
fp, fake = percent_false_positive(mtx, codebook, fakebook)

# Reduce each column of interest to a single number.
percent_fp, norm_fpr = fp["FP raw"].mean(), fp["FDR"].mean()
mean_counts, sum_counts = fp["total_real"].mean(), fp["total_counts"].sum()

# Keep this order: the summary table built from fp_list labels its columns positionally.
fp_list = [percent_fp, norm_fpr, mean_counts, sum_counts]

In [None]:
# One-row summary table of the FDR statistics held in fp_list.
# The values are loaded as a single labelled column and then transposed, so every
# entry shares one dtype and the labels end up as column names.
# NOTE(review): the "false positive rate" column holds the mean of fp["FDR"] — confirm the label.
stat_names = ["percent fp","false positive rate","mean true counts", "total sum"]
df_stats = pd.DataFrame(fp_list, index=stat_names).T
df_stats

# Efficiency and correlations (if applicable)

In [None]:
# Load the bulk RNA-seq reference table and relabel its columns as Genes / TPM.
# NOTE(review): assumes the CSV has exactly two columns (gene name, TPM) — a different
# column count raises a length-mismatch error here.
rnaseq = pd.read_csv(
    "/groups/CaiLab/personal/Lex/raw/250113_mb_BSpeg_xtra_potentialTriton/mouse_brain_extra/kallisto_mousebrain.csv"
).set_axis(["Genes", "TPM"], axis=1)

In [None]:
# Imported explicitly so this cell does not depend on the post_analysis star import
# happening to expose pearsonr.
from scipy.stats import pearsonr

# Convert the count matrix to pseudobulk: mean of every mtx row across its columns,
# returned as a two-column frame (Genes, Counts).
# NOTE(review): assumes mtx rows are genes and columns are cells — confirm.
bulk = mtx.mean(axis=1).rename("Counts").rename_axis("Genes").reset_index()

# Lower-case gene names on both sides so the join is case-insensitive.
# rnaseq's Genes column is overwritten in place.
bulk["Genes"] = bulk["Genes"].str.lower()
rnaseq["Genes"] = rnaseq["Genes"].str.lower()

# Inner join on the gene name only; genes present in just one table are dropped.
comb_1 = pd.merge(rnaseq, bulk, on="Genes", how="inner")
if len(comb_1) < 2:
    raise ValueError(
        f"Only {len(comb_1)} gene(s) are shared between rnaseq and mtx; "
        "at least 2 are needed for a correlation. Check the gene naming in both tables."
    )

# Pearson's correlation on the untransformed TPM and Counts values, rounded to 2 decimals.
r = round(pearsonr(comb_1["TPM"], comb_1["Counts"])[0], 2)

In [None]:
# Add log10-transformed copies of Counts and TPM, using a 0.1 pseudocount so that
# zero values stay finite. New columns: "Log Counts", then "Log TPM".
for source_col in ("Counts", "TPM"):
    comb_1[f"Log {source_col}"] = np.log10(comb_1[source_col] + 0.1)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import gaussian_kde
from matplotlib import ticker
from matplotlib.font_manager import FontProperties

# Density-coloured scatter of bulk RNA-seq vs pseudobulk counts (both log10, pseudocount 0.1).
# Reads comb_1 ("Log TPM", "Log Counts") and r from earlier cells; writes RNAseq_vs_smfish.svg.

# The 2-D kernel density estimate is degenerate with too few points; fail with a clear message.
if len(comb_1) < 3:
    raise ValueError(
        f"Need at least 3 genes to estimate point density, got {len(comb_1)}."
    )

# Set the style
sns.set_style("white")

# Calculate point density: evaluate the KDE at every (Log TPM, Log Counts) point
xy = np.vstack([comb_1["Log TPM"], comb_1["Log Counts"]])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last (on top).
# argsort returns positions, so index positionally with .iloc rather than by label.
idx = z.argsort()
x = comb_1["Log TPM"].iloc[idx]
y = comb_1["Log Counts"].iloc[idx]
z = z[idx]

# Determine the data extent
x_min, x_max = x.min(), x.max()
y_min, y_max = y.min(), y.max()

# Pad both axes by 10% of the larger of the two ranges
padding = 0.1 * max(x_max - x_min, y_max - y_min)
x_lim = (x_min - padding, x_max + padding)
y_lim = (y_min - padding, y_max + padding)

# Create the scatter plot with density as color
plt.figure(figsize=(8, 6))
scatter = plt.scatter(x.values, y.values, c=z, s=50, edgecolor='k', alpha=0.7)

# Add color bar for density
cbar = plt.colorbar(scatter)
cbar.set_label('Density', fontweight='bold', fontsize=16)

# Bold font for the color bar tick labels
bold_font = FontProperties(weight='bold', size=12)
cbar.ax.yaxis.set_tick_params(labelsize=12)
for label in cbar.ax.get_yticklabels():
    label.set_fontproperties(bold_font)

# Axis labels: the plotted columns are log10-transformed, so label them as such
plt.xlabel("Bulk RNAseq Log10(TPM+0.1)", fontsize=16, fontweight='bold')
plt.ylabel("Pseudobulk Log10(Counts+0.1)", fontsize=16, fontweight='bold')

# Set axis ticks to bold
plt.xticks(fontsize=12, fontweight='bold')
plt.yticks(fontsize=12, fontweight='bold')

# Set axis limits with padding
plt.xlim(x_lim)
plt.ylim(y_lim)

# Dashed reference lines at x=0 and y=0
plt.axhline(0, color='black', linewidth=1.5, linestyle='--', alpha = 0.5)
plt.axvline(0, color='black', linewidth=1.5, linestyle='--', alpha = 0.5)

# Annotate in the top-left corner with bold font.
# Note: r was computed on the untransformed TPM/Counts values, not on the log values shown here.
plt.annotate(
    f"Pearson's r= {r}",
    xy=(x_lim[0], y_lim[1]),
    xytext=(5, -5),
    textcoords='offset points',
    fontsize=16,
    fontweight='bold',
    ha='left',
    va='top'
)

# Remove the top/right spines for a cleaner look
sns.despine()

# Save before show(), which may clear the current figure
plt.savefig("RNAseq_vs_smfish.svg", format="svg")

# Show the plot
plt.show()

In [None]:
# Load the two reference matrices that are compared in the correlation call below.
# NOTE(review): both paths are placeholders and must be replaced with real files.
mtx_den1 = pd.read_csv(
    "/path/to/den/mtx1",
    index_col=0,
)
mtx_den2 = pd.read_csv(
    "/path/to/den/mtx2",
    index_col=0,
)

In [None]:
# Compare the two reference matrices with the correlation helper (from the
# post_analysis star import). Axis labels and title are left blank, and the
# combined dataframe is not requested back.
correlation(
    mtx_den1,
    mtx_den2,
    label_x="",
    label_y="",
    title="",
    return_comb_df=False,
)