In [1]:
import time
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from matplotlib import cm
from collections import OrderedDict
import matplotlib as mpl
import matplotlib.colors as colors
import re

import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import seaborn as sns

pd.set_option('display.max_rows', 100)

bokeh.io.output_notebook

import scipy.stats as sps

In [2]:
start = time.time()

#------------------------------Using Raw Census PLine Output---------------------------------

def extract_PLines(census_file):
    """
        Input: Census filled file (extracting intensities from P-Line)
        Output: New dataframe containing data from census PLine
    """
    with open(census_file, 'r') as f:
        file1 = f.readlines()
    with open('dump1.txt', 'w') as g:    
        for line1 in file1:
            line2 = line1.split('\t')
            if line2[0] == 'PLINE':
                g.write(line1)
            elif line2[0] == 'P':
                g.write(line1)
    temp_df1 = pd.read_csv('dump1.txt', sep='\t', na_values='-').fillna(0)
    return temp_df1

def extract_Norm_Raw_PLine_Intensities(temp_df_NR1):
    """
        Input: dataframe derived from 'extract_PLines' 
        Output: new dataframe containing normalized intensities for each protein/sample
    """
    #Import Select Columns to new dataframe then add 1 to every intensity value (to ensure no 0 values)
    temp_df_NR2 = temp_df_NR1[['ACCESSION', 'DESCRIPTION', 'NORM_INTENSITY_1', 'NORM_INTENSITY_2', 'NORM_INTENSITY_3', 'NORM_INTENSITY_4', 'NORM_INTENSITY_5', 'NORM_INTENSITY_6', 'NORM_INTENSITY_7', 'NORM_INTENSITY_8', 'NORM_INTENSITY_9', 'NORM_INTENSITY_10', 'NORM_INTENSITY_11', 'NORM_INTENSITY_12', 'NORM_INTENSITY_13', 'NORM_INTENSITY_14', 'NORM_INTENSITY_15', 'NORM_INTENSITY_16', 'NORM_INTENSITY_17']].copy()
    temp_df_NR2[['NORM_INTENSITY_1', 'NORM_INTENSITY_2', 'NORM_INTENSITY_3', 'NORM_INTENSITY_4', 'NORM_INTENSITY_5', 'NORM_INTENSITY_6', 'NORM_INTENSITY_7', 'NORM_INTENSITY_8', 'NORM_INTENSITY_9', 'NORM_INTENSITY_10', 'NORM_INTENSITY_11', 'NORM_INTENSITY_12', 'NORM_INTENSITY_13', 'NORM_INTENSITY_14', 'NORM_INTENSITY_15', 'NORM_INTENSITY_16', 'NORM_INTENSITY_17']] += 1
    return temp_df_NR2

def extract_Raw_PLine_Intensities(temp_df_R1):
    """
        Input: dataframe derived from 'extract_PLines'
        Output: new dataframe containing non-normalized intensities for each protein/sample
    """
    #Import Select Columns to new dataframe then add 1 to every intensity value (to ensure no 0 values)
    temp_df_R2 = temp_df_R1[['ACCESSION', 'DESCRIPTION', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']].copy()
    temp_df_R2[['INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']] += 1
    return temp_df_R2

def normalize_Raw_PLine_Intensities(temp_df_R3):
    """
        Input: non-normalized intensities dataframe from 'extract_Raw_PLine_Intensities' function
        Output: return same dataframe with normalized intensity values; each value in a column is divided by its specific column sum
    """
    temp_allsums1 = temp_df_R3.sum(axis=0, skipna = True)
    temp_df_R3['INTENSITY_1'] /= temp_allsums1['INTENSITY_1']
    temp_df_R3['INTENSITY_2'] /= temp_allsums1['INTENSITY_2']
    temp_df_R3['INTENSITY_3'] /= temp_allsums1['INTENSITY_3']
    temp_df_R3['INTENSITY_4'] /= temp_allsums1['INTENSITY_4']
    temp_df_R3['INTENSITY_5'] /= temp_allsums1['INTENSITY_5']
    temp_df_R3['INTENSITY_6'] /= temp_allsums1['INTENSITY_6']
    temp_df_R3['INTENSITY_7'] /= temp_allsums1['INTENSITY_7']
    temp_df_R3['INTENSITY_8'] /= temp_allsums1['INTENSITY_8']
    temp_df_R3['INTENSITY_9'] /= temp_allsums1['INTENSITY_9']
    temp_df_R3['INTENSITY_10'] /= temp_allsums1['INTENSITY_10']
    temp_df_R3['INTENSITY_11'] /= temp_allsums1['INTENSITY_11']
    temp_df_R3['INTENSITY_12'] /= temp_allsums1['INTENSITY_12']
    temp_df_R3['INTENSITY_13'] /= temp_allsums1['INTENSITY_13']
    temp_df_R3['INTENSITY_14'] /= temp_allsums1['INTENSITY_14']
    temp_df_R3['INTENSITY_15'] /= temp_allsums1['INTENSITY_15']
    temp_df_R3['INTENSITY_16'] /= temp_allsums1['INTENSITY_16']
    temp_df_R3['INTENSITY_17'] /= temp_allsums1['INTENSITY_17']
    return temp_df_R3

#-------------------------------------------------------------------------------------------

#----------------------------Clustering Proteins - Census SLine-----------------------------


def extract_SLine_from_Census(census_file):
    """
        Input: Census 'filled' file
        Output: text dump and returns a peptide dataframe
    """
    with open(census_file, 'r') as f:
        file1 = f.readlines()
    with open('dump1.txt', 'w') as g:
        for line1 in file1:
            if (line1[0] == 'SLINE') or (line1[0] == 'S'):
                g.write(line1)
    df1 = pd.read_csv('dump1.txt', sep='\t', header=0).fillna('0')
    return df1

def select_Census_Columns_Peptides(temp_df1):
    """
        Input: raw census peptide (SLine) dataframe [Reliant on 'extract_SLine_from_Census']
        Output: new cleaned up dataframe containing peptides (no duplicates) and corresponding intensities for each run
    """
    #Note that columns below represent Peptide sequence and Intensity Columns -- CHANGE AS NECESSARY
    temp_df2 = temp_df1[['SEQUENCE','INTENSITY_1','INTENSITY_2','INTENSITY_3','INTENSITY_4','INTENSITY_5','INTENSITY_6','INTENSITY_7','INTENSITY_8','INTENSITY_9','INTENSITY_10','INTENSITY_11','INTENSITY_12','INTENSITY_13','INTENSITY_14','INTENSITY_15','INTENSITY_16','INTENSITY_17']].copy()
    
    #Remove rows containing duplicate information (i.e. intensities); Duplicate peptides with different intensities are kept
    temp_df3 = temp_df2.drop_duplicates(keep='first', inplace=False)
    
    #Reformatting Peptides to remove C- and N-terminal cleavage sites and diff-mod sites
    temp_col1 = temp_df3['SEQUENCE']
    temp_col2 = []
    for temp_position1,temp_item1 in enumerate(temp_col1):
        temp_col2.append(temp_item1[2:-2].replace('(15.994915)', ''))
    temp_df3 = temp_df3.reset_index()
    del temp_df3['index']
    temp_df4 = temp_df3.copy()
    temp_df4['PEPTIDE'] = pd.Series(temp_col2, index=temp_df4.index)
    
    #Create new cleaned up dataframe
    temp_df5 = temp_df4[['PEPTIDE', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']]
    temp_df6 = temp_df5.drop_duplicates(keep='first', inplace=False)
    temp_df6 = temp_df6.reset_index()
    del temp_df6['index']
    
    #Sort new cleaned up dataframe and sum intensities columns for identical peptides
    temp_df6.sort_values('PEPTIDE')
    temp_df7 = temp_df6.groupby(['PEPTIDE']).sum()
    return temp_df7

def extract_PLine_from_Census(census_file):
    """
        Input: census "filled" file
        Output: text dump and returns a dictionary connecting peptides (key) and protein list (value) redundant proteins included
    """
    #open census file
    with open(census_file, 'r') as f:
        file1 = f.readlines()
    #generate new file containing protein PLines
    with open('dump2.txt', 'w') as g:
        temp_pep_list1 = []
        for line1 in file1:
            line2 = line1.split('\t')
            if (line2[0] == 'P'):
                temp_pep_list2 = list(set(temp_pep_list1))
                for item1 in temp_pep_list2:
                    g.write('\t' + item1 + '\n')
                temp_pep_list1 = []
                g.write(line2[1] + '\t' + line2[2] + '\n')
            elif (line2[0] == 'S'):
                temp_pep_list1.append(line2[2][2:-2].replace('(15.994915)', ''))
        temp_pep_list2 = list(set(temp_pep_list1))
        for item1 in temp_pep_list2:
            g.write('\t' + item1 + '\n')
            temp_pep_list1 = []
    
    #open newly created protein PLine file
    with open('dump2.txt', 'r') as f:
        file2 = f.readlines()
    
    #create peptide(key)-protein(value) dictionary; proteins are assembled in list
    Pep_to_Prot_Dict1 = {}
    temp_prot1 = ""
    for line1 in file2:
        line2 = line1.split('\t')
        if line2[0] != '':
            temp_prot1 = line2[0]
        else:
            if line2[1].replace('\n', '') in Pep_to_Prot_Dict1.keys():
                Pep_to_Prot_Dict1[line2[1].replace('\n', '')].append(temp_prot1)
            else:
                Pep_to_Prot_Dict1[line2[1].replace('\n', '')] = []
                Pep_to_Prot_Dict1[line2[1].replace('\n', '')].append(temp_prot1)
    return Pep_to_Prot_Dict1

def map_Clusters_to_Peptides(cluster_file):
    """
        Input: CDHIT cluster file
        Output: dictionary connecting Protein (keys) to Cluster number (values)
    """
    #create dictionary connecting protein (keys) and cluster number (values)
    with open(cluster_file, 'r') as f:
        file1 = f.readlines() 
    temp_cluster_num = ""
    Protein_to_Cluster_Dict1 = {}
    for line1 in file1:
        line2 = line1.split('\t')        
        if '>' in line2[0]:
            temp_cluster_num = line2[0].replace('>', '').replace('\n', '').replace('Cluster ', '')
        else:
            line3 = line2[1].split(' ')
            line4 = line3[1].split('|')
            line5 = line4[1]
            Protein_to_Cluster_Dict1[line5] = temp_cluster_num
    return Protein_to_Cluster_Dict1

def map_Clusternum_to_Peptides(prot_to_clust_dict2, pep_to_prot_dict2):
    """
        Input: 2 dictionaries: protein-cluster and peptide-protein
        Output: dictionary linking peptides(key) to cluster number (value)
    """
    pep_to_cluster_dict1 = {}
    for item1 in pep_to_prot_dict2.keys():
        temp_protlist1 = pep_to_prot_dict2[item1]
        temp_cluster_list1 = []
        for item2 in temp_protlist1:
            try:
                temp_cluster_list1.append(prot_to_clust_dict2[item2])
            except:
                pass
        pep_to_cluster_dict1[item1] = list(set(temp_cluster_list1))
    return pep_to_cluster_dict1

def clusternum_to_PepDataframe(pep_to_clust_dict, uniquepep_dataframe):
    """
        Input: Peptide-to-Cluster dictionary generated from 'map_Clusternum_to_Peptides' function and cleaned-up Peptide-Intensity dataframe from 'select_Census_Columns_Peptides' function
        Output: New peptide-intensity dataframe with integrated cluster number column; Note that peptides with no cluster belong to contaminant or reverse proteins
    """
    temp_df1 = uniquepep_dataframe.reset_index()
    temp_list1 = list(temp_df1['PEPTIDE'])
    temp_clusterlist1 = []
    for item1 in temp_list1:
        temp_clusterlist1.append(pep_to_clust_dict[item1])
    temp_df1['CLUSTER'] = pd.Series(temp_clusterlist1, index=temp_df1.index)
    temp_df2 = temp_df1[['PEPTIDE', 'CLUSTER', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']]
    return temp_df2

def extract_Intensity_Master(census_file1, cluster_file1):
    """
        Input: census 'filled' file and CDHIT cluster file
        Output: Dataframe with Peptide, Cluster, and Intensity Columns
    """
    temp_df1A = extract_SLine_from_Census(census_file1)
    temp_df2A = select_Census_Columns_Peptides(temp_df1A)
    temp_dict1A = extract_PLine_from_Census(census_file1)
    temp_dict2A = map_Clusters_to_Peptides(cluster_file1)
    temp_dict3A = map_Clusternum_to_Peptides(temp_dict2A, temp_dict1A)
    temp_df3A = clusternum_to_PepDataframe(temp_dict3A, temp_df2A)
    return temp_df3A

def create_Cluster_Intensity_Table(complete_dataframe):
    """
        Input: complete master dataframe from 'extract_Intensity_Master'
        Output: new dataframe with intensities summed by cluster
    """
    #Collect only peptides that map to 1 cluster
    temp_df1A = complete_dataframe.loc[complete_dataframe['CLUSTER'].str.len() == 1]
    #Create new column (ClusterID) then sum intensities belonging to the same cluster
    temp_list1 = temp_df1A['CLUSTER']
    temp_list2 = []
    for item1 in temp_list1:
        temp_list2.append(item1[0])
    temp_df1A['ClusterID'] = pd.Series(temp_list2, index=temp_df1A.index)
    temp_df1A = temp_df1A[['PEPTIDE', 'CLUSTER', 'ClusterID', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']]
    cluster_df1 = temp_df1A.groupby(['ClusterID']).sum().copy()
    #Add 1 to each value to ensure no 0 values (necessary for Log transform and subsequent hypothesis testing)
    cluster_df1[['INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']] += 0
    return cluster_df1

def normalize_Cluster_Intensity_Table(complete_dataframe_2):
    """
        Input: Cluster-Intensity dataframe
        Output: New Cluster-Intensity dataframe with each entry normalized by dividing by respective column sum (note that intensities from peptides mapping to multiple clusters tossed)
    """
    temp_allsums1 = complete_dataframe_2.sum(axis=0, skipna = True)
    complete_dataframe_2['INTENSITY_1'] /= temp_allsums1['INTENSITY_1']
    complete_dataframe_2['INTENSITY_2'] /= temp_allsums1['INTENSITY_2']
    complete_dataframe_2['INTENSITY_3'] /= temp_allsums1['INTENSITY_3']
    complete_dataframe_2['INTENSITY_4'] /= temp_allsums1['INTENSITY_4']
    complete_dataframe_2['INTENSITY_5'] /= temp_allsums1['INTENSITY_5']
    complete_dataframe_2['INTENSITY_6'] /= temp_allsums1['INTENSITY_6']
    complete_dataframe_2['INTENSITY_7'] /= temp_allsums1['INTENSITY_7']
    complete_dataframe_2['INTENSITY_8'] /= temp_allsums1['INTENSITY_8']
    complete_dataframe_2['INTENSITY_9'] /= temp_allsums1['INTENSITY_9']
    complete_dataframe_2['INTENSITY_10'] /= temp_allsums1['INTENSITY_10']
    complete_dataframe_2['INTENSITY_11'] /= temp_allsums1['INTENSITY_11']
    complete_dataframe_2['INTENSITY_12'] /= temp_allsums1['INTENSITY_12']
    complete_dataframe_2['INTENSITY_13'] /= temp_allsums1['INTENSITY_13']
    complete_dataframe_2['INTENSITY_14'] /= temp_allsums1['INTENSITY_14']
    complete_dataframe_2['INTENSITY_15'] /= temp_allsums1['INTENSITY_15']
    complete_dataframe_2['INTENSITY_16'] /= temp_allsums1['INTENSITY_16']
    complete_dataframe_2['INTENSITY_17'] /= temp_allsums1['INTENSITY_17']
    return complete_dataframe_2


#-------------------------------------------------------------------------------------------

#----------------------------Formatting and Annotating R-output-----------------------------

def annotate_R_abridged(cdhit_file, abridged_R_file):
    """
        Input: CDHIT cluster file and abridged R outputfile containing p-adj values
        Output: newly annotated dataframe
    """
    with open(cdhit_file, 'r') as f:
        file1 = f.readlines()
    cluster_representative_dict1 = {}
    for line1 in file1:
        line2 = line1.split('\t')
        if line1[0] == '>':
            temp_clusternum = line2[0].replace('>Cluster ', '').replace('\n', '')
        elif '... *' in line2[1]:
            line3 = line2[1].split(' ')
            line4 = line3[1].split('|')
            line5 = line4[1]
            cluster_representative_dict1[temp_clusternum] = line5
            temp_clusternum = ""
    with open(abridged_R_file, 'r') as g:
        file2 = g.readlines()
    cluster_rep_list = []
    for line1 in file2:
        line2 = line1.split(',')
        if line2[1] in cluster_representative_dict1.keys():
            cluster_rep_list.append(cluster_representative_dict1[line2[1]])
    temp_dfC1 = pd.read_csv(abridged_R_file) 
    temp_dfC1['cluster_representative'] = pd.Series(cluster_rep_list, index=temp_dfC1.index)
    return temp_dfC1

#-------------------------------------------------------------------------------------------

end = time.time()
print(end-start)

0.002744913101196289


In [15]:
start = time.time()

#---------------Prepare Tables for R-analysis of Census PLine Data-------------------

#convert census_filled into file with only PLines
censusP_df1 = extract_PLines('Census/census-chat-mouse-20318_filled_modded.txt')

#extract specific columns from converted census file
Norm_PLine_df1 = extract_Norm_Raw_PLine_Intensities(censusP_df1)
Non_Norm_PLine_df1 = extract_Raw_PLine_Intensities(censusP_df1)

#normalize columns in Non_Norm extracted file
Non_Norm_PLine_df1 = normalize_Raw_PLine_Intensities(Non_Norm_PLine_df1)

#output files to csv ready for R-analysis
Norm_PLine_df1.to_csv('PLine-ChatMouse-NormIntensityTable1.txt', index=False)
Non_Norm_PLine_df1.to_csv('PLine-ChatMouse-NonNormIntensityTable1.txt', index=False)

#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

1.121624231338501


In [3]:
start = time.time()

#--------Prepare Tables for R-analysis of Census SLine Data (Clustering)-------------

temp_chatmouse_65 = extract_Intensity_Master('Census/census-chat-mouse-20318_filled_modded.txt', 'CDHIT/cdhitout-mouse-65.clstr')
chatmouse_65 = create_Cluster_Intensity_Table(temp_chatmouse_65)
#chatmouse_65 = normalize_Cluster_Intensity_Table(chatmouse_65)

temp_chatmouse_75 = extract_Intensity_Master('Census/census-chat-mouse-20318_filled_modded.txt', 'CDHIT/cdhitout-mouse-75.clstr')
chatmouse_75 = create_Cluster_Intensity_Table(temp_chatmouse_75)
#chatmouse_75 = normalize_Cluster_Intensity_Table(chatmouse_75)

temp_chatmouse_85 = extract_Intensity_Master('Census/census-chat-mouse-20318_filled_modded.txt', 'CDHIT/cdhitout-mouse-85.clstr')
chatmouse_85 = create_Cluster_Intensity_Table(temp_chatmouse_85)
#chatmouse_85 = normalize_Cluster_Intensity_Table(chatmouse_85)

temp_chatmouse_95 = extract_Intensity_Master('Census/census-chat-mouse-20318_filled_modded.txt', 'CDHIT/cdhitout-mouse-95.clstr')
chatmouse_95 = create_Cluster_Intensity_Table(temp_chatmouse_95)
#chatmouse_95 = normalize_Cluster_Intensity_Table(chatmouse_95)

#Output to txt/csv file
chatmouse_65.to_csv('SLine-ChatMouse-NormIntensityTable-65.txt')
chatmouse_75.to_csv('SLine-ChatMouse-NormIntensityTable-75.txt')
chatmouse_85.to_csv('SLine-ChatMouse-NormIntensityTable-85.txt')
chatmouse_95.to_csv('SLine-ChatMouse-NormIntensityTable-95.txt')

#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

  exec(code_obj, self.user_global_ns, self.user_ns)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


738.151624917984


In [20]:
chatmouse_95[chatmouse_95.eq(0).any(1)]

Unnamed: 0_level_0,INTENSITY_1,INTENSITY_2,INTENSITY_3,INTENSITY_4,INTENSITY_5,INTENSITY_6,INTENSITY_7,INTENSITY_8,INTENSITY_9,INTENSITY_10,INTENSITY_11,INTENSITY_12,INTENSITY_13,INTENSITY_14,INTENSITY_15,INTENSITY_16,INTENSITY_17
ClusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1


In [5]:
start = time.time()

#---------------Add Protein Labels to R-analysis of Census SLine Data----------------

chatmouse_65_log_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-65.clstr', 'Census_analysis/SLine-ChatMouse-Census-Log2Transform-65.csv')
chatmouse_65_log_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_65_log_R1.to_csv('SLine-ChatMouse-Census-Log2Transform-65-annotated.txt', index=False)

chatmouse_75_log_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-75.clstr', 'Census_analysis/SLine-ChatMouse-Census-Log2Transform-75.csv')
chatmouse_75_log_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_75_log_R1.to_csv('SLine-ChatMouse-Census-Log2Transform-75-annotated.txt', index=False)

chatmouse_85_log_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-85.clstr', 'Census_analysis/SLine-ChatMouse-Census-Log2Transform-85.csv')
chatmouse_85_log_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_85_log_R1.to_csv('SLine-ChatMouse-Census-Log2Transform-85-annotated.txt', index=False)

chatmouse_95_log_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-95.clstr', 'Census_analysis/SLine-ChatMouse-Census-Log2Transform-95.csv')
chatmouse_95_log_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_95_log_R1.to_csv('SLine-ChatMouse-Census-Log2Transform-95-annotated.txt', index=False)


#----------

chatmouse_65_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-65.clstr', 'Census_analysis/SLine-ChatMouse-Census-cuberootTransform-65.csv')
chatmouse_65_3r_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_65_3r_R1.to_csv('SLine-ChatMouse-Census-cuberootTransform-65-annotated.txt', index=False)

chatmouse_75_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-75.clstr', 'Census_analysis/SLine-ChatMouse-Census-cuberootTransform-75.csv')
chatmouse_75_3r_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_75_3r_R1.to_csv('SLine-ChatMouse-Census-cuberootTransform-75-annotated.txt', index=False)

chatmouse_85_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-85.clstr', 'Census_analysis/SLine-ChatMouse-Census-cuberootTransform-85.csv')
chatmouse_85_3r_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_85_3r_R1.to_csv('SLine-ChatMouse-Census-cuberootTransform-85-annotated.txt', index=False)

chatmouse_95_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-mouse-95.clstr', 'Census_analysis/SLine-ChatMouse-Census-cuberootTransform-95.csv')
chatmouse_95_3r_R1.drop(columns='Unnamed: 0', inplace=True)
chatmouse_95_3r_R1.to_csv('SLine-ChatMouse-Census-cuberootTransform-95-annotated.txt', index=False)


#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

1.1139299869537354


In [56]:
chatmouse_65_t_R1

Unnamed: 0,ClusterID,INTENSITY_1,INTENSITY_2,INTENSITY_3,INTENSITY_4,INTENSITY_5,INTENSITY_6,INTENSITY_7,INTENSITY_8,INTENSITY_9,...,INTENSITY_14,INTENSITY_15,INTENSITY_16,INTENSITY_17,p_value,control_mean,treatment_mean,foldchange,padj,cluster_representative
1,64,-15.684713,-14.462591,-14.551343,-14.294296,-12.775558,-14.155657,-14.021253,-13.992334,-13.423281,...,-17.551288,-16.745220,-18.531755,-15.236075,0.000016,-14.151225,-16.913045,1.195165,0.010084,E9Q2E4
2,14147,-9.629501,-8.860266,-9.234132,-8.795709,-10.685092,-8.987012,-9.585652,-8.350959,-9.401814,...,-11.104943,-10.942521,-10.898712,-10.325736,0.000078,-9.281126,-10.681887,1.150926,0.016900,O08807
3,14346,-10.258886,-10.580672,-10.171253,-10.064396,-11.048504,-10.340718,-10.512314,-10.276662,-10.811294,...,-8.833867,-8.945664,-9.264812,-8.508317,0.000077,-10.451633,-9.291170,0.888968,0.016900,E9PZF0
4,149,-14.439948,-12.792543,-11.844518,-12.065448,-11.557351,-12.447301,-11.718392,-13.183770,-11.844561,...,-15.324057,-14.217472,-15.093547,-14.840587,0.000056,-12.432648,-14.532275,1.168880,0.016900,Q8CHI8
5,6533,-13.363339,-13.945403,-13.468996,-13.763317,-13.950951,-13.287516,-12.278385,-12.552375,-14.337052,...,-9.672006,-10.872345,-10.271885,-12.453936,0.000080,-13.438593,-10.913286,0.812085,0.016900,P63038
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1259,7102,-14.707948,-15.411485,-15.576592,-14.262365,-16.727032,-15.974162,-15.526899,-15.510765,-16.724519,...,-15.592431,-16.149231,-14.750926,-14.932707,0.991892,-15.602419,-15.709525,1.006865,0.995041,Q6NXH9
1260,14915,-14.058981,-13.735708,-14.401319,-14.697381,-14.580487,-14.617650,-14.350400,-13.592329,-14.233686,...,-12.352471,-12.897237,-12.813253,-14.636399,0.993152,-14.251994,-14.586703,1.023485,0.995515,Q9CRD0
1261,15652,-16.287852,-16.618478,-17.152414,-17.925100,-16.822791,-16.483510,-17.782845,-18.315386,-16.786650,...,-17.210977,-16.978411,-14.687111,-17.306739,0.995969,-17.130558,-17.191615,1.003564,0.996336,Q9D142
1262,17573,-8.385133,-7.537781,-8.236383,-8.240587,-8.975537,-8.471668,-7.799192,-8.447231,-7.292174,...,-8.156601,-7.807598,-7.935925,-8.666720,0.996336,-8.153965,-8.144178,0.998800,0.996336,P01592
