In [1]:
import time
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from matplotlib import cm
from collections import OrderedDict
import matplotlib as mpl
import matplotlib.colors as colors
import re

import bokeh.io
import bokeh.models
import bokeh.palettes
import bokeh.plotting
import seaborn as sns

pd.set_option('display.max_rows', 100)

bokeh.io.output_notebook

import scipy.stats as sps

In [2]:
start = time.time()

#------------------------------Using Raw Census PLine Output---------------------------------

def extract_PLines(census_file):
    """
        Input: Census filled file (extracting intensities from P-Line)
        Output: New dataframe containing data from census PLine
    """
    with open(census_file, 'r') as f:
        file1 = f.readlines()
    with open('dump1.txt', 'w') as g:    
        for line1 in file1:
            line2 = line1.split('\t')
            if line2[0] == 'PLINE':
                g.write(line1)
            elif line2[0] == 'P':
                g.write(line1)
    temp_df1 = pd.read_csv('dump1.txt', sep='\t', na_values='-').fillna(0)
    return temp_df1

def extract_Norm_Raw_PLine_Intensities(temp_df_NR1):
    """
        Input: dataframe derived from 'extract_PLines' 
        Output: new dataframe containing normalized intensities for each protein/sample
    """
    #Import Select Columns to new dataframe then add 1 to every intensity value (to ensure no 0 values)
    temp_df_NR2 = temp_df_NR1[['ACCESSION', 'DESCRIPTION', 'NORM_INTENSITY_1', 'NORM_INTENSITY_2', 'NORM_INTENSITY_3', 'NORM_INTENSITY_4', 'NORM_INTENSITY_5', 'NORM_INTENSITY_6', 'NORM_INTENSITY_7', 'NORM_INTENSITY_8', 'NORM_INTENSITY_9', 'NORM_INTENSITY_10', 'NORM_INTENSITY_11', 'NORM_INTENSITY_12', 'NORM_INTENSITY_13', 'NORM_INTENSITY_14', 'NORM_INTENSITY_15', 'NORM_INTENSITY_16', 'NORM_INTENSITY_17']].copy()
    temp_df_NR2[['NORM_INTENSITY_1', 'NORM_INTENSITY_2', 'NORM_INTENSITY_3', 'NORM_INTENSITY_4', 'NORM_INTENSITY_5', 'NORM_INTENSITY_6', 'NORM_INTENSITY_7', 'NORM_INTENSITY_8', 'NORM_INTENSITY_9', 'NORM_INTENSITY_10', 'NORM_INTENSITY_11', 'NORM_INTENSITY_12', 'NORM_INTENSITY_13', 'NORM_INTENSITY_14', 'NORM_INTENSITY_15', 'NORM_INTENSITY_16', 'NORM_INTENSITY_17']] += 1
    return temp_df_NR2

def extract_Raw_PLine_Intensities(temp_df_R1):
    """
        Input: dataframe derived from 'extract_PLines'
        Output: new dataframe containing non-normalized intensities for each protein/sample
    """
    #Import Select Columns to new dataframe then add 1 to every intensity value (to ensure no 0 values)
    temp_df_R2 = temp_df_R1[['ACCESSION', 'DESCRIPTION', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']].copy()
    temp_df_R2[['INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14', 'INTENSITY_15', 'INTENSITY_16', 'INTENSITY_17']] += 1
    return temp_df_R2

def normalize_Raw_PLine_Intensities(temp_df_R3):
    """
        Input: non-normalized intensities dataframe from 'extract_Raw_PLine_Intensities' function
        Output: return same dataframe with normalized intensity values; each value in a column is divided by its specific column sum
    """
    temp_allsums1 = temp_df_R3.sum(axis=0, skipna = True)
    temp_df_R3['INTENSITY_1'] /= temp_allsums1['INTENSITY_1']
    temp_df_R3['INTENSITY_2'] /= temp_allsums1['INTENSITY_2']
    temp_df_R3['INTENSITY_3'] /= temp_allsums1['INTENSITY_3']
    temp_df_R3['INTENSITY_4'] /= temp_allsums1['INTENSITY_4']
    temp_df_R3['INTENSITY_5'] /= temp_allsums1['INTENSITY_5']
    temp_df_R3['INTENSITY_6'] /= temp_allsums1['INTENSITY_6']
    temp_df_R3['INTENSITY_7'] /= temp_allsums1['INTENSITY_7']
    temp_df_R3['INTENSITY_8'] /= temp_allsums1['INTENSITY_8']
    temp_df_R3['INTENSITY_9'] /= temp_allsums1['INTENSITY_9']
    temp_df_R3['INTENSITY_10'] /= temp_allsums1['INTENSITY_10']
    temp_df_R3['INTENSITY_11'] /= temp_allsums1['INTENSITY_11']
    temp_df_R3['INTENSITY_12'] /= temp_allsums1['INTENSITY_12']
    temp_df_R3['INTENSITY_13'] /= temp_allsums1['INTENSITY_13']
    temp_df_R3['INTENSITY_14'] /= temp_allsums1['INTENSITY_14']
    temp_df_R3['INTENSITY_15'] /= temp_allsums1['INTENSITY_15']
    temp_df_R3['INTENSITY_16'] /= temp_allsums1['INTENSITY_16']
    temp_df_R3['INTENSITY_17'] /= temp_allsums1['INTENSITY_17']
    return temp_df_R3

#-------------------------------------------------------------------------------------------

#----------------------------Clustering Proteins - Census SLine-----------------------------


def extract_SLine_from_Census(census_file):
    """
        Input: Census 'filled' file
        Output: text dump and returns a peptide dataframe
    """
    with open(census_file, 'r') as f:
        file1 = f.readlines()
    with open('dump1.txt', 'w') as g:
        for line1 in file1:
            if (line1[0] == 'SLINE') or (line1[0] == 'S'):
                g.write(line1)
    df1 = pd.read_csv('dump1.txt', sep='\t', header=0).fillna('0')
    return df1

def select_Census_Columns_Peptides(temp_df1):
    """
        Input: raw census peptide (SLine) dataframe [Reliant on 'extract_SLine_from_Census']
        Output: new cleaned up dataframe containing peptides (no duplicates) and corresponding intensities for each run
    """
    #Note that columns below represent Peptide sequence and Intensity Columns -- CHANGE AS NECESSARY
    temp_df2 = temp_df1[['SEQUENCE_1','SEQUENCE_2','SEQUENCE_3','SEQUENCE_4','SEQUENCE_5','SEQUENCE_6','SEQUENCE_7','SEQUENCE_8','SEQUENCE_9','SEQUENCE_10','SEQUENCE_11','SEQUENCE_12','SEQUENCE_13','SEQUENCE_14','INTENSITY_1','INTENSITY_2','INTENSITY_3','INTENSITY_4','INTENSITY_5','INTENSITY_6','INTENSITY_7','INTENSITY_8','INTENSITY_9','INTENSITY_10','INTENSITY_11','INTENSITY_12','INTENSITY_13','INTENSITY_14']].copy()
    
    #Find sequence from dataframe SEQUENCE_# values
    temp_SEQUENCE = []
    for row1 in temp_df2.itertuples():
        temp_list1 = []
        temp_list1.append(row1.SEQUENCE_1)
        temp_list1.append(row1.SEQUENCE_2)
        temp_list1.append(row1.SEQUENCE_3)
        temp_list1.append(row1.SEQUENCE_4)
        temp_list1.append(row1.SEQUENCE_5)
        temp_list1.append(row1.SEQUENCE_6)
        temp_list1.append(row1.SEQUENCE_7)
        temp_list1.append(row1.SEQUENCE_8)
        temp_list1.append(row1.SEQUENCE_9)
        temp_list1.append(row1.SEQUENCE_10)
        temp_list1.append(row1.SEQUENCE_11)
        temp_list1.append(row1.SEQUENCE_12)
        temp_list1.append(row1.SEQUENCE_13)
        temp_list1.append(row1.SEQUENCE_14)
        item2 = 0
        for item1 in temp_list1:
            if item1 != '0':
                item2 = item1
                break
        temp_SEQUENCE.append(item2)
    temp_df2['SEQUENCE'] = pd.Series(temp_SEQUENCE, index=temp_df2.index)
    temp_df2 = temp_df2[['SEQUENCE','INTENSITY_1','INTENSITY_2','INTENSITY_3','INTENSITY_4','INTENSITY_5','INTENSITY_6','INTENSITY_7','INTENSITY_8','INTENSITY_9','INTENSITY_10','INTENSITY_11','INTENSITY_12','INTENSITY_13','INTENSITY_14']].copy()
    temp_df2['INTENSITY_1'] = pd.to_numeric(temp_df2['INTENSITY_1'], errors='coerce')
    temp_df2['INTENSITY_2'] = pd.to_numeric(temp_df2['INTENSITY_2'], errors='coerce')
    temp_df2['INTENSITY_3'] = pd.to_numeric(temp_df2['INTENSITY_3'], errors='coerce')
    temp_df2['INTENSITY_4'] = pd.to_numeric(temp_df2['INTENSITY_4'], errors='coerce')
    temp_df2['INTENSITY_5'] = pd.to_numeric(temp_df2['INTENSITY_5'], errors='coerce')
    temp_df2['INTENSITY_6'] = pd.to_numeric(temp_df2['INTENSITY_6'], errors='coerce')
    temp_df2['INTENSITY_7'] = pd.to_numeric(temp_df2['INTENSITY_7'], errors='coerce')
    temp_df2['INTENSITY_8'] = pd.to_numeric(temp_df2['INTENSITY_8'], errors='coerce')
    temp_df2['INTENSITY_9'] = pd.to_numeric(temp_df2['INTENSITY_9'], errors='coerce')
    temp_df2['INTENSITY_10'] = pd.to_numeric(temp_df2['INTENSITY_10'], errors='coerce')
    temp_df2['INTENSITY_11'] = pd.to_numeric(temp_df2['INTENSITY_11'], errors='coerce')
    temp_df2['INTENSITY_12'] = pd.to_numeric(temp_df2['INTENSITY_12'], errors='coerce')
    temp_df2['INTENSITY_13'] = pd.to_numeric(temp_df2['INTENSITY_13'], errors='coerce')
    temp_df2['INTENSITY_14'] = pd.to_numeric(temp_df2['INTENSITY_14'], errors='coerce')
    temp_df2 = temp_df2.fillna(0)
    
    #Remove rows containing duplicate information (i.e. intensities); Duplicate peptides with different intensities are kept
    temp_df3 = temp_df2.drop_duplicates(keep='first', inplace=False)
    
    #Reformatting Peptides to remove C- and N-terminal cleavage sites and diff-mod sites
    temp_col1 = temp_df3['SEQUENCE']
    temp_col2 = []
    for temp_position1,temp_item1 in enumerate(temp_col1):
        temp_col2.append(temp_item1[2:-2].replace('(15.994915)', ''))
    temp_df3 = temp_df3.reset_index()
    del temp_df3['index']
    temp_df4 = temp_df3.copy()
    temp_df4['PEPTIDE'] = pd.Series(temp_col2, index=temp_df4.index)
    
    #Create new cleaned up dataframe
    temp_df5 = temp_df4[['PEPTIDE', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14']]
    temp_df6 = temp_df5.drop_duplicates(keep='first', inplace=False)
    temp_df6 = temp_df6.reset_index()
    del temp_df6['index']
    
    #Sort new cleaned up dataframe and sum intensities columns for identical peptides
    temp_df6.sort_values('PEPTIDE')
    temp_df7 = temp_df6.groupby(['PEPTIDE']).sum()
    return temp_df7

def extract_PLine_from_Census(census_file):
    """
        Input: census "filled" file
        Output: text dump and returns a dictionary connecting peptides (key) and protein list (value) redundant proteins included
    """
    #open census file
    with open(census_file, 'r') as f:
        file1 = f.readlines()
    #generate new file containing protein PLines
    with open('dump2.txt', 'w') as g:
        temp_pep_list1 = []
        for line1 in file1:
            line2 = line1.split('\t')
            if (line2[0] == 'P'):
                temp_pep_list2 = list(set(temp_pep_list1))
                for item1 in temp_pep_list2:
                    g.write('\t' + item1 + '\n')
                temp_pep_list1 = []
                g.write(line2[1] + '\t' + line2[2] + '\n')
            elif (line2[0] == 'S'):
                item02 = ''
                for item01 in line2:
                    if '-.' in item01:
                        item02 = item01
                        break
                temp_pep_list1.append(item02[2:-2].replace('(15.994915)', ''))
        temp_pep_list2 = list(set(temp_pep_list1))
        for item1 in temp_pep_list2:
            g.write('\t' + item1 + '\n')
            temp_pep_list1 = []
    
    #open newly created protein PLine file
    with open('dump2.txt', 'r') as f:
        file2 = f.readlines()
    
    #create peptide(key)-protein(value) dictionary; proteins are assembled in list
    Pep_to_Prot_Dict1 = {}
    temp_prot1 = ""
    for line1 in file2:
        line2 = line1.split('\t')
        if line2[0] != '':
            temp_prot1 = line2[0]
        else:
            if line2[1].replace('\n', '') in Pep_to_Prot_Dict1.keys():
                Pep_to_Prot_Dict1[line2[1].replace('\n', '')].append(temp_prot1)
            else:
                Pep_to_Prot_Dict1[line2[1].replace('\n', '')] = []
                Pep_to_Prot_Dict1[line2[1].replace('\n', '')].append(temp_prot1)
    return Pep_to_Prot_Dict1

def map_Clusters_to_Proteins(cluster_file):
    """
        Input: CDHIT cluster file
        Output: dictionary connecting Protein (keys) to Cluster number (values)
    """
    #create dictionary connecting protein (keys) and cluster number (values)
    with open(cluster_file, 'r') as f:
        file1 = f.readlines() 
    temp_cluster_num = ""
    Protein_to_Cluster_Dict1 = {}
    for line1 in file1:
        line2 = line1.split('\t')        
        if '>' in line2[0]:
            temp_cluster_num = line2[0].replace('>', '').replace('\n', '').replace('Cluster ', '')
        else:
            line3 = line2[1].split(' ')
            line4 = line3[1].replace('>', '').replace('...', '')
            if '|' in line4:
                line4b = line4.split('|')
                line4 = line4b[1]
            if line4[0:7] != 'Reverse':
                Protein_to_Cluster_Dict1[line4] = temp_cluster_num
    return Protein_to_Cluster_Dict1

def map_Clusternum_to_Peptides(prot_to_clust_dict2, pep_to_prot_dict2):
    """
        Input: 2 dictionaries: protein-cluster and peptide-protein
        Output: dictionary linking peptides(key) to cluster number (value)
    """
    pep_to_cluster_dict1 = {}
    for item1 in pep_to_prot_dict2.keys():
        temp_protlist1 = pep_to_prot_dict2[item1]
        temp_cluster_list1 = []
        for item2 in temp_protlist1:
            try:
                temp_cluster_list1.append(prot_to_clust_dict2[item2])
            except:
                pass
        pep_to_cluster_dict1[item1] = list(set(temp_cluster_list1))
    return pep_to_cluster_dict1

def clusternum_to_PepDataframe(pep_to_clust_dict, uniquepep_dataframe):
    """
        Input: Peptide-to-Cluster dictionary generated from 'map_Clusternum_to_Peptides' function and cleaned-up Peptide-Intensity dataframe from 'select_Census_Columns_Peptides' function
        Output: New peptide-intensity dataframe with integrated cluster number column; Note that peptides with no cluster belong to contaminant or reverse proteins
    """
    temp_df1 = uniquepep_dataframe.reset_index()
    temp_list1 = list(temp_df1['PEPTIDE'])
    temp_clusterlist1 = []
    for item1 in temp_list1:
        temp_clusterlist1.append(pep_to_clust_dict[item1])
    temp_df1['CLUSTER'] = pd.Series(temp_clusterlist1, index=temp_df1.index)
    temp_df2 = temp_df1[['PEPTIDE', 'CLUSTER', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14']]
    return temp_df2

def extract_Intensity_Master(census_file1, cluster_file1):
    """
        Input: census 'filled' file and CDHIT cluster file
        Output: Dataframe with Peptide, Cluster, and Intensity Columns
    """
    temp_df1A = extract_SLine_from_Census(census_file1)
    temp_df2A = select_Census_Columns_Peptides(temp_df1A)
    temp_dict1A = extract_PLine_from_Census(census_file1)
    temp_dict2A = map_Clusters_to_Peptides(cluster_file1)
    temp_dict3A = map_Clusternum_to_Peptides(temp_dict2A, temp_dict1A)
    temp_df3A = clusternum_to_PepDataframe(temp_dict3A, temp_df2A)
    return temp_df3A

def create_Cluster_Intensity_Table(complete_dataframe):
    """
        Input: complete master dataframe from 'extract_Intensity_Master'
        Output: new dataframe with intensities summed by cluster
    """
    #Collect only peptides that map to 1 cluster
    temp_df1A = complete_dataframe.loc[complete_dataframe['CLUSTER'].str.len() == 1]
    #Create new column (ClusterID) then sum intensities belonging to the same cluster
    temp_list1 = temp_df1A['CLUSTER']
    temp_list2 = []
    for item1 in temp_list1:
        temp_list2.append(item1[0])
    temp_df1A['ClusterID'] = pd.Series(temp_list2, index=temp_df1A.index)
    temp_df1A = temp_df1A[['PEPTIDE', 'CLUSTER', 'ClusterID', 'INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14']]
    cluster_df1 = temp_df1A.groupby(['ClusterID']).sum().copy()    
    #Add 1 to each value to ensure no 0 values (necessary for Log transform and subsequent hypothesis testing)
    cluster_df1[['INTENSITY_1', 'INTENSITY_2', 'INTENSITY_3', 'INTENSITY_4', 'INTENSITY_5', 'INTENSITY_6', 'INTENSITY_7', 'INTENSITY_8', 'INTENSITY_9', 'INTENSITY_10', 'INTENSITY_11', 'INTENSITY_12', 'INTENSITY_13', 'INTENSITY_14']] += 0
    return cluster_df1

def normalize_Cluster_Intensity_Table(complete_dataframe_2):
    """
        Input: Cluster-Intensity dataframe
        Output: New Cluster-Intensity dataframe with each entry normalized by dividing by respective column sum (note that intensities from peptides mapping to multiple clusters tossed)
    """
    temp_allsums1 = complete_dataframe_2.sum(axis=0, skipna = True)
    complete_dataframe_2['INTENSITY_1'] /= temp_allsums1['INTENSITY_1']
    complete_dataframe_2['INTENSITY_2'] /= temp_allsums1['INTENSITY_2']
    complete_dataframe_2['INTENSITY_3'] /= temp_allsums1['INTENSITY_3']
    complete_dataframe_2['INTENSITY_4'] /= temp_allsums1['INTENSITY_4']
    complete_dataframe_2['INTENSITY_5'] /= temp_allsums1['INTENSITY_5']
    complete_dataframe_2['INTENSITY_6'] /= temp_allsums1['INTENSITY_6']
    complete_dataframe_2['INTENSITY_7'] /= temp_allsums1['INTENSITY_7']
    complete_dataframe_2['INTENSITY_8'] /= temp_allsums1['INTENSITY_8']
    complete_dataframe_2['INTENSITY_9'] /= temp_allsums1['INTENSITY_9']
    complete_dataframe_2['INTENSITY_10'] /= temp_allsums1['INTENSITY_10']
    complete_dataframe_2['INTENSITY_11'] /= temp_allsums1['INTENSITY_11']
    complete_dataframe_2['INTENSITY_12'] /= temp_allsums1['INTENSITY_12']
    complete_dataframe_2['INTENSITY_13'] /= temp_allsums1['INTENSITY_13']
    complete_dataframe_2['INTENSITY_14'] /= temp_allsums1['INTENSITY_14']
    return complete_dataframe_2


#-------------------------------------------------------------------------------------------

#----------------------------Formatting and Annotating R-output-----------------------------

def annotate_R_abridged(cdhit_file, abridged_R_file):
    """
        Input: CDHIT cluster file and abridged R outputfile containing p-adj values
        Output: newly annotated dataframe
    """
    with open(cdhit_file, 'r') as f:
        file1 = f.readlines()
    cluster_representative_dict1 = {}
    for line1 in file1:
        line2 = line1.split('\t')
        if line1[0] == '>':
            temp_clusternum = line2[0].replace('>Cluster ', '').replace('\n', '')
        elif '... *' in line2[1]:
            line3 = line2[1].split(' ')
            line4 = line3[1].replace('>', '').replace('...', '')
            cluster_representative_dict1[temp_clusternum] = line4
            temp_clusternum = ""
    with open(abridged_R_file, 'r') as g:
        file2 = g.readlines()
    cluster_rep_list = []
    for line1 in file2:
        line2 = line1.split(',')
        if line2[1] in cluster_representative_dict1.keys():
            cluster_rep_list.append(cluster_representative_dict1[line2[1]])
    temp_dfC1 = pd.read_csv(abridged_R_file) 
    temp_dfC1['cluster_representative'] = pd.Series(cluster_rep_list, index=temp_dfC1.index)
    return temp_dfC1

#-------------------------------------------------------------------------------------------

end = time.time()
print(end-start)

0.001775979995727539


In [15]:
start = time.time()

#---------------Prepare Tables for R-analysis of Census PLine Data-------------------

#convert census_filled into file with only PLines
censusP_df1 = extract_PLines('Census/census-chat-mouse-20318_filled_modded.txt')

#extract specific columns from converted census file
Norm_PLine_df1 = extract_Norm_Raw_PLine_Intensities(censusP_df1)
Non_Norm_PLine_df1 = extract_Raw_PLine_Intensities(censusP_df1)

#normalize columns in Non_Norm extracted file
Non_Norm_PLine_df1 = normalize_Raw_PLine_Intensities(Non_Norm_PLine_df1)

#output files to csv ready for R-analysis
Norm_PLine_df1.to_csv('PLine-ChatMouse-NormIntensityTable1.txt', index=False)
Non_Norm_PLine_df1.to_csv('PLine-ChatMouse-NonNormIntensityTable1.txt', index=False)

#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

1.121624231338501


In [4]:
start = time.time()

#Only need to run once for each CDHIT-# analysis (take ~1.5 hours)
#temp_df1A = extract_SLine_from_Census('Census/census-TH-microbiome-20343_modded.txt')

#--------Prepare Tables for R-analysis of Census SLine Data (Clustering)-------------

#TH-Microbiome, CDHIT 65 (Run once per analysis)
temp_df2A = select_Census_Columns_Peptides(temp_df1A)
temp_dict1A = extract_PLine_from_Census('Census/census-TH-microbiome-20343_modded.txt')
temp_dict2A = map_Clusters_to_Proteins('CDHIT/cdhitout-TH-65.clstr')
temp_dict3A = map_Clusternum_to_Peptides(temp_dict2A, temp_dict1A)
temp_THMB_65 = clusternum_to_PepDataframe(temp_dict3A, temp_df2A)
THMB_65 = create_Cluster_Intensity_Table(temp_THMB_65)
#THMB_65 = normalize_Cluster_Intensity_Table(THMB_65)

#TH-Microbiome, CDHIT 75 (Run once per analysis)
temp_df2A = select_Census_Columns_Peptides(temp_df1A)
temp_dict1A = extract_PLine_from_Census('Census/census-TH-microbiome-20343_modded.txt')
temp_dict2A = map_Clusters_to_Proteins('CDHIT/cdhitout-TH-75.clstr')
temp_dict3A = map_Clusternum_to_Peptides(temp_dict2A, temp_dict1A)
temp_THMB_75 = clusternum_to_PepDataframe(temp_dict3A, temp_df2A)
THMB_75 = create_Cluster_Intensity_Table(temp_THMB_75)
#THMB_75 = normalize_Cluster_Intensity_Table(THMB_75)

#TH-Microbiome, CDHIT 85 (Run once per analysis)
temp_df2A = select_Census_Columns_Peptides(temp_df1A)
temp_dict1A = extract_PLine_from_Census('Census/census-TH-microbiome-20343_modded.txt')
temp_dict2A = map_Clusters_to_Proteins('CDHIT/cdhitout-TH-85.clstr')
temp_dict3A = map_Clusternum_to_Peptides(temp_dict2A, temp_dict1A)
temp_THMB_85 = clusternum_to_PepDataframe(temp_dict3A, temp_df2A)
THMB_85 = create_Cluster_Intensity_Table(temp_THMB_85)
#THMB_85 = normalize_Cluster_Intensity_Table(THMB_85)

#TH-Microbiome, CDHIT 95 (Run once per analysis)
temp_df2A = select_Census_Columns_Peptides(temp_df1A)
temp_dict1A = extract_PLine_from_Census('Census/census-TH-microbiome-20343_modded.txt')
temp_dict2A = map_Clusters_to_Proteins('CDHIT/cdhitout-TH-95.clstr')
temp_dict3A = map_Clusternum_to_Peptides(temp_dict2A, temp_dict1A)
temp_THMB_95 = clusternum_to_PepDataframe(temp_dict3A, temp_df2A)
THMB_95 = create_Cluster_Intensity_Table(temp_THMB_95)
#THMB_95 = normalize_Cluster_Intensity_Table(THMB_95)

#Output to txt/csv file
THMB_65.to_csv('SLine-THMB-NormIntensityTable-65.txt')
THMB_75.to_csv('SLine-THMB-NormIntensityTable-75.txt')
THMB_85.to_csv('SLine-THMB-NormIntensityTable-85.txt')
THMB_95.to_csv('SLine-THMB-NormIntensityTable-95.txt')


#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


98.40009903907776


In [23]:
THMB_95[THMB_95.eq(0).any(1)]

Unnamed: 0_level_0,INTENSITY_1,INTENSITY_2,INTENSITY_3,INTENSITY_4,INTENSITY_5,INTENSITY_6,INTENSITY_7,INTENSITY_8,INTENSITY_9,INTENSITY_10,INTENSITY_11,INTENSITY_12,INTENSITY_13,INTENSITY_14
ClusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1


In [43]:
start = time.time()

#---------------Add Protein Labels to R-analysis of Census SLine Data----------------

THMB_65_log_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-65.clstr', 'Census_analysis/SLine-THMB-Census-Log2Transform-65.csv')
THMB_65_log_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_65_log_R1.to_csv('SLine-THMB-Census-Log2Transform-65-annotated.txt', index=False)

THMB_75_log_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-75.clstr', 'Census_analysis/SLine-THMB-Census-Log2Transform-75.csv')
THMB_75_log_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_75_log_R1.to_csv('SLine-THMB-Census-Log2Transform-75-annotated.txt', index=False)

THMB_85_log_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-85.clstr', 'Census_analysis/SLine-THMB-Census-Log2Transform-85.csv')
THMB_85_log_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_85_log_R1.to_csv('SLine-THMB-Census-Log2Transform-85-annotated.txt', index=False)

THMB_95_log_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-95.clstr', 'Census_analysis/SLine-THMB-Census-Log2Transform-95.csv')
THMB_95_log_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_95_log_R1.to_csv('SLine-THMB-Census-Log2Transform-95-annotated.txt', index=False)


#----------

THMB_65_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-65.clstr', 'Census_analysis/SLine-THMB-Census-cuberootTransform-65.csv')
THMB_65_3r_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_65_3r_R1.to_csv('SLine-THMB-Census-cuberootTransform-65-annotated.txt', index=False)

THMB_75_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-75.clstr', 'Census_analysis/SLine-THMB-Census-cuberootTransform-75.csv')
THMB_75_3r_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_75_3r_R1.to_csv('SLine-THMB-Census-cuberootTransform-75-annotated.txt', index=False)

THMB_85_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-85.clstr', 'Census_analysis/SLine-THMB-Census-cuberootTransform-85.csv')
THMB_85_3r_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_85_3r_R1.to_csv('SLine-THMB-Census-cuberootTransform-85-annotated.txt', index=False)

THMB_95_3r_R1 = annotate_R_abridged('CDHIT/cdhitout-TH-95.clstr', 'Census_analysis/SLine-THMB-Census-cuberootTransform-95.csv')
THMB_95_3r_R1.drop(columns='Unnamed: 0', inplace=True)
THMB_95_3r_R1.to_csv('SLine-THMB-Census-cuberootTransform-95-annotated.txt', index=False)


#------------------------------------------------------------------------------------

end = time.time()
print(end-start)

2.0245611667633057


In [40]:
THMB_95

Unnamed: 0_level_0,INTENSITY_1,INTENSITY_2,INTENSITY_3,INTENSITY_4,INTENSITY_5,INTENSITY_6,INTENSITY_7,INTENSITY_8,INTENSITY_9,INTENSITY_10,INTENSITY_11,INTENSITY_12,INTENSITY_13,INTENSITY_14
ClusterID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0,5.348647e-12,2.607019e-11,5.101761e-12,2.452523e-11,2.859864e-11,2.829251e-11,2.232654e-11,2.385057e-11,3.241360e-11,1.795133e-11,8.880005e-05,4.064236e-12,2.895923e-11,4.144315e-11
1,1.698561e-04,2.607019e-11,6.232192e-05,6.246374e-05,2.859864e-11,2.829251e-11,1.025415e-04,2.385057e-11,9.803263e-05,1.795133e-11,2.273178e-11,4.064236e-12,9.526861e-05,1.066157e-04
1000,5.348647e-12,2.607019e-11,5.101761e-12,2.452523e-11,2.859864e-11,7.343355e-05,2.232654e-11,2.385057e-11,3.241360e-11,1.795133e-11,2.273178e-11,4.064236e-12,2.895923e-11,4.144315e-11
10016,5.348647e-12,2.607019e-11,5.101761e-12,2.452523e-11,2.859864e-11,2.829251e-11,2.232654e-11,2.385057e-11,3.241360e-11,1.795133e-11,1.768745e-04,4.064236e-12,2.895923e-11,4.144315e-11
10017,5.348647e-12,2.607019e-11,5.101761e-12,2.452523e-11,2.859864e-11,2.829251e-11,2.232654e-11,2.385057e-11,3.241360e-11,1.795133e-11,3.130825e-06,4.064236e-12,2.895923e-11,4.144315e-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9962,5.348647e-12,6.009833e-05,5.101761e-12,2.452523e-11,2.859864e-11,2.829251e-11,1.498991e-04,2.385057e-11,3.241360e-11,1.795133e-11,9.621056e-05,4.064236e-12,2.895923e-11,4.144315e-11
9969,5.348647e-12,2.607019e-11,5.101761e-12,2.452523e-11,2.859864e-11,2.829251e-11,2.232654e-11,2.385057e-11,3.241360e-11,1.795133e-11,2.273178e-11,4.064236e-12,2.895923e-11,1.398408e-05
9974,2.196381e-05,2.607019e-11,5.101761e-12,2.452523e-11,2.859864e-11,2.829251e-11,2.232654e-11,2.385057e-11,3.241360e-11,1.795133e-11,2.273178e-11,1.232535e-05,1.373186e-04,4.144315e-11
9981,5.348647e-12,2.607019e-11,5.101761e-12,2.083337e-05,1.063057e-04,2.829251e-11,2.232654e-11,2.090741e-05,3.241360e-11,2.759713e-05,2.273178e-11,4.064236e-12,2.895923e-11,4.144315e-11
