In [2]:
import os
import sys
import csv
import scipy as sp
import numpy as np
import pandas as pd
from os import listdir
from os import getcwd
from os.path import isfile, join


#Specifiy the path to the raw .csv data files relative to the foler containing this code file
mypath = "./DaddyData"

#create a list of the .csv file names to run a for loop over each of the data files
list_files = [f for f in listdir(mypath) if f.endswith(".csv") if isfile(join(mypath, f))]
for a, i in enumerate(list_files):
    
    #print a message saying which file is being sent through the loop and read the data file in
    print('Printing ' + list_files[a].split('.')[0] + ' to  new .csv in Biocomputing final project/Top10_Individual as Top10_' + list_files[a] + '...')
    data = pd.read_csv(mypath + '/' + list_files[a], sep=',', names = ["CDR3", "TRAV", "TRAJ", "Frequency"])
    
    #Sort rows in the dataframe based on descending Frequencies of CDR3s
    data.sort_values(['Frequency'], ascending = False)

    #total the frequencies (reads) for calculating percentage later at the end of the for loop
    total_reads = 0
    for index, row in data.iterrows():
        total_reads = total_reads + data.loc[index, 'Frequency']

    #Make 2 new dataframes: top10 = top 10 rows (CDR3s) from the ordered data; exclude_top10 = dataframe with all the rest of the CDR3s
    top10 = data[0:10]
    exclude_top10 = data[11:len(data)]

    #Define Levenshtein Distance as the minimal number of insertions, deletions, and substitutions of one character for another that will transform one string into the other
    #http://www.cs.tufts.edu/comp/150GEN/classpages/Levenshtein.html
    #https://stackoverflow.com/questions/2460177/edit-distance-in-python
    def levenshteinDistance(s1, s2):
        if len(s1) > len(s2):
            s1, s2 = s2, s1
        distances = range(len(s1) + 1)
        for i2, c2 in enumerate(s2):
            distances_ = [i2+1]
            for i1, c1 in enumerate(s1):
                if c1 == c2:
                    distances_.append(distances[i1])
                else:
                    distances_.append(1 + min((distances[i1], distances[i1 + 1], distances_[-1])))
            distances = distances_
        return distances[-1]

    #Define a function to iterate each of the top 10 CDR3s with the others in the Top10 list of CDR3s
    #This will ensure that there are no 'highly similar' CDR3s within the Top10
    #if there are similar CDR3s, it will add the frequency (as Corr Freq) to the datframe
    Corr_Freq = []
    del_indexes = []
    def Correct_top10 (s = top10.loc[0, 'TRAV'], t = top10.loc[0, 'TRAJ'], u = top10.loc[0, 'CDR3'], p = top10.loc[0, 'Frequency']):
        Corr_Freq_n = p
        for index, row in top10.iterrows():
            if row['TRAV'] == s and row['TRAJ'] == t:
                x = levenshteinDistance(u, row['CDR3'])
                if x != 0 and x <= 4:
                    Corr_Freq_n = Corr_Freq_n + row['Frequency']
                    del_indexes.append(index)
        Corr_Freq.append(Corr_Freq_n)
        Corr_Freq_n = 0

    #This will delete the row with a highly similar sequence so that it is not part of the iterated calculation with the rest of the CDR3s not in the Top10
    for i in range(10):
        Correct_top10 (s = top10.loc[i, 'TRAV'], t = top10.loc[i, 'TRAJ'], u = top10.loc[i, 'CDR3'], p = top10.loc[i, 'Frequency'])
    k = 0
    while k < (len(del_indexes) / 2):
        top10.drop(del_indexes[k], inplace=True)
        del Corr_Freq[del_indexes[k]]
        k = (k + 1)

    #Add the list to the top10 dataframe as a new column
    top10['Corr Freq'] = Corr_Freq

    #make a list of indexes remaining in top10 after correcting for similarity between CDR3s within top10
    available_index = []
    for index, row in top10.iterrows():
        available_index.append(index)

    #define a function 'Normalize' that finds similar CDR3s in exclude_top10 to a given input CDR3 from top10, adds the similar frequencies to the top10 frequency, then appends the result to a list
    normFreq_all = []
    normFreq_n=0
    def Normalize (x = top10.loc[0, 'TRAV'], y = top10.loc[0, 'TRAJ'], z = top10.loc[0, 'CDR3'], w = top10.loc[0, 'Corr Freq']):
        normFreq_n = w 
        for index, row in exclude_top10.iterrows():
            if row['TRAV'] == x and row['TRAJ'] ==  y:
                q = levenshteinDistance(top10.loc[0, 'CDR3'], row['CDR3'])
                if q <= 2:
                    normFreq_n = normFreq_n + row['Frequency']
        normFreq_all.append(normFreq_n)
        normFreq_n = 0

    #create a list of the normalized frequencies for each sample in top10 using the function Normalize
    for i in available_index:
        Normalize(x = top10.loc[i, 'TRAV'], y = top10.loc[i, 'TRAJ'], z = top10.loc[i, 'CDR3'], w = top10.loc[i, 'Corr Freq'])

    #Add the list to the top10 dataframe as a new column
    top10['Norm Freq'] = normFreq_all

    #Define a function to calculate the percentage of the normalized frequency within the whole repertoire, then iterate by remaining rows in top10, then append column to top10 dataframe
    Perc_of_Rep = []
    def PercentRep (a = top10.loc[0, 'Norm Freq']):
        b = (a / total_reads) * 100
        Perc_of_Rep.append(b)
    for i in available_index:
        PercentRep (a = top10.loc[i, 'Norm Freq'])
    top10['Percent of Rep'] = Perc_of_Rep
    
    #Make a new folder to store the output .csv files
    newpath = './Top_10_Individual_Mice' 
    if not os.path.exists(newpath):
        os.makedirs(newpath)
    
    #Print completed dataframe to a new .csv file in Top_10_Individual_Mice folder for further crossreference in next code file, continue for loop
    top10.to_csv('./Top_10_Individual_Mice/Top10_' + list_files[a], sep = ',', encoding = 'utf-8', index = False)
    print('Completed ' + list_files[a].split('.')[0])
    
print("Completed all fi")
    
    #The new corrected csv files for each raw data file will be named: Top10_Mouse_##.csv

Printing Mouse_31 to  new .csv in Biocomputing final project/Top10_Individual as Top10_Mouse_31.csv...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Completed Mouse_31
Printing Mouse_32 to  new .csv in Biocomputing final project/Top10_Individual as Top10_Mouse_32.csv...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Completed Mouse_32
Printing Mouse_33 to  new .csv in Biocomputing final project/Top10_Individual as Top10_Mouse_33.csv...
Completed Mouse_33
Printing Mouse_34 to  new .csv in Biocomputing final project/Top10_Individual as Top10_Mouse_34.csv...
Completed Mouse_34
Printing Mouse_35 to  new .csv in Biocomputing final project/Top10_Individual as Top10_Mouse_35.csv...
Completed Mouse_35
Printing Mouse_36 to  new .csv in Biocomputing final project/Top10_Individual as Top10_Mouse_36.csv...
Completed Mouse_36
