# In [1]:
# Use the standard-library csv module to read and write the CSV files
import csv

# Create first function to process Gene_Exp Matrix,
# this will be used for both varieties x and y
def create_gene_expression_matrix(file):
    """Extract the control and stress replicate columns from an expression CSV.

    The first line read from *file* is treated as the header. A column counts
    as a control column when its name contains both 'C' and 'Rep', and as a
    stress column when its name contains 'X1' or 'Y1' together with 'Rep'.

    Args:
        file: An open text file (or any iterable of CSV lines) accepted by
            ``csv.reader``.

    Returns:
        A ``(new_header, mat_data)`` tuple. ``new_header`` is
        ``['gene_name']`` followed by the control and then the stress column
        names, with 'VarXC-Rep.1' renamed to 'VarXCRep.1'. ``mat_data`` holds
        one list per data row: the first cell of the row followed by its
        control and stress values, all as strings.

    Raises:
        StopIteration: If *file* yields no header line at all.
        IndexError: If a non-blank data row is shorter than the header.
    """
    reader = csv.reader(file)
    header = next(reader)

    # Resolve the column positions once, instead of calling header.index()
    # for every cell of every row. Using positions also keeps columns with
    # duplicated names apart.
    control_positions = [
        i for i, col in enumerate(header) if 'C' in col and 'Rep' in col
    ]
    stress_positions = [
        i for i, col in enumerate(header)
        if ('X1' in col or 'Y1' in col) and 'Rep' in col
    ]
    selected = control_positions + stress_positions

    # Build the output header before reading any rows so that a header-only
    # file still returns the correct header rather than an empty list.
    # The GEM_X file spells the first control replicate with a dash
    # ('VarXC-Rep.1'); normalise it to 'VarXCRep.1'.
    new_header = ['gene_name'] + [
        'VarXCRep.1' if header[i] == 'VarXC-Rep.1' else header[i]
        for i in selected
    ]

    # csv.reader yields [] for blank lines; skip them instead of failing on
    # row[0].
    mat_data = [
        [row[0]] + [row[i] for i in selected]
        for row in reader
        if row
    ]

    return new_header, mat_data
    


# Paths of the gene expression matrix (GEM) CSV file of each variety.
varX_file = 'GEM_X.csv'
varY_file = 'GEM_Y.csv'

# Read both matrices. The csv module asks for files to be opened with
# newline='' so that it can handle line endings itself. The file handles get
# their own names so that varX_file / varY_file keep holding the paths
# instead of being rebound to (closed) file objects.
with open(varX_file, 'r', newline='') as varX_handle, \
        open(varY_file, 'r', newline='') as varY_handle:
    varX_header, varX_gem_data = create_gene_expression_matrix(varX_handle)
    varY_header, varY_gem_data = create_gene_expression_matrix(varY_handle)

# Next, define a helper that writes a header and data rows to a CSV file.
# It is reused every time this script saves a file.

def save_to_csv(filename, header, data):
    """Write *header* followed by every row of *data* to *filename* as CSV.

    The file is opened in write mode, so an existing file at that path is
    overwritten.

    Args:
        filename: Path of the CSV file to write.
        header: Sequence of column names, written as the first row.
        data: Iterable of rows, written in order after the header.
    """
    # newline='' leaves the line endings to csv.writer.
    with open(filename, mode='w', newline='') as out_handle:
        out = csv.writer(out_handle)
        out.writerow(header)
        for record in data:
            out.writerow(record)
    
# Write the extracted control/stress matrix of each variety to its own CSV file.
save_to_csv(filename='all_VarX_TwoTimePoints.csv', header=varX_header, data=varX_gem_data)
save_to_csv(filename='all_VarY_TwoTimePoints.csv', header=varY_header, data=varY_gem_data)


#----------------------------------------------------------------------------------------------------------------------------------------------------
# Now process the differentially expressed genes (DEGs).

# First, define a function that reads a DEG file and keeps only the columns of interest.

def create_deg_genes(deg_genes_file):
    """Read a differentially-expressed-genes CSV and keep a fixed set of columns.

    Args:
        deg_genes_file: Path of a CSV file whose header contains the columns
            'gene', 'log2FoldChange', 'padj', 'Athaliana_geneID' and
            'Gene_Function'.

    Returns:
        A ``(filt_columns, DEG_data)`` tuple. ``filt_columns`` is the list of
        the five column names above; ``DEG_data`` holds one list per data row
        with that row's values for those columns, in the same order.

    Raises:
        KeyError: If a data row is read from a file whose header lacks one of
            the expected columns.
    """
    # Columns to extract, in output order.
    filt_columns = ['gene', 'log2FoldChange', 'padj', 'Athaliana_geneID', 'Gene_Function']

    # newline='' is what the csv module requires of its input files; without
    # it, line breaks embedded in quoted fields are altered by the text layer
    # before the csv parser sees them.
    with open(deg_genes_file, 'r', newline='') as file:
        reader = csv.DictReader(file)
        DEG_data = [[row[column] for column in filt_columns] for row in reader]

    return filt_columns, DEG_data

# Differentially-expressed-gene (DEG) input file of each variety.
varX_de_genes_file, varY_de_genes_file = 'Leaf_DEGs_VarX_T1.csv', 'Leaf_DEGs_VarY_T1.csv'

# Load the filtered DEG columns and rows for both varieties.
varX_filt_columns, varX_deg_data = create_deg_genes(varX_de_genes_file)
varY_filt_columns, varY_deg_data = create_deg_genes(varY_de_genes_file)

# Save the filtered VarX DEG table on its own, before it is combined with
# the expression data.
save_to_csv(filename='VarX_deg_data.csv', header=varX_filt_columns, data=varX_deg_data)

    
def comb_data(gene_exp_data, deg_genes_data):
    """Attach expression values to each differentially expressed gene.

    Rows are matched on the gene identifier (the first item of each row), not
    on their position, so the two inputs may differ in length and order.

    Args:
        gene_exp_data: Rows whose first item is the gene name and whose
            remaining items are the expression values.
        deg_genes_data: Rows whose first item is the gene identifier, followed
            by log2FoldChange, padj, Athaliana_geneID and Gene_Function.

    Returns:
        A list whose first item is the combined header row and whose remaining
        items are, in the order of *deg_genes_data*, each DEG row extended
        with the expression values of the matching gene. A DEG row with no
        matching expression row is extended with empty strings, one per
        expression header.
    """
    deg_headers = ['gene_name', 'log2FoldChange', 'padj', 'Athaliana_geneID', 'Gene_Function']
    gem_headers = (
        ['control_replicate_{}'.format(i) for i in range(1, 4)]
        + ['stress_replicate_{}'.format(i) for i in range(1, 4)]
    )
    comb_headers = deg_headers + gem_headers

    # Index the expression values by gene name; the first occurrence of a
    # name wins if the matrix lists it more than once.
    expression_by_gene = {}
    for gem_row in gene_exp_data:
        if gem_row:
            expression_by_gene.setdefault(gem_row[0], gem_row[1:])

    # Placeholder values for DEG rows that have no expression row.
    no_expression = [''] * len(gem_headers)

    combined_rows = [comb_headers]
    for deg_row in deg_genes_data:
        gene_id = deg_row[0] if deg_row else None
        combined_rows.append(deg_row + expression_by_gene.get(gene_id, no_expression))

    return combined_rows

    

# Combine the expression and DEG data of each variety and save the result.
# comb_data returns the header as its first row, so only the rows after it are
# passed as data; passing the whole list would write the header twice.
varX_combined_data = comb_data(varX_gem_data, varX_deg_data)
save_to_csv('Leaf_DEGs_VarX.csv', varX_combined_data[0], varX_combined_data[1:])

varY_combined_data = comb_data(varY_gem_data, varY_deg_data)
save_to_csv('Leaf_DEGs_VarY.csv', varY_combined_data[0], varY_combined_data[1:])