In [1]:
import pandas as pd
import numpy as np

In [21]:
file_path = 'data/Percentage Project Example.xlsx'
dfs = pd.read_excel(file_path, sheet_name=None)

In [6]:
dfs

{'Summary':                   Q1.What year are you in?           Unnamed: 1
 0                           Answer Choices  Response Percentage
 1                               First year               36.84%
 2                                Sophomore               33.83%
 3                                   Junior               14.29%
 4                                   Senior               12.03%
 ..                                     ...                  ...
 220  Q29.Are you an international student?                  NaN
 221                         Answer Choices  Response Percentage
 222                                    Yes               21.80%
 223                                     No               77.44%
 224                      Prefer not to say                0.75%
 
 [225 rows x 2 columns],
 'Gender':                            Gender Demographic Analysis  \
 0    * Note: Some demographic groups had less than ...   
 1     Q3.Check all of the following that apply to you

In [9]:
for sheet_name,_ in dfs.items():
    print(sheet_name)

Summary
Gender
Race&Ethnicity
Disability
LGBQ+
Transgender
First Gen
Low Income
International


In [210]:
df = pd.read_excel(file_path, sheet_name='LGBQ+')

# questions = df.iloc[0]
# response_categories = df.iloc[:, 0]
df.head()

Unnamed: 0,LGBQ+ Demographic Analysis,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,* Note: Some demographic groups had less than ...,,,,,,,
1,Q3.Check all of the following that apply to you:,,,,,,,
2,,"I feel pressure at Brown to find internships, ...",I feel confident studying computer science and...,I feel intimidated studying Computer Science a...,None of the above,,,
3,Heterosexual/straight,76.67%,61.11%,50.00%,1.11%,,,
4,LGBQ+,79.07%,41.86%,62.79%,2.33%,,,


In [217]:
def extract_data_groups(excel_path, sheet_name):
    # Read the excel file
    df = pd.read_excel(excel_path, sheet_name=sheet_name)
    
    # Dictionary to hold the extracted groups
    groups = {}
    current_key = None
    start_index = None
    
    # Iterate through the DataFrame to find rows with 'Q'
    for index, row in df.iterrows():
        # Check if any cell in the row contains 'Q'
        if row.astype(str).str.contains(r'Q\d').any():
            # If we have a previous key, extract the DataFrame slice up to the current row
            if current_key is not None:
                # Extract the group and remove rows/columns with all NaN values
                group_df = df.iloc[start_index:index].dropna(how='all').dropna(axis=1, how='all')
                groups[current_key] = group_df
            
            # Update the current key and start index for the next group
            current_key = row[row.astype(str).str.contains(r'Q\d')].values[0]  # Assuming only one 'Q' per key-row
            start_index = index + 1
    
    # Handle the last group if the last row with 'Q' isn't the last row of the DataFrame
    if current_key is not None and start_index < len(df):
        # Extract the group and remove rows/columns with all NaN values
        group_df = df.iloc[start_index:].dropna(how='all').dropna(axis=1, how='all')
        groups[current_key] = group_df
    
    return groups




In [218]:
# Uncomment to test the function
file_path = 'data/Percentage Project Example.xlsx'
data_groups = extract_data_groups(file_path,sheet_name='LGBQ+')
data_groups

{'Q3.Check all of the following that apply to you:':   LGBQ+ Demographic Analysis  \
 2                        NaN   
 3      Heterosexual/straight   
 4                      LGBQ+   
 
                                           Unnamed: 1  \
 2  I feel pressure at Brown to find internships, ...   
 3                                             76.67%   
 4                                             79.07%   
 
                                           Unnamed: 2  \
 2  I feel confident studying computer science and...   
 3                                             61.11%   
 4                                             41.86%   
 
                                           Unnamed: 3         Unnamed: 4  
 2  I feel intimidated studying Computer Science a...  None of the above  
 3                                             50.00%              1.11%  
 4                                             62.79%              2.33%  ,
 'Q4.Check all of the following that apply to you:': 

In [219]:
for key, df in data_groups.items():
    print(f"Key: {key}, DataFrame Shape: {df.shape}")

Key: Q3.Check all of the following that apply to you:, DataFrame Shape: (3, 5)
Key: Q4.Check all of the following that apply to you:, DataFrame Shape: (3, 5)
Key: Q5.Check all of the following that apply to you:, DataFrame Shape: (3, 7)
Key: Q6.Think about your time in CS classes. Check all of the following that apply to you:, DataFrame Shape: (3, 8)
Key: Q7.I have a faculty whom I perceive as a role model, DataFrame Shape: (3, 6)
Key: Q8.I have considered leaving my computer science-related field of study., DataFrame Shape: (3, 6)
Key: Q9.A faculty member or an administrator has encouraged me to drop out., DataFrame Shape: (3, 6)
Key: Q10.At my university, students from every background have an equal chance to succeed., DataFrame Shape: (3, 6)
Key: Q11.I feel adequately supported by the CS department and the resources offered by the department., DataFrame Shape: (3, 6)
Key: Q12.I  feel comfortable using Brown Computer Science physical spaces (sunlab, CIT lobby, atrium, lecture halls, 

In [220]:
data_groups['Q4.Check all of the following that apply to you:']

Unnamed: 0,LGBQ+ Demographic Analysis,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4
7,,I have experienced microaggression. A microagg...,I have been interrupted or talked to condescen...,"In a group project, my opinion is as respected...",None of the above
8,Heterosexual/straight,22.22%,43.33%,78.89%,6.67%
9,LGBQ+,41.86%,67.44%,46.51%,4.65%


In [197]:
def transform_and_transpose(df):
    # Remove the first row and reset the column headers with the next row
    new_header = df.iloc[0]  # This row will become the header.
    df = df[1:]  # Take the data less the header row
    df.columns = new_header  # Set the header row as the df header
    df = df.reset_index(drop=True)

    # Transpose the DataFrame
    df_transposed = df.T  # Transpose the DataFrame
    new_header = df_transposed.iloc[0]  # Grab the first row for the header
    df_transposed = df_transposed[1:]  # Take the data less the header row
    df_transposed.columns = new_header  # Set the header row as the df header
    df_transposed.index.name = None # Remove the index name

    return df_transposed

In [198]:
transform_and_transpose(data_groups['Q4.Check all of the following that apply to you:'])

nan,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say
I have experienced microaggression. A microaggression is a comment that subtly and often unconsciously or unintentionally expresses a prejudiced attitude toward a member of a marginalized group.,44.07%,15.71%,*,*,*
I have been interrupted or talked to condescendingly by someone who assumed they knew more.,64.41%,38.57%,*,*,*
"In a group project, my opinion is as respected as that of other group members.",57.63%,77.14%,*,*,*
None of the above,3.39%,8.57%,*,*,*


In [199]:
for key in data_groups.keys():
    data_groups[key] = transform_and_transpose(data_groups[key])

In [200]:
def prepend_question_number_to_df(data_dict):
    # Iterate through each item in the dictionary
    for key, df in data_dict.items():
        # Extract the question number from the key
        question_number = key.split('.')[0]  # Splits on the dot and takes the first part 'Q3', 'Q4', etc.
        
        # Prepend the question number to the first column of the DataFrame
        df.insert(0, 'Question Number', question_number)  # Inserts the question number as the first column
    
    return data_dict

def combine_question_number(df):
    df.reset_index(inplace=True)
    df['Questions'] = df['Question Number'] + '. ' + df['index']
        
    df.drop(['Question Number', 'index'], axis=1, inplace=True)
    return df

In [201]:
process_data = data_groups.copy()
new = prepend_question_number_to_df(data_groups)

In [202]:
for key in new.keys():
    new[key] = combine_question_number(new[key])

In [193]:
combine_question_number(new['Q4.Check all of the following that apply to you:'])

nan,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say,Questions
0,44.07%,15.71%,*,*,*,Q4. I have experienced microaggression. A micr...
1,64.41%,38.57%,*,*,*,Q4. I have been interrupted or talked to conde...
2,57.63%,77.14%,*,*,*,"Q4. In a group project, my opinion is as respe..."
3,3.39%,8.57%,*,*,*,Q4. None of the above


In [204]:
# Assuming 'data_dict' is your dictionary with DataFrames
dataframes_list = list(new.values())  # Collect all DataFrames from the dictionary

# Concatenate all DataFrames vertically
combined_dataframe = pd.concat(dataframes_list, axis=0, ignore_index=True)
combined_dataframe

nan,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say,Questions
0,86.44%,68.57%,*,*,*,Q3. I feel pressure at Brown to find internshi...
1,30.51%,75.71%,*,*,*,Q3. I feel confident studying computer science...
2,69.49%,41.43%,*,*,*,Q3. I feel intimidated studying Computer Scien...
3,1.69%,1.43%,*,*,*,Q3. None of the above
4,44.07%,15.71%,*,*,*,Q4. I have experienced microaggression. A micr...
...,...,...,...,...,...,...
87,78.43%,87.93%,*,*,*,Q20. No
88,1.96%,5.17%,*,*,*,Q20. Maybe
89,66.67%,43.10%,*,*,*,Q21. Yes
90,29.41%,37.93%,*,*,*,Q21. No


In [27]:
all_data = []
# df = pd.read_excel(file_path, sheet_name=sheet_name)
df.columns = df.iloc[0]  # Set the first row as header
df = df.drop(0).reset_index(drop=True)  # Drop the header row from the data

# Melt the DataFrame to long format
melted_df = df.melt(id_vars=df.columns[0], var_name='Demographic', value_name='Value')
melted_df.rename(columns={df.columns[0]: 'Question'}, inplace=True)
all_data.append(melted_df)
all_data

[                                              Question Demographic  \
 0     Q3.Check all of the following that apply to you:         NaN   
 1                                                  NaN         NaN   
 2                                                Woman         NaN   
 3                                                  Man         NaN   
 4                                           Non-binary         NaN   
 ...                                                ...         ...   
 1052                                             Woman         NaN   
 1053                                               Man         NaN   
 1054                                        Non-binary         NaN   
 1055                          Prefer to self-describe:         NaN   
 1056                                 Prefer not to say         NaN   
 
                                                   Value  
 0                                                   NaN  
 1     I feel pressure at Br

In [13]:
df

Unnamed: 0,Gender Demographic Analysis,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,* Note: Some demographic groups had less than ...,,,,,,,
1,Q3.Check all of the following that apply to you:,,,,,,,
2,,"I feel pressure at Brown to find internships, ...",I feel confident studying computer science and...,I feel intimidated studying Computer Science a...,None of the above,,,
3,Woman,86.44%,30.51%,69.49%,1.69%,,,
4,Man,68.57%,75.71%,41.43%,1.43%,,,
...,...,...,...,...,...,...,...,...
147,Woman,66.67%,29.41%,3.92%,,,,
148,Man,43.10%,37.93%,18.97%,,,,
149,Non-binary,*,*,*,,,,
150,Prefer to self-describe:,*,*,*,,,,


In [205]:
import pandas as pd

# Path to your Excel file
excel_path = 'data/Percentage Project Example.xlsx'

# Load the Excel file to list all sheet names
xls = pd.ExcelFile(excel_path)
all_sheets = xls.sheet_names

# Filter out the 'Summary' sheet
sheets_to_read = [sheet for sheet in all_sheets if sheet != 'Summary']

# Read all sheets except 'Summary'
data_dict = pd.read_excel(excel_path, sheet_name=sheets_to_read)

# Print the names of the sheets read to verify
print("Sheets read:", data_dict.keys())


Sheets read: dict_keys(['Gender', 'Race&Ethnicity', 'Disability', 'LGBQ+', 'Transgender', 'First Gen', 'Low Income', 'International'])


In [269]:
# create an exmpty dictionary to store dataframe from each sheet with the corersponding sheet name
dict_all_sheets = {}
for sheet_name in sheets_to_read:
    print(f'----{sheet_name}----')
    # read in one sheet at a time
    data_groups = extract_data_groups(excel_path, sheet_name)

    # transform the data in this sheet
    for key in data_groups.keys():
        data_groups[key] = transform_and_transpose(data_groups[key])

    # assign question number to each option
    dict_df = data_groups.copy()
    dict_df = prepend_question_number_to_df(dict_df)

    # for each question, a desired dataframe is done
    for key in dict_df.keys():
        dict_df[key] = combine_question_number(dict_df[key])
    
    # collect all DataFrames from the dictionary
    dataframes_list = list(dict_df.values())  

    # concat all dataframes vertically
    combined_df = pd.concat(dataframes_list, axis=0, ignore_index=True)
    questoin_df = combined_df[['Questions']]
    combined_df = combined_df.drop(columns=['Questions'])
    print(combined_df.shape)

    # save the processed dataframe with the corresponding sheet name
    dict_all_sheets[sheet_name] = combined_df

----Gender----
(92, 5)
----Race&Ethnicity----
(92, 10)
----Disability----
(92, 3)
----LGBQ+----
(92, 2)
----Transgender----
(92, 3)
----First Gen----
(92, 3)
----Low Income----
(92, 3)
----International----
(92, 3)


In [260]:
# dict_all_sheets

In [270]:
# Add a new level to the columns
new_level = ''
questoin_df.columns = pd.MultiIndex.from_tuples([(new_level, col) for col in questoin_df.columns])
questoin_df.reset_index(inplace=True)
questoin_df

Unnamed: 0_level_0,index,Unnamed: 2_level_0
Unnamed: 0_level_1,Unnamed: 1_level_1,Questions
0,0,Q3. I feel pressure at Brown to find internshi...
1,1,Q3. I feel confident studying computer science...
2,2,Q3. I feel intimidated studying Computer Scien...
3,3,Q3. None of the above
4,4,Q4. I have experienced microaggression. A micr...
...,...,...
87,87,Q20. No
88,88,Q20. Maybe
89,89,Q21. Yes
90,90,Q21. No


In [278]:
res_df = pd.concat(dict_all_sheets.values(), axis=1, keys=dict_all_sheets.keys())
res_df.reset_index(inplace=True)
res_df

Unnamed: 0_level_0,index,Gender,Gender,Gender,Gender,Gender,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,...,Transgender,First Gen,First Gen,First Gen,Low Income,Low Income,Low Income,International,International,International
Unnamed: 0_level_1,Unnamed: 1_level_1,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say,African American/Black,Asian,Hispanic/Latinx,Middle Eastern/North African (MENA),...,Prefer not to say,First-generation student,Not first-generation student,Prefer not to say,Low income student,Not low income student,Prefer not to say,International student,Not an international student,Prefer not to say
0,0,86.44%,68.57%,*,*,*,57.14%,85.00%,*,*,...,*,76.47%,77.39%,*,68.00%,79.44%,*,68.97%,79.61%,*
1,1,30.51%,75.71%,*,*,*,28.57%,45.00%,*,*,...,*,35.29%,58.26%,*,44.00%,57.94%,*,51.72%,56.31%,*
2,2,69.49%,41.43%,*,*,*,71.43%,60.00%,*,*,...,*,76.47%,51.30%,*,48.00%,56.07%,*,48.28%,56.31%,*
3,3,1.69%,1.43%,*,*,*,0.00%,5.00%,*,*,...,*,0.00%,1.74%,*,0.00%,1.87%,*,3.45%,0.97%,*
4,4,44.07%,15.71%,*,*,*,42.86%,37.50%,*,*,...,*,35.29%,27.83%,*,36.00%,27.10%,*,27.59%,29.13%,*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,87,78.43%,87.93%,*,*,*,*,91.18%,*,*,...,*,78.57%,84.69%,*,85.00%,83.70%,*,61.90%,89.01%,*
88,88,1.96%,5.17%,*,*,*,*,2.94%,*,*,...,*,7.14%,3.06%,*,10.00%,2.17%,*,4.76%,3.30%,*
89,89,66.67%,43.10%,*,*,*,*,55.88%,*,*,...,*,64.29%,53.06%,*,50.00%,55.43%,*,57.14%,53.85%,*
90,90,29.41%,37.93%,*,*,*,*,41.18%,*,*,...,*,35.71%,33.67%,*,45.00%,31.52%,*,33.33%,34.07%,*


In [272]:
final_df = questoin_df.merge(res_df, how='left', on='index')
final_df

  final_df = questoin_df.merge(res_df, how='left', on='index')


Unnamed: 0_level_0,index,Unnamed: 2_level_0,Gender,Gender,Gender,Gender,Gender,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,...,Transgender,First Gen,First Gen,First Gen,Low Income,Low Income,Low Income,International,International,International
Unnamed: 0_level_1,Unnamed: 1_level_1,Questions,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say,African American/Black,Asian,Hispanic/Latinx,...,Prefer not to say,First-generation student,Not first-generation student,Prefer not to say,Low income student,Not low income student,Prefer not to say,International student,Not an international student,Prefer not to say
0,0,Q3. I feel pressure at Brown to find internshi...,86.44%,68.57%,*,*,*,57.14%,85.00%,*,...,*,76.47%,77.39%,*,68.00%,79.44%,*,68.97%,79.61%,*
1,1,Q3. I feel confident studying computer science...,30.51%,75.71%,*,*,*,28.57%,45.00%,*,...,*,35.29%,58.26%,*,44.00%,57.94%,*,51.72%,56.31%,*
2,2,Q3. I feel intimidated studying Computer Scien...,69.49%,41.43%,*,*,*,71.43%,60.00%,*,...,*,76.47%,51.30%,*,48.00%,56.07%,*,48.28%,56.31%,*
3,3,Q3. None of the above,1.69%,1.43%,*,*,*,0.00%,5.00%,*,...,*,0.00%,1.74%,*,0.00%,1.87%,*,3.45%,0.97%,*
4,4,Q4. I have experienced microaggression. A micr...,44.07%,15.71%,*,*,*,42.86%,37.50%,*,...,*,35.29%,27.83%,*,36.00%,27.10%,*,27.59%,29.13%,*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,87,Q20. No,78.43%,87.93%,*,*,*,*,91.18%,*,...,*,78.57%,84.69%,*,85.00%,83.70%,*,61.90%,89.01%,*
88,88,Q20. Maybe,1.96%,5.17%,*,*,*,*,2.94%,*,...,*,7.14%,3.06%,*,10.00%,2.17%,*,4.76%,3.30%,*
89,89,Q21. Yes,66.67%,43.10%,*,*,*,*,55.88%,*,...,*,64.29%,53.06%,*,50.00%,55.43%,*,57.14%,53.85%,*
90,90,Q21. No,29.41%,37.93%,*,*,*,*,41.18%,*,...,*,35.71%,33.67%,*,45.00%,31.52%,*,33.33%,34.07%,*


In [276]:
final_df.drop(columns=[('index','')])

Unnamed: 0_level_0,Unnamed: 1_level_0,Gender,Gender,Gender,Gender,Gender,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,...,Transgender,First Gen,First Gen,First Gen,Low Income,Low Income,Low Income,International,International,International
Unnamed: 0_level_1,Questions,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say,African American/Black,Asian,Hispanic/Latinx,Middle Eastern/North African (MENA),...,Prefer not to say,First-generation student,Not first-generation student,Prefer not to say,Low income student,Not low income student,Prefer not to say,International student,Not an international student,Prefer not to say
0,Q3. I feel pressure at Brown to find internshi...,86.44%,68.57%,*,*,*,57.14%,85.00%,*,*,...,*,76.47%,77.39%,*,68.00%,79.44%,*,68.97%,79.61%,*
1,Q3. I feel confident studying computer science...,30.51%,75.71%,*,*,*,28.57%,45.00%,*,*,...,*,35.29%,58.26%,*,44.00%,57.94%,*,51.72%,56.31%,*
2,Q3. I feel intimidated studying Computer Scien...,69.49%,41.43%,*,*,*,71.43%,60.00%,*,*,...,*,76.47%,51.30%,*,48.00%,56.07%,*,48.28%,56.31%,*
3,Q3. None of the above,1.69%,1.43%,*,*,*,0.00%,5.00%,*,*,...,*,0.00%,1.74%,*,0.00%,1.87%,*,3.45%,0.97%,*
4,Q4. I have experienced microaggression. A micr...,44.07%,15.71%,*,*,*,42.86%,37.50%,*,*,...,*,35.29%,27.83%,*,36.00%,27.10%,*,27.59%,29.13%,*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,Q20. No,78.43%,87.93%,*,*,*,*,91.18%,*,*,...,*,78.57%,84.69%,*,85.00%,83.70%,*,61.90%,89.01%,*
88,Q20. Maybe,1.96%,5.17%,*,*,*,*,2.94%,*,*,...,*,7.14%,3.06%,*,10.00%,2.17%,*,4.76%,3.30%,*
89,Q21. Yes,66.67%,43.10%,*,*,*,*,55.88%,*,*,...,*,64.29%,53.06%,*,50.00%,55.43%,*,57.14%,53.85%,*
90,Q21. No,29.41%,37.93%,*,*,*,*,41.18%,*,*,...,*,35.71%,33.67%,*,45.00%,31.52%,*,33.33%,34.07%,*


In [235]:
# Function to remove duplicate columns
def remove_duplicate_columns(df):
    # Store columns to keep
    columns_to_keep = []
    seen_columns = set()

    for col in df.columns:
        if col[1] not in seen_columns:
            columns_to_keep.append(col)
            seen_columns.add(col[1])

    return df.loc[:, columns_to_keep]

# Apply the function to remove duplicate columns
cleaned_df = remove_duplicate_columns(res_df)

# Print the resulting DataFrame
cleaned_df

Unnamed: 0_level_0,Gender,Gender,Gender,Gender,Gender,Gender,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,Race&Ethnicity,...,LGBQ+,LGBQ+,Transgender,Transgender,First Gen,First Gen,Low Income,Low Income,International,International
Unnamed: 0_level_1,Woman,Man,Non-binary,Prefer to self-describe:,Prefer not to say,Questions,African American/Black,Asian,Hispanic/Latinx,Middle Eastern/North African (MENA),...,Heterosexual/straight,LGBQ+,Transgender,Cisgender,First-generation student,Not first-generation student,Low income student,Not low income student,International student,Not an international student
0,86.44%,68.57%,*,*,*,Q3. I feel pressure at Brown to find internshi...,57.14%,85.00%,*,*,...,76.67%,79.07%,*,76.92%,76.47%,77.39%,68.00%,79.44%,68.97%,79.61%
1,30.51%,75.71%,*,*,*,Q3. I feel confident studying computer science...,28.57%,45.00%,*,*,...,61.11%,41.86%,*,55.38%,35.29%,58.26%,44.00%,57.94%,51.72%,56.31%
2,69.49%,41.43%,*,*,*,Q3. I feel intimidated studying Computer Scien...,71.43%,60.00%,*,*,...,50.00%,62.79%,*,53.85%,76.47%,51.30%,48.00%,56.07%,48.28%,56.31%
3,1.69%,1.43%,*,*,*,Q3. None of the above,0.00%,5.00%,*,*,...,1.11%,2.33%,*,1.54%,0.00%,1.74%,0.00%,1.87%,3.45%,0.97%
4,44.07%,15.71%,*,*,*,Q4. I have experienced microaggression. A micr...,42.86%,37.50%,*,*,...,22.22%,41.86%,*,28.46%,35.29%,27.83%,36.00%,27.10%,27.59%,29.13%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,78.43%,87.93%,*,*,*,Q20. No,*,91.18%,*,*,...,91.89%,68.42%,*,83.64%,78.57%,84.69%,85.00%,83.70%,61.90%,89.01%
88,1.96%,5.17%,*,*,*,Q20. Maybe,*,2.94%,*,*,...,0.00%,10.53%,*,3.64%,7.14%,3.06%,10.00%,2.17%,4.76%,3.30%
89,66.67%,43.10%,*,*,*,Q21. Yes,*,55.88%,*,*,...,44.59%,73.68%,*,54.55%,64.29%,53.06%,50.00%,55.43%,57.14%,53.85%
90,29.41%,37.93%,*,*,*,Q21. No,*,41.18%,*,*,...,43.24%,15.79%,*,33.64%,35.71%,33.67%,45.00%,31.52%,33.33%,34.07%


In [236]:
# Function to merge duplicate columns
def merge_duplicate_columns(df):
    # Create a dictionary to hold merged columns
    merged_columns = {}

    for col in df.columns:
        # Convert column data to a tuple (to make it hashable)
        col_data = tuple(df[col])
        
        if col_data in merged_columns:
            # Merge with existing column if data is identical
            merged_columns[col_data].append(col)
        else:
            # Otherwise, start a new list for this column data
            merged_columns[col_data] = [col]
    
    # Create a new DataFrame with merged columns
    merged_df = pd.DataFrame()
    for col_data, col_list in merged_columns.items():
        # Use the first column name in the list as the representative name
        merged_col_name = col_list[0]
        merged_df[merged_col_name] = df[col_list[0]]

    return merged_df

# Apply the function to the combined DataFrame
merged_df = merge_duplicate_columns(res_df)
merged_df

Unnamed: 0,"(Gender, Woman)","(Gender, Man)","(Gender, Non-binary)","(Gender, Questions)","(Race&Ethnicity, African American/Black)","(Race&Ethnicity, Asian)","(Race&Ethnicity, White)","(Race&Ethnicity, Mixed)","(Disability, Person with disability)","(Disability, Person without diability)","(LGBQ+, Heterosexual/straight)","(LGBQ+, LGBQ+)","(Transgender, Cisgender)","(First Gen, First-generation student)","(First Gen, Not first-generation student)","(Low Income, Low income student)","(Low Income, Not low income student)","(International, International student)","(International, Not an international student)"
0,86.44%,68.57%,*,Q3. I feel pressure at Brown to find internshi...,57.14%,85.00%,74.47%,76.92%,90.00%,75.63%,76.67%,79.07%,76.92%,76.47%,77.39%,68.00%,79.44%,68.97%,79.61%
1,30.51%,75.71%,*,Q3. I feel confident studying computer science...,28.57%,45.00%,68.09%,53.85%,40.00%,57.98%,61.11%,41.86%,55.38%,35.29%,58.26%,44.00%,57.94%,51.72%,56.31%
2,69.49%,41.43%,*,Q3. I feel intimidated studying Computer Scien...,71.43%,60.00%,48.94%,53.85%,80.00%,51.26%,50.00%,62.79%,53.85%,76.47%,51.30%,48.00%,56.07%,48.28%,56.31%
3,1.69%,1.43%,*,Q3. None of the above,0.00%,5.00%,0.00%,0.00%,0.00%,1.68%,1.11%,2.33%,1.54%,0.00%,1.74%,0.00%,1.87%,3.45%,0.97%
4,44.07%,15.71%,*,Q4. I have experienced microaggression. A micr...,42.86%,37.50%,21.28%,19.23%,60.00%,25.21%,22.22%,41.86%,28.46%,35.29%,27.83%,36.00%,27.10%,27.59%,29.13%
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87,78.43%,87.93%,*,Q20. No,*,91.18%,85.00%,75.00%,88.89%,84.00%,91.89%,68.42%,83.64%,78.57%,84.69%,85.00%,83.70%,61.90%,89.01%
88,1.96%,5.17%,*,Q20. Maybe,*,2.94%,0.00%,12.50%,0.00%,4.00%,0.00%,10.53%,3.64%,7.14%,3.06%,10.00%,2.17%,4.76%,3.30%
89,66.67%,43.10%,*,Q21. Yes,*,55.88%,55.00%,54.17%,55.56%,54.00%,44.59%,73.68%,54.55%,64.29%,53.06%,50.00%,55.43%,57.14%,53.85%
90,29.41%,37.93%,*,Q21. No,*,41.18%,27.50%,25.00%,44.44%,33.00%,43.24%,15.79%,33.64%,35.71%,33.67%,45.00%,31.52%,33.33%,34.07%


In [231]:
# Combine DataFrames horizontally
res_df = pd.concat(dict_all_sheets.values(), axis=1)

# Optionally, add multi-level column headers to distinguish different DataFrames
res_df.columns = pd.MultiIndex.from_product([dict_all_sheets.keys(), res_df.columns])

res_df


ValueError: Length mismatch: Expected axis has 40 elements, new values have 320 elements

In [None]:
# Function to merge duplicate columns
def merge_duplicate_columns(df):
    # Create a dictionary to hold merged columns
    merged_columns = {}

    for col in df.columns:
        # Convert column data to a tuple (to make it hashable)
        col_data = tuple(df[col])
        
        if col_data in merged_columns:
            # Merge with existing column if data is identical
            merged_columns[col_data].append(col)
        else:
            # Otherwise, start a new list for this column data
            merged_columns[col_data] = [col]
    
    # Create a new DataFrame with merged columns
    merged_df = pd.DataFrame()
    for col_data, col_list in merged_columns.items():
        # Use the first column name in the list as the representative name
        merged_col_name = col_list[0]
        merged_df[merged_col_name] = df[col_list[0]]

    return merged_df

# Apply the function to the combined DataFrame
merged_df = merge_duplicate_columns(res_df)

# To drop duplicate 'Questions' columns, we can filter the columns explicitly
# Keep only the first 'Questions' column and drop the rest
if 'Questions' in merged_df.columns:
    # Get all columns except 'Questions'
    cols_to_keep = [col for col in merged_df.columns if col != 'Questions']
    # Add only the first occurrence of 'Questions'
    cols_to_keep.insert(0, 'Questions')
    # Select these columns to form the final DataFrame
    merged_df = merged_df[cols_to_keep]