In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Input: raw application export, given as a path relative to the working directory.
input_filename = 'naba_data.csv'
# Header of the essay column. It is used as a DataFrame column key, so it must match the
# CSV header character for character.
# NOTE(review): the 'Â' looks like a mis-encoded non-breaking space carried over from the
# export — confirm against the CSV before "fixing" it here.
txt_col = "Copy/ PasteÂ An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic."

# Outputs: the duplicate rows that get removed, and the de-duplicated dataset.
dropped_filename = './output_files/naba_data_dropped_rows.csv'
removed_duplicates_filename = './output_files/naba_data_removed_duplicates.csv'


## Clean Data

In [3]:
# Load the raw export and expose each row's position at load time as an explicit
# `app_id` column, so an application can still be identified after rows are removed.
df = (
    pd.read_csv(input_filename)
    .reset_index()
    .rename(columns={'index': 'app_id'})
)
df

Unnamed: 0,app_id,Gender:* Required fields are indicated with red symbol Permanent Contact Information,City (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,State (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,Zip (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,"Are you Black? (includes African, African American, Caribbean, etc.)",Preferred Mailing Address,College/University:Academic Profile,Classification (as of January 2022):Academic Profile,Major:Academic Profile,...,"Please provide details (i.e. company name, location, etc.)",I have accepted an internship for the summer of 2022 (June - August),I have accepted an internship for the fall of 2022 (September- December),"Please provide details (i.e. company name, location, etc.).1",I have accepted a permanent job offer,"Please provide details (i.e. company name, location, etc.).2",Have you received a CPA Exam Review?,Which CPA Exam Review have you received?,Copy/ PasteÂ An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic.,Recipient
0,0,Male,Brooklyn,NY,11216,Yes,Permanent,Medgar Evers College,Senior,Accounting,...,,,,,No,,No,,During the early parts of 2020 one of the dead...,
1,1,Female,Bowie,MD,20720,Yes,Permanent,University of Maryland,Freshman,Business Management,...,Received internship as a Summer 2022 Discovery...,Yes,No,,No,,No,,One of the most challenging times has been dur...,
2,2,Female,Chicago Heights,IL,60411,Yes,Permanent,North Carolina A&T State University,Freshman,Accounting,...,,No,No,,No,,No,,When covid restrictions took place I was just ...,
3,3,Female,Chicago,IL,60620,Yes,Permanent,Loyola University Chicago,Sophomore,Accounting,...,"Ernst & Young, Chicago, Summer 2022",Yes,No,"Ernst & Young, Chicago, Summer 2022",No,,No,,"When the Covid-19 pandemic first began, I was ...",
4,4,Male,Baton Rouge,LA,70806,Yes,Campus/Temporary,Penn State University,Sophomore,Accounting,...,,,,,No,,No,,Community disruptions such as Covid-19 and oth...,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,200,Female,Brooklyn,NY,11210,Yes,Permanent,"University at Albany, SUNY",Junior,Business Administration,...,"JPMorgan Chase & Co., NYC",Yes,No,"JPMorgan Chase & Co., NYC",No,,No,,"My name is Wunmi Surakat, a current 1st semest...",
201,201,Female,Brooklyn,NY,11212,Yes,Permanent,Medgar Evers College,Junior,Accounting,...,,,,,No,,No,,No one expected the Covid- 19 pandemic. Since ...,
202,202,Female,Valdosta,GA,31601,Yes,Permanent,Valdosta State University,Junior,Finance,...,"J.P Morgan Summer 2022, Atlanta office, Summer...",Yes,No,"J.P. Morgan, Middle-Market Bank, Atlanta offic...",No,,No,,"During the COVID-19 pandemic, my entire life c...",
203,203,Female,Jacksonville,FL,32221,Yes,Campus/Temporary,Florida A&M University,Graduate Student,Business Administration,...,Deloitte LLP - Houston,Yes,No,Deloitte LLP - Houston,No,,No,,The Covid-19 pandemic is something that has af...,Yes


## Remove duplicate rows

In [4]:
# Normalise the essay text so that copies differing only in leading/trailing
# whitespace compare equal in the duplicate checks below.
# NOTE(review): the cast to str appears to turn missing essays into the literal
# text 'nan' — confirm that treating empty essays as duplicates of each other is intended.
essay_text = df[txt_col].astype(str)
df[txt_col] = essay_text.str.strip()

In [5]:
# Compare the number of applications with the number of distinct essays;
# the two only match when no essay appears more than once.
n_rows = len(df)
n_unq_paras = len(df[txt_col].unique())
has_no_duplicate_paras = n_unq_paras == n_rows

print(f"total number of applications: {n_rows}")
print(f"total number of UNIQUE paragraphs: {n_unq_paras}")
print(f"are there no duplicate paragraphs?: {has_no_duplicate_paras}")

total number of applications: 205
total number of UNIQUE paragraphs: 200
are there no duplicate paragraphs?: False


In [6]:
# Collect every essay text that occurs more than once (the counts are computed a single time)
para_counts = df[txt_col].value_counts()
duplicate_paras = para_counts[para_counts > 1].index

In [7]:
# Decide which copy of each duplicated essay to keep: the application with the fewest
# empty fields wins (ties go to the earliest row); every other copy is marked for removal.
app_ids_to_drop = []
app_ids_to_keep = []
for a_duplicate_para in duplicate_paras.values:
    # all applications that share this essay text
    dup_rows = df[df[txt_col] == a_duplicate_para]
    # Rank the copies by their number of missing fields. mergesort is stable, so copies
    # with the same count stay in their original order.
    n_missing = dup_rows.isnull().sum(axis=1).sort_values(kind='mergesort')
    ranked_app_ids = dup_rows.loc[n_missing.index, 'app_id'].tolist()
    app_ids_to_keep.append(ranked_app_ids[0])
    app_ids_to_drop.extend(ranked_app_ids[1:])
    print(f"will drop: {app_ids_to_drop}")
    print(f"will keep: {app_ids_to_keep}")

will drop: [177]
will keep: [176]
will drop: [177, 175]
will keep: [176, 174]
will drop: [177, 175, 204]
will keep: [176, 174, 148]
will drop: [177, 175, 204, 134]
will keep: [176, 174, 148, 133]
will drop: [177, 175, 204, 134, 89]
will keep: [176, 174, 148, 133, 88]


In [8]:
# Split the applications into the duplicates being removed and the rows being kept.
col_order = df.columns  # column order as it was before the split
is_dropped = df['app_id'].isin(app_ids_to_drop)  # True for rows marked as duplicates

# rows that are removed because they are duplicates
df_dups = df.loc[is_dropped, col_order].reset_index(drop=True)

# remaining applications, renumbered from 0
df = df.loc[~is_dropped, col_order].reset_index(drop=True)

In [9]:
# Verify that the duplicate drop worked: every remaining application must have a distinct essay.
n_rows = df.shape[0]
print(f"total number of applications: {n_rows}")
n_unq_paras = df[txt_col].drop_duplicates().shape[0]
print(f"total number of UNIQUE paragraphs: {n_unq_paras}")
has_no_duplicate_paras = n_rows == n_unq_paras
print(f"are there no duplicate paragraphs?: {has_no_duplicate_paras}")
# Stop here instead of letting later cells save a dataset that still contains duplicates.
assert has_no_duplicate_paras, "duplicate paragraphs remain after the drop"

total number of applications: 200
total number of UNIQUE paragraphs: 200
are there no duplicate paragraphs?: True


In [10]:
# Display the rows that were removed as duplicates.
df_dups

Unnamed: 0,app_id,Gender:* Required fields are indicated with red symbol Permanent Contact Information,City (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,State (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,Zip (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,"Are you Black? (includes African, African American, Caribbean, etc.)",Preferred Mailing Address,College/University:Academic Profile,Classification (as of January 2022):Academic Profile,Major:Academic Profile,...,"Please provide details (i.e. company name, location, etc.)",I have accepted an internship for the summer of 2022 (June - August),I have accepted an internship for the fall of 2022 (September- December),"Please provide details (i.e. company name, location, etc.).1",I have accepted a permanent job offer,"Please provide details (i.e. company name, location, etc.).2",Have you received a CPA Exam Review?,Which CPA Exam Review have you received?,Copy/ PasteÂ An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic.,Recipient
0,89,Female,Hockessin,DE,19707.0,Yes,Campus/Temporary,Goldey-Beacom College,Junior,Accounting,...,"WSFS Bank Summer Intern, PwC Start Intern",Yes,No,PwC Start Program Summer Intern Philadelphia L...,No,,No,,The challenges I had to overcome during the Co...,
1,134,Female,WALTHAM,MA,2453.0,Yes,Permanent,Bentley University,Graduate Student,Masters of Business Administration (MBA),...,"Spotify (Boston), PayPal (Boston), Zoom(Multip...",No,No,,No,,No,,"""Mirabel seemed to have life figured out."", ""A...",
2,175,Female,beckley,WV,25801.0,Yes,Campus/Temporary,Bluffton University,Junior,Accounting,...,Ernst and Young LLC,Yes,No,,No,,No,,Covid-19 gave the world an x-ray of the inequa...,
3,177,Male,Alexandria,VA,22309.0,Yes,Permanent,George Mason University,Senior,Accounting,...,PwC and KPMG,,,,No,,Yes,Surgent CPA Review and Becker CPA Review,My family was affected by the COVID pandemic. ...,
4,204,,,,,,,,,,...,,,,,,,,,,


In [11]:
# Save the rows that were dropped as duplicates.
# to_csv does not create missing folders, so make sure the output directory exists first.
Path(dropped_filename).parent.mkdir(parents=True, exist_ok=True)
df_dups.to_csv(dropped_filename, index=False)

In [12]:
# Display the applications that remain after the duplicate rows were removed.
df

Unnamed: 0,app_id,Gender:* Required fields are indicated with red symbol Permanent Contact Information,City (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,State (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,Zip (Permanent):* Required fields are indicated with red symbol Permanent Contact Information,"Are you Black? (includes African, African American, Caribbean, etc.)",Preferred Mailing Address,College/University:Academic Profile,Classification (as of January 2022):Academic Profile,Major:Academic Profile,...,"Please provide details (i.e. company name, location, etc.)",I have accepted an internship for the summer of 2022 (June - August),I have accepted an internship for the fall of 2022 (September- December),"Please provide details (i.e. company name, location, etc.).1",I have accepted a permanent job offer,"Please provide details (i.e. company name, location, etc.).2",Have you received a CPA Exam Review?,Which CPA Exam Review have you received?,Copy/ PasteÂ An Essay Response of 500 words or less (copy and paste in webform) using the following prompt: Community disruptions such as Covid-19 and other natural disasters can have deep lasting impacts. Discuss a challenge or barrier you have overcome during the Covid-19 pandemic.,Recipient
0,0,Male,Brooklyn,NY,11216,Yes,Permanent,Medgar Evers College,Senior,Accounting,...,,,,,No,,No,,During the early parts of 2020 one of the dead...,
1,1,Female,Bowie,MD,20720,Yes,Permanent,University of Maryland,Freshman,Business Management,...,Received internship as a Summer 2022 Discovery...,Yes,No,,No,,No,,One of the most challenging times has been dur...,
2,2,Female,Chicago Heights,IL,60411,Yes,Permanent,North Carolina A&T State University,Freshman,Accounting,...,,No,No,,No,,No,,When covid restrictions took place I was just ...,
3,3,Female,Chicago,IL,60620,Yes,Permanent,Loyola University Chicago,Sophomore,Accounting,...,"Ernst & Young, Chicago, Summer 2022",Yes,No,"Ernst & Young, Chicago, Summer 2022",No,,No,,"When the Covid-19 pandemic first began, I was ...",
4,4,Male,Baton Rouge,LA,70806,Yes,Campus/Temporary,Penn State University,Sophomore,Accounting,...,,,,,No,,No,,Community disruptions such as Covid-19 and oth...,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,199,Male,Middletown,DE,19709,Yes,Permanent,West Chester University,Graduate Student,Masters of Business Administration,...,"Ernst and Young, Philadelphia, Pa, Financial C...",Yes,No,"Ernst and Young, Philadelphia, Pa, Financial C...",No,,Yes,Becker,There are many issues that are impacting the a...,Yes
196,200,Female,Brooklyn,NY,11210,Yes,Permanent,"University at Albany, SUNY",Junior,Business Administration,...,"JPMorgan Chase & Co., NYC",Yes,No,"JPMorgan Chase & Co., NYC",No,,No,,"My name is Wunmi Surakat, a current 1st semest...",
197,201,Female,Brooklyn,NY,11212,Yes,Permanent,Medgar Evers College,Junior,Accounting,...,,,,,No,,No,,No one expected the Covid- 19 pandemic. Since ...,
198,202,Female,Valdosta,GA,31601,Yes,Permanent,Valdosta State University,Junior,Finance,...,"J.P Morgan Summer 2022, Atlanta office, Summer...",Yes,No,"J.P. Morgan, Middle-Market Bank, Atlanta offic...",No,,No,,"During the COVID-19 pandemic, my entire life c...",


In [13]:
# Save the de-duplicated applications.
# to_csv does not create missing folders, so make sure the output directory exists first.
Path(removed_duplicates_filename).parent.mkdir(parents=True, exist_ok=True)
df.to_csv(removed_duplicates_filename, index=False)