# This notebook is scraping data from:  
https://www.kaggle.com/datasets/jpmiller/employee-attrition-for-healthcare?resource=download

In [1]:
#Import necessities 
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager

In [2]:
# Initialize the Selenium web driver (Chrome in this case)
browser = webdriver.Chrome(service=Service(ChromeDriverManager().install()))

# scrape info website
url = "https://pmc.ncbi.nlm.nih.gov/articles/PMC10742910/#healthcare-11-03173-t001"
browser.get(url)
browser.maximize_window()

In [3]:
# Pause for 10 seconds to allow the page to load
time.sleep(10)

In [4]:
# Wait for the page to load and locate the table
table = browser.find_element(By.XPATH, '//*[@id="healthcare-11-03173-t001"]')

# Extract headers
headers = [header.text for header in table.find_elements(By.TAG_NAME, 'th')]

# Extract rows
rows = []
for row in table.find_elements(By.TAG_NAME, 'tr'):
    cells = [cell.text for cell in row.find_elements(By.TAG_NAME, 'td')]
    if cells:  # Avoid empty rows
        rows.append(cells)

# Create DataFrame
df = pd.DataFrame(rows, columns=headers)

# Display the DataFrame
display(df)




Unnamed: 0,Feature Name,Data Type,Description
0,Turnover (Dependent Variable),Categorical,Outcome feature: showing whether the nurse lef...
1,Certificate,Categorical,Type of active certification (three-factor lev...
2,Region,Categorical,Location of primary nursing position-census di...
3,Job_Satisfaction,Categorical,Levels of job satisfaction in primary nursing ...
4,Race,Categorical,Race (White vs. other race (Black or African A...
5,Sex,Categorical,Sex (Male vs. Female)
6,Marital_Status,Categorical,"Marital Status (Single vs. Married): widow, di..."
7,Veteran,Categorical,Veteran Status (Served vs. Never served): acti...
8,Household_Income,Categorical,Pre-tax annual household income (three-factor ...
9,Degree,Categorical,Type of nursing degree: three-factor levels (A...


In [5]:
# Locate Table 3
table = browser.find_element(By.XPATH, '//*[@id="healthcare-11-03173-t003"]')

# Extract headers
headers = [header.text for header in table.find_elements(By.TAG_NAME, 'th')][:5]
print("Headers:", headers)  # Debug: Check headers

# Extract rows and keep only columns 0 to 4
rows = []
for row in table.find_elements(By.TAG_NAME, 'tr'):
    cells = [cell.text for cell in row.find_elements(By.TAG_NAME, 'td')][:5]
    if cells:  # Avoid empty rows
        print("Row:", cells)  # Debug: Check rows
        rows.append(cells)

# Ensure the number of headers matches the number of columns in rows
if len(headers) == len(rows[0]):
    df = pd.DataFrame(rows, columns=headers)
else:
    df = pd.DataFrame(rows)  # Create DataFrame without headers if mismatch

# Display the DataFrame
print(df)


Headers: ['Original Data', 'SMOTE', 'Characteristic', 'Turnover', 'Turnover']
Row: ['Certificate', '', '', '', '']
Row: [' Other', '443', '9.37%', '2748', '7.01%']
Row: [' NP', '2173', '45.96%', '19,382', '49.43%']
Row: [' RN', '2112', '44.67%', '17,079', '43.56%']
Row: ['Region', '', '', '', '']
Row: [' Midwest', '1059', '22.40%', '8950', '22.83%']
Row: [' North', '893', '18.89%', '7227', '18.43%']
Row: [' South', '1574', '33.29%', '13,084', '33.37%']
Row: [' West', '1202', '25.42%', '9948', '25.37%']
Row: ['Job_Satisfaction', '', '', '', '']
Row: [' Dissatisfied', '462', '9.77%', '3867', '9.86%']
Row: [' Satisfied', '4266', '90.23%', '35,342', '90.14%']
Row: ['Race', '', '', '', '']
Row: [' Other Race', '638', '13.49%', '5686', '14.50%']
Row: [' White', '4090', '86.51%', '33,523', '85.50%']
Row: ['Sex', '', '', '', '']
Row: [' Female', '4307', '91.10%', '35,847', '91.43%']
Row: [' Male', '421', '8.90%', '3362', '8.57%']
Row: ['Marital Status', '', '', '', '']
Row: [' Married', '3548'

In [6]:
# Locate Table 3
table = browser.find_element(By.XPATH, '//*[@id="healthcare-11-03173-t003"]')

# Extract headers
headers = [header.text for header in table.find_elements(By.TAG_NAME, 'th')][:5]
print("Headers:", headers)  # Debug: Check headers

# Custom headers
custom_headers = ["Attrition", "Yes/Count", "Yes/Percentage", "No/Count", "No/Percentage"]

# Extract rows and keep only columns 0 to 4
rows = []
for row in table.find_elements(By.TAG_NAME, 'tr'):
    cells = [cell.text for cell in row.find_elements(By.TAG_NAME, 'td')][:5]
    if cells:  # Avoid empty rows
        print("Row:", cells)  # Debug: Check rows
        rows.append(cells)

# Ensure the number of headers matches the number of columns in rows
if len(custom_headers) == len(rows[0]):
    df = pd.DataFrame(rows, columns=custom_headers)
else:
    df = pd.DataFrame(rows, columns=custom_headers[:len(rows[0])])  # Adjust columns if mismatch

# Display the DataFrame


Headers: ['Original Data', 'SMOTE', 'Characteristic', 'Turnover', 'Turnover']
Row: ['Certificate', '', '', '', '']
Row: [' Other', '443', '9.37%', '2748', '7.01%']
Row: [' NP', '2173', '45.96%', '19,382', '49.43%']
Row: [' RN', '2112', '44.67%', '17,079', '43.56%']
Row: ['Region', '', '', '', '']
Row: [' Midwest', '1059', '22.40%', '8950', '22.83%']
Row: [' North', '893', '18.89%', '7227', '18.43%']
Row: [' South', '1574', '33.29%', '13,084', '33.37%']
Row: [' West', '1202', '25.42%', '9948', '25.37%']
Row: ['Job_Satisfaction', '', '', '', '']
Row: [' Dissatisfied', '462', '9.77%', '3867', '9.86%']
Row: [' Satisfied', '4266', '90.23%', '35,342', '90.14%']
Row: ['Race', '', '', '', '']
Row: [' Other Race', '638', '13.49%', '5686', '14.50%']
Row: [' White', '4090', '86.51%', '33,523', '85.50%']
Row: ['Sex', '', '', '', '']
Row: [' Female', '4307', '91.10%', '35,847', '91.43%']
Row: [' Male', '421', '8.90%', '3362', '8.57%']
Row: ['Marital Status', '', '', '', '']
Row: [' Married', '3548'

In [67]:
display(df.head(5))

Unnamed: 0,Attrition,Yes/Count,Yes/Percentage,No/Count,No/Percentage
0,Certificate,,,,
1,Other,443.0,9.37%,2748.0,7.01%
2,NP,2173.0,45.96%,19382.0,49.43%
3,RN,2112.0,44.67%,17079.0,43.56%
4,Region,,,,


In [8]:
# Transpose the DataFrame
transposed_df = df.transpose()

# Resetting the index so that the transposed DataFrame has a clean column structure
transposed_df.reset_index(drop=True, inplace=True)

# Display the transposed DataFrame
display(transposed_df)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,48,49,50,51,52,53,54,55,56,57
0,Certificate,Other,NP,RN,Region,Midwest,North,South,West,Job_Satisfaction,...,Inpatient/Other,Practice,No,Yes,Working Hour,Standard,Overtime,Numerical Variables,Age,Individual Income
1,,443,2173,2112,,1059,893,1574,1202,,...,1062,,1003,3725,,3197,1531,Average,55,70285
2,,9.37%,45.96%,44.67%,,22.40%,18.89%,33.29%,25.42%,,...,22.46%,,21.21%,78.79%,,67.62%,32.38%,Std.dev,11,41404
3,,2748,19382,17079,,8950,7227,13084,9948,,...,8548,,8512,30697,,27153,12056,Average,48,85444
4,,7.01%,49.43%,43.56%,,22.83%,18.43%,33.37%,25.37%,,...,21.80%,,21.71%,78.29%,,69.25%,30.75%,Std.dev,12,37157


In [9]:
# Assuming the first two rows contain information for columns
transposed_df.columns = transposed_df.iloc[0]  # Set the first row as the header
transposed_df = transposed_df[1:].reset_index(drop=True)  # Drop the original header row and reset index

# Display the DataFrame with updated column names
display(transposed_df.head())


Unnamed: 0,Certificate,Other,NP,RN,Region,Midwest,North,South,West,Job_Satisfaction,...,Inpatient/Other,Practice,No,Yes,Working Hour,Standard,Overtime,Numerical Variables,Age,Individual Income
0,,443,2173,2112,,1059,893,1574,1202,,...,1062,,1003,3725,,3197,1531,Average,55,70285
1,,9.37%,45.96%,44.67%,,22.40%,18.89%,33.29%,25.42%,,...,22.46%,,21.21%,78.79%,,67.62%,32.38%,Std.dev,11,41404
2,,2748,19382,17079,,8950,7227,13084,9948,,...,8548,,8512,30697,,27153,12056,Average,48,85444
3,,7.01%,49.43%,43.56%,,22.83%,18.43%,33.37%,25.37%,,...,21.80%,,21.71%,78.29%,,69.25%,30.75%,Std.dev,12,37157


In [10]:
transposed_df.to_csv("raw_webscraped.csv", index=False, encoding="utf-8")

In [11]:
import pandas as pd

# Preview the file to check its structure
webscraped_df = pd.read_csv('raw_webscraped.csv') 
display(webscraped_df.head(10))


Unnamed: 0,Certificate,Other,NP,RN,Region,Midwest,North,South,West,Job_Satisfaction,...,Inpatient/Other,Practice,No.2,Yes.2,Working Hour,Standard,Overtime,Numerical Variables,Age,Individual Income
0,,443,2173,2112,,1059,893,1574,1202,,...,1062,,1003,3725,,3197,1531,Average,55,70285
1,,9.37%,45.96%,44.67%,,22.40%,18.89%,33.29%,25.42%,,...,22.46%,,21.21%,78.79%,,67.62%,32.38%,Std.dev,11,41404
2,,2748,19382,17079,,8950,7227,13084,9948,,...,8548,,8512,30697,,27153,12056,Average,48,85444
3,,7.01%,49.43%,43.56%,,22.83%,18.43%,33.37%,25.37%,,...,21.80%,,21.71%,78.29%,,69.25%,30.75%,Std.dev,12,37157


In [12]:
# Only keeping origial data and updating index names
# Assuming df is your original DataFrame

# Drop indexes 4-7 from the transposed DataFrame
webscraped_df = webscraped_df.drop(webscraped_df.index[4:8])

# Rename the remaining indexes
index_labels = ["Yes/Count", "Yes/%", "No/Count", "No/%"]
webscraped_df.index = index_labels

# Display the updated DataFrame
print("Updated webscraped DataFrame:")
display(webscraped_df)

Updated webscraped DataFrame:


Unnamed: 0,Certificate,Other,NP,RN,Region,Midwest,North,South,West,Job_Satisfaction,...,Inpatient/Other,Practice,No.2,Yes.2,Working Hour,Standard,Overtime,Numerical Variables,Age,Individual Income
Yes/Count,,443,2173,2112,,1059,893,1574,1202,,...,1062,,1003,3725,,3197,1531,Average,55,70285
Yes/%,,9.37%,45.96%,44.67%,,22.40%,18.89%,33.29%,25.42%,,...,22.46%,,21.21%,78.79%,,67.62%,32.38%,Std.dev,11,41404
No/Count,,2748,19382,17079,,8950,7227,13084,9948,,...,8548,,8512,30697,,27153,12056,Average,48,85444
No/%,,7.01%,49.43%,43.56%,,22.83%,18.43%,33.37%,25.37%,,...,21.80%,,21.71%,78.29%,,69.25%,30.75%,Std.dev,12,37157


In [13]:
# Remove spaces and convert to lowercase (if needed)
webscraped_df.columns = webscraped_df.columns.str.strip().str.lower()
# Print the current column names to verify them
print(webscraped_df.columns)

Index(['certificate', 'other', 'np', 'rn', 'region', 'midwest', 'north',
       'south', 'west', 'job_satisfaction', 'dissatisfied', 'satisfied',
       'race', 'other race', 'white', 'sex', 'female', 'male',
       'marital status', 'married', 'single', 'veteran', 'never served',
       'served', 'household_income', 'less than $75,000',
       '$75,001 to $150,000', 'more than $150,001', 'degree', 'adn', 'bsn',
       'msn', 'phd/dnp/dn', 'dependant < 6 years', 'no', 'yes',
       'ehr_emr usability', 'no.1', 'yes.1', 'employment_type',
       'employed by organization', 'other.1', 'job_type', 'full time',
       'part time', 'employment_setting', 'clinical/ambulatory', 'hospital',
       'inpatient/other', 'practice', 'no.2', 'yes.2', 'working hour',
       'standard', 'overtime', 'numerical variables', 'age',
       'individual income'],
      dtype='object')


In [14]:
# First, normalize column names to avoid issues with spaces and cases
webscraped_df.columns = webscraped_df.columns.str.strip().str.lower()

# Dictionary for specific columns to rename
rename_dict = {
    'other': 'certificate_other', 
    'np': 'certificate_np', 
    'rn': 'certificate_rn', 
    'dissatisfied': 'job_dissatisfied', 
    'satisfied': 'job_satisfied',
    'no.2': 'practice_no.2',
    'yes.2': 'practice_yes.2'
}

# Update column names based on the specific dictionary
webscraped_df.rename(columns=rename_dict, inplace=True)

# Automatically prefix 'degree_' to relevant columns
webscraped_df.columns = [
    f"degree_{col}" if col in ['adn', 'bsn', 'msn', 'phd/dnp/dn'] else col 
    for col in webscraped_df.columns
]

# Print the updated columns to verify the changes
print("Updated Columns:", webscraped_df.columns)
display(webscraped_df)

Updated Columns: Index(['certificate', 'certificate_other', 'certificate_np', 'certificate_rn',
       'region', 'midwest', 'north', 'south', 'west', 'job_satisfaction',
       'job_dissatisfied', 'job_satisfied', 'race', 'other race', 'white',
       'sex', 'female', 'male', 'marital status', 'married', 'single',
       'veteran', 'never served', 'served', 'household_income',
       'less than $75,000', '$75,001 to $150,000', 'more than $150,001',
       'degree', 'degree_adn', 'degree_bsn', 'degree_msn', 'degree_phd/dnp/dn',
       'dependant < 6 years', 'no', 'yes', 'ehr_emr usability', 'no.1',
       'yes.1', 'employment_type', 'employed by organization', 'other.1',
       'job_type', 'full time', 'part time', 'employment_setting',
       'clinical/ambulatory', 'hospital', 'inpatient/other', 'practice',
       'practice_no.2', 'practice_yes.2', 'working hour', 'standard',
       'overtime', 'numerical variables', 'age', 'individual income'],
      dtype='object')


Unnamed: 0,certificate,certificate_other,certificate_np,certificate_rn,region,midwest,north,south,west,job_satisfaction,...,inpatient/other,practice,practice_no.2,practice_yes.2,working hour,standard,overtime,numerical variables,age,individual income
Yes/Count,,443,2173,2112,,1059,893,1574,1202,,...,1062,,1003,3725,,3197,1531,Average,55,70285
Yes/%,,9.37%,45.96%,44.67%,,22.40%,18.89%,33.29%,25.42%,,...,22.46%,,21.21%,78.79%,,67.62%,32.38%,Std.dev,11,41404
No/Count,,2748,19382,17079,,8950,7227,13084,9948,,...,8548,,8512,30697,,27153,12056,Average,48,85444
No/%,,7.01%,49.43%,43.56%,,22.83%,18.43%,33.37%,25.37%,,...,21.80%,,21.71%,78.29%,,69.25%,30.75%,Std.dev,12,37157


In [15]:
# Drop columns that contain any NaN values
webscraped_df.dropna(axis=1, inplace=True)
# Print the updated columns to verify the changes
print("Updated Columns:", webscraped_df.columns)
display(webscraped_df)

Updated Columns: Index(['certificate_other', 'certificate_np', 'certificate_rn', 'midwest',
       'north', 'south', 'west', 'job_dissatisfied', 'job_satisfied',
       'other race', 'white', 'female', 'male', 'married', 'single',
       'never served', 'served', 'less than $75,000', '$75,001 to $150,000',
       'more than $150,001', 'degree_adn', 'degree_bsn', 'degree_msn',
       'degree_phd/dnp/dn', 'no', 'yes', 'no.1', 'yes.1',
       'employed by organization', 'other.1', 'full time', 'part time',
       'clinical/ambulatory', 'hospital', 'inpatient/other', 'practice_no.2',
       'practice_yes.2', 'standard', 'overtime', 'numerical variables', 'age',
       'individual income'],
      dtype='object')


Unnamed: 0,certificate_other,certificate_np,certificate_rn,midwest,north,south,west,job_dissatisfied,job_satisfied,other race,...,clinical/ambulatory,hospital,inpatient/other,practice_no.2,practice_yes.2,standard,overtime,numerical variables,age,individual income
Yes/Count,443,2173,2112,1059,893,1574,1202,462,4266,638,...,1608,2058,1062,1003,3725,3197,1531,Average,55,70285
Yes/%,9.37%,45.96%,44.67%,22.40%,18.89%,33.29%,25.42%,9.77%,90.23%,13.49%,...,34.01%,43.53%,22.46%,21.21%,78.79%,67.62%,32.38%,Std.dev,11,41404
No/Count,2748,19382,17079,8950,7227,13084,9948,3867,35342,5686,...,13110,17551,8548,8512,30697,27153,12056,Average,48,85444
No/%,7.01%,49.43%,43.56%,22.83%,18.43%,33.37%,25.37%,9.86%,90.14%,14.50%,...,33.44%,44.76%,21.80%,21.71%,78.29%,69.25%,30.75%,Std.dev,12,37157


In [16]:
webscraped_df.dtypes

certificate_other           object
certificate_np              object
certificate_rn              object
midwest                     object
north                       object
south                       object
west                        object
job_dissatisfied            object
job_satisfied               object
other race                  object
white                       object
female                      object
male                        object
married                     object
single                      object
never served                object
served                      object
less than $75,000           object
$75,001 to $150,000         object
more than $150,001          object
degree_adn                  object
degree_bsn                  object
degree_msn                  object
degree_phd/dnp/dn           object
no                          object
yes                         object
no.1                        object
yes.1                       object
employed by organiza

In [17]:
# Function to convert percentage strings to decimals
def percent_to_decimal(value):
    if isinstance(value, str) and '%' in value:
        return float(value.replace('%', '')) / 100
    return value

# Apply the function to every cell using .map() on each column
webscraped_df = webscraped_df.apply(lambda col: col.map(percent_to_decimal))

display(webscraped_df)

Unnamed: 0,certificate_other,certificate_np,certificate_rn,midwest,north,south,west,job_dissatisfied,job_satisfied,other race,...,clinical/ambulatory,hospital,inpatient/other,practice_no.2,practice_yes.2,standard,overtime,numerical variables,age,individual income
Yes/Count,443.0,2173.0,2112.0,1059.0,893.0,1574.0,1202.0,462.0,4266.0,638.0,...,1608.0,2058.0,1062.0,1003.0,3725.0,3197.0,1531.0,Average,55,70285
Yes/%,0.0937,0.4596,0.4467,0.224,0.1889,0.3329,0.2542,0.0977,0.9023,0.1349,...,0.3401,0.4353,0.2246,0.2121,0.7879,0.6762,0.3238,Std.dev,11,41404
No/Count,2748.0,19382.0,17079.0,8950.0,7227.0,13084.0,9948.0,3867.0,35342.0,5686.0,...,13110.0,17551.0,8548.0,8512.0,30697.0,27153.0,12056.0,Average,48,85444
No/%,0.0701,0.4943,0.4356,0.2283,0.1843,0.3337,0.2537,0.0986,0.9014,0.145,...,0.3344,0.4476,0.218,0.2171,0.7829,0.6925,0.3075,Std.dev,12,37157


In [18]:
# Drop columns and rename them to match other dataframe
columns_to_keep = ['age', 'job_dissatisfied', 'job_satisfied', 'female', 'male', 'married', 'single', 'overtime', 'individual income']
# Drop all other columns
webscraped_df = webscraped_df[columns_to_keep]

display(webscraped_df)

Unnamed: 0,age,job_dissatisfied,job_satisfied,female,male,married,single,overtime,individual income
Yes/Count,55,462.0,4266.0,4307.0,421.0,3548.0,1180.0,1531.0,70285
Yes/%,11,0.0977,0.9023,0.911,0.089,0.7504,0.2496,0.3238,41404
No/Count,48,3867.0,35342.0,35847.0,3362.0,29490.0,9719.0,12056.0,85444
No/%,12,0.0986,0.9014,0.9143,0.0857,0.7521,0.2479,0.3075,37157


In [19]:
webscraped_df.dtypes

age                   int64
job_dissatisfied     object
job_satisfied        object
female               object
male                 object
married              object
single               object
overtime             object
individual income    object
dtype: object

In [20]:
webscraped_df.to_csv("clean_webscraped.csv", index=False, encoding="utf-8")