In [18]:
import pandas as pd
import os

# Data Recruitment

## Ingesting Data

In [19]:
# Creating filepath in a variable
req_path = 'data/data_reqruitment/data_requirements.csv'

# Reading csv file
df_recruitments_orignal = pd.read_csv(req_path)

# Creating copy to not edit the original
df_recruitments = df_recruitments_orignal.copy()
df_recruitments.drop(columns='Unnamed: 0', inplace=True)
df_recruitments.head()

Unnamed: 0,company,company_rating,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,company_founded,company_revenue,dates
0,PCS Global Tech\n4.7,4.7,"Riverside, CA",Data Engineer | PAID BOOTCAMP,Responsibilities\n· Analyze and organize raw d...,"$70,000 /yr (est.)",501 to 1000 Employees,Company - Private,Information Technology,Information Technology Support Services,,Unknown / Non-Applicable,2024-06-12 00:00:00-10:00
1,Futuretech Consultants LLC,,"Newton, MS",Snowflake Data Engineer,My name is Dileep and I am a recruiter at Futu...,$42.50 /hr (est.),,,,,,,2024-06-12 00:00:00+07:00
2,Clairvoyant\n4.4,4.4,Remote,Data Engineer (MDM),Required Skills:\nMust have 5-8+ Years of expe...,$67.50 /hr (est.),51 to 200 Employees,Company - Private,Pharmaceutical & Biotechnology,Biotech & Pharmaceuticals,,Unknown / Non-Applicable,2024-06-12 00:00:00-10:00
3,Apple\n4.2,4.2,"Cupertino, CA",Data Engineer,"Summary\nPosted: Dec 22, 2021\nWeekly Hours: 4...",,10000+ Employees,Company - Public,Information Technology,Computer Hardware Development,1976.0,$10+ billion (USD),2024-06-12 00:00:00-05:00
4,Skytech Consultancy Services\n5.0,5.0,"Baltimore, MD",Data Engineer,Description of Work:\nTechnical experience in ...,$65.00 /hr (est.),1 to 50 Employees,Company - Public,,,,Unknown / Non-Applicable,2024-06-12 00:00:00-04:00


## Cleaning & Transforming Data

In [20]:
# Looking how the data shape (null count, data types, columns)
df_recruitments.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1555 entries, 0 to 1554
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   company           1551 non-null   object 
 1   company_rating    1358 non-null   float64
 2   location          1554 non-null   object 
 3   job_title         1554 non-null   object 
 4   job_description   1554 non-null   object 
 5   salary_estimate   1277 non-null   object 
 6   company_size      1442 non-null   object 
 7   company_type      1442 non-null   object 
 8   company_sector    1260 non-null   object 
 9   company_industry  1260 non-null   object 
 10  company_founded   1131 non-null   float64
 11  company_revenue   1442 non-null   object 
 12  dates             1555 non-null   object 
dtypes: float64(2), object(11)
memory usage: 158.1+ KB


In [21]:
# Ada beberapa error dalam kolom 'company' yang memasukan 'company rating'

# Regex untuk mendapatkan tiap elemen selain '\n'
clenased_company = df_recruitments['company'].str.findall(r'[a-zA-z0-9 ]+')

# Mengambil nama 'company' dengan asumsi index 0
def getFirstItem(item):
    try:
        fullName = str(item[0])
        return fullName
    except:
        return None

# apply dan menggantikannya kepada kolom 'company'
df_recruitments['company'] = clenased_company.apply(getFirstItem).astype('string')

In [22]:
# creating a 'numeric_salary' column for analysis
df_recruitments['numeric_salary'] = df_recruitments['salary_estimate'].str.replace(',', '').str.findall('[0-9.]+').apply(getFirstItem).astype(float)

In [23]:
# creating a 'salary_type' column to get categorical values
slrType = df_recruitments['salary_estimate'].str.findall('/[a-zA-Z]+')

def salaryType(salType):
    salary = getFirstItem(salType)

    if salary == '/yr':
        return 'yearly'
    elif salary == '/hr':
        return 'hourly'
    elif salary == '/mo':
        return 'monthly'
    else:
        return None

df_recruitments['salary_type'] = slrType.apply(salaryType).astype('string')

In [24]:
# creating a 'monthly_salary' column for easier salary comparison in analysis

def getMonthlySalary(x):
    try:
        if x['salary_type'] == 'yearly':
            salary = x['numeric_salary']/12
            salary = round(salary, 2)
            return salary
        elif x['salary_type'] == 'hourly':
            salary = x['numeric_salary'] * 173.2
            salary = round(salary, 2)
            return salary
        elif x['salary_type'] == 'monthly':
            return x
        else:
            return None
    except:
        return None


df_recruitments['monthly_salary'] = df_recruitments.apply(getMonthlySalary, axis=1)

In [25]:
# Drop values yang tidak memiliki kolom 'company'

df_recruitments.dropna(subset='company', inplace=True)

In [26]:
# Menjadikan data menjadi UTC

df_recruitments['dates'] = pd.to_datetime(df_recruitments['dates'], utc=True)

## Data Demography

Dari data yang kita sudah bersihkan diatas, kita menemukan beberapa kolom categorical yang kita bisa analisis dengan grouping yaitu `company_type`, `company_sector`, `company_industry`. Lalu kita juga telah membersihkan kolom `company_rating` dan membentuk `monthly_salary` untuk values yang kita bisa lihat dari analisis.

In [27]:
# Melihat bentuk data secara general

df_recruitments.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1551 entries, 0 to 1554
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype              
---  ------            --------------  -----              
 0   company           1551 non-null   string             
 1   company_rating    1358 non-null   float64            
 2   location          1551 non-null   object             
 3   job_title         1551 non-null   object             
 4   job_description   1551 non-null   object             
 5   salary_estimate   1274 non-null   object             
 6   company_size      1439 non-null   object             
 7   company_type      1439 non-null   object             
 8   company_sector    1257 non-null   object             
 9   company_industry  1257 non-null   object             
 10  company_founded   1128 non-null   float64            
 11  company_revenue   1439 non-null   object             
 12  dates             1551 non-null   datetime64[ns, UTC]
 13  numeric_

In [28]:
# Menunjukan masing-masing columns
df_recruitments.columns

Index(['company', 'company_rating', 'location', 'job_title', 'job_description',
       'salary_estimate', 'company_size', 'company_type', 'company_sector',
       'company_industry', 'company_founded', 'company_revenue', 'dates',
       'numeric_salary', 'salary_type', 'monthly_salary'],
      dtype='object')

Mencari unique values untuk masing-masing categorical variables

In [29]:
df_recruitments['company_type'].unique()

array(['Company - Private', nan, 'Company - Public', 'Hospital',
       'Contract', 'Subsidiary or Business Segment',
       'Nonprofit Organization', 'Private Practice / Firm', 'Government',
       'Self-employed', 'College / University', 'Unknown',
       'School / School District'], dtype=object)

In [30]:
df_recruitments['company_sector'].unique()

array(['Information Technology', nan, 'Pharmaceutical & Biotechnology',
       'Management & Consulting', 'Healthcare',
       'Human Resources & Staffing', 'Manufacturing',
       'Financial Services', 'Hotels & Travel Accommodation',
       'Transportation & Logistics', 'Retail & Wholesale',
       'Arts, Entertainment & Recreation',
       'Government & Public Administration', 'Aerospace & Defense',
       'Personal Consumer Services', 'Media & Communication',
       'Energy, Mining & Utilities', 'Insurance',
       'Construction, Repair & Maintenance Services', 'Agriculture',
       'Education', 'Real Estate', 'Legal', 'Restaurants & Food Service',
       'Nonprofit & NGO', 'Telecommunications'], dtype=object)

In [31]:
print(df_recruitments['company_industry'].nunique())
# memiliki 83 unique values

df_recruitments['company_industry'].unique()

83


array(['Information Technology Support Services', nan,
       'Biotech & Pharmaceuticals', 'Computer Hardware Development',
       'Research & Development', 'Health Care Services & Hospitals',
       'Internet & Web Services', 'HR Consulting',
       'Consumer Product Manufacturing',
       'Enterprise Software & Network Solutions',
       'Investment & Asset Management', 'Accounting & Tax',
       'Business Consulting', 'Financial Transaction Processing',
       'Staffing & Subcontracting', 'Banking & Lending',
       'Travel Agencies', 'Airlines, Airports & Air Transportation',
       'Automotive Parts & Accessories Stores', 'Sports & Recreation',
       'National Agencies', 'Aerospace & Defense', 'Software Development',
       'Beauty & Wellness', 'Publishing', 'Food & Beverage Manufacturing',
       'Energy & Utilities', 'Advertising & Public Relations',
       'Insurance Carriers', 'Wholesale',
       'Department, Clothing & Shoe Stores', 'Machinery Manufacturing',
       'Archite

Mencari rata-rata untuk Number variables. (int & float)

In [32]:
# Untuk company rating
print(df_recruitments['company_rating'].mean())

3.925625920471281


In [33]:
# Max dan Mean untuk salary
#print(df_recruitments['monthly_salary'].max())
#$print(df_recruitments['monthly_salary'].mean())

menggunaka describe untuk analisis kolom secara general.

In [34]:
# Untuk analisis kolom numerical
df_recruitments.describe()

Unnamed: 0,company_rating,company_founded,numeric_salary
count,1358.0,1128.0,1274.0
mean,3.925626,1975.929078,98180.690424
std,0.535826,51.530079,50629.066214
min,1.0,1636.0,22.5
25%,3.6,1964.75,84366.0
50%,3.9,1997.0,102802.0
75%,4.2,2009.0,125000.0
max,5.0,2022.0,341000.0


In [35]:
# Untuk analisis kolom dengan datatype object
df_recruitments.describe(include=object)

Unnamed: 0,location,job_title,job_description,salary_estimate,company_size,company_type,company_sector,company_industry,company_revenue,monthly_salary
count,1551,1551,1551,1274,1439,1439,1257,1257,1439,1274.0
unique,418,780,1549,953,8,12,25,83,10,954.0
top,Remote,Data Engineer,Looking for a Data Logging Engineer\nWhat you ...,"$120,000 /yr (est.)",10000+ Employees,Company - Private,Information Technology,Information Technology Support Services,Unknown / Non-Applicable,10000.0
freq,234,356,2,17,433,726,507,215,551,17.0


### Company Rating 
Melakukan agregasi nilai rating, dengan mengroup kolom kategorical kita 'company_type', 'company_industry', dan 'company_sector'.

In [36]:
df_recruitments.groupby('company_type')['company_rating'].agg(['mean', 'min', 'max', 'count']).sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,mean,min,max,count
company_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
College / University,4.193333,3.9,4.5,15
Private Practice / Firm,4.03,3.4,4.5,10
Unknown,4.0,3.1,4.6,8
Company - Private,3.988143,2.0,5.0,700
Company - Public,3.891813,1.3,5.0,513
Nonprofit Organization,3.729545,2.2,4.6,44
Subsidiary or Business Segment,3.721875,2.9,4.7,32
School / School District,3.7,3.7,3.7,1
Contract,3.666667,1.0,5.0,12
Government,3.48,2.9,4.1,15


In [37]:
df_recruitments.groupby('company_sector')['company_rating'].agg(['mean', 'min', 'max', 'count']).sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,mean,min,max,count
company_sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Human Resources & Staffing,4.192857,3.4,5.0,28
Hotels & Travel Accommodation,4.075,3.3,4.5,4
Information Technology,4.05,1.3,5.0,500
"Arts, Entertainment & Recreation",4.046154,3.1,5.0,13
Aerospace & Defense,3.997674,3.5,4.8,43
Education,3.921429,3.0,4.5,28
Real Estate,3.91,3.5,4.7,10
Pharmaceutical & Biotechnology,3.85,2.8,4.6,28
Management & Consulting,3.85,2.4,4.8,72
Financial Services,3.847458,2.6,5.0,118


In [38]:
df_recruitments.groupby(['company_sector','company_industry'])['company_rating'].agg(['mean', 'min', 'max', 'count']).sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max,count
company_sector,company_industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Retail & Wholesale,Consumer Electronics & Appliances Stores,4.500000,4.5,4.5,1
Retail & Wholesale,Automotive Parts & Accessories Stores,4.333333,3.9,4.8,3
"Arts, Entertainment & Recreation",Gambling,4.300000,3.9,5.0,3
Human Resources & Staffing,Staffing & Subcontracting,4.228571,3.6,4.5,7
Media & Communication,Publishing,4.200000,4.1,4.3,2
...,...,...,...,...,...
Healthcare,Medical Testing & Clinical Laboratories,3.100000,3.1,3.1,1
Nonprofit & NGO,Religious Institutions,3.100000,3.1,3.1,1
Management & Consulting,Building & Personnel Services,3.100000,2.4,3.8,2
Retail & Wholesale,Other Retail Stores,2.900000,2.7,3.1,3


In [39]:
# Memastikan bahwa count dari 'company_industry' memiliki minimum count > 5
rec_industry = df_recruitments.groupby(['company_sector','company_industry'])['company_rating'].agg(['mean', 'min', 'max', 'count']).sort_values(by='mean', ascending=False)
rec_industry[rec_industry['count'] > 5]

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max,count
company_sector,company_industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Human Resources & Staffing,Staffing & Subcontracting,4.228571,3.6,4.5,7
Human Resources & Staffing,HR Consulting,4.180952,3.4,5.0,21
"Arts, Entertainment & Recreation",Sports & Recreation,4.157143,3.4,4.9,7
Information Technology,Enterprise Software & Network Solutions,4.150962,2.9,5.0,104
Education,Colleges & Universities,4.135294,3.6,4.5,17
Information Technology,Computer Hardware Development,4.098765,2.7,5.0,81
Information Technology,Software Development,4.080952,1.3,5.0,42
Government & Public Administration,National Agencies,4.069231,3.0,4.9,13
Information Technology,Information Technology Support Services,4.034597,2.0,5.0,211
Aerospace & Defense,Aerospace & Defense,3.997674,3.5,4.8,43


Dari hasil fungsi agregasi diatas, kita bisa melihat bahwa melalui kategori `company_type` yang memiliki nilai rata-rata rating yang paling tinggi merupakan kelompok 'College/Unviersity'. Namun perlu dilihat bahwa count dari kategori tersebut bisa dibilang sedikit. Perlu dilihat 'Company - Private' dan 'Company - Public' memiliki rating yang yang relatif tinggi 3.9 dan 3.8 dengan jumlah count yang banyak (700 & 500).

Bila kita mengagregasi dengan kolom kategori `company_sector`, kita bisa melihat bahwa kategori 'Human resources & Staffing' merupakan kategori dengan nilai rating tertinggi. Namun, sekali lagi terdapat kategori yang memiliki jumlah yang banyak (500) dan memiliki angka signifikan (4.0) yaitu 'Information Technology'.

Hal ini konsisten dengan `company_industry` sebagai sub-kategori dari `company_sector`. Walaupun bila di agregat secara langsung kita menumukan rata-rata tertinggi berada di kategori 'consumer electronics' & 'automotive store', mereka memiliki jumlah yang sedikit (kurang dari lima). Ketika kita melakukan filtering, baru hasil konsisten dengan dipuncaki `company_industry` yang memiliki kategori `company_sector` 'Human Resources & Staffing` dan `Information Technology`.

### Monthy Salary

Melakukan agregasi nilai 'monthly_salary', dengan mengroup dalam kolom kategorikal kita 'company_type', 'company_industry', dan 'company_sector'.

In [40]:
df_recruitments.groupby('company_type')['monthly_salary'].agg(['mean', 'min', 'max', 'count']).sort_values(by='mean', ascending=False)

TypeError: agg function failed [how->mean,dtype->object]

In [23]:
df_recruitments.groupby('company_sector')['monthly_salary'].agg(['mean', 'min', 'max', 'count']).sort_values(by='mean', ascending=False)

Unnamed: 0_level_0,mean,min,max,count
company_sector,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Telecommunications,11864.648889,8071.33,16550.0,9
Hotels & Travel Accommodation,11239.585,8500.0,12291.67,4
Information Technology,10484.207359,3897.0,28416.67,390
Human Resources & Staffing,10176.908276,4760.42,16666.67,29
Management & Consulting,10114.562985,4330.0,20208.33,67
Media & Communication,9926.274815,6188.17,14787.5,27
Pharmaceutical & Biotechnology,9913.469167,5306.67,17500.0,24
Aerospace & Defense,9428.791875,6183.5,15833.33,32
Restaurants & Food Service,9425.492727,4854.42,12788.17,11
Financial Services,9272.7698,4676.4,15487.5,100


In [24]:
df_recruitments.groupby(['company_sector','company_industry'])['monthly_salary'].agg(['mean', 'min', 'max', 'count']).sort_values(by='max', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max,count
company_sector,company_industry,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Information Technology,Computer Hardware Development,11258.157544,5113.83,28416.67,57
Information Technology,Enterprise Software & Network Solutions,11605.577733,6622.50,21166.67,75
Management & Consulting,Security & Protective,14784.982000,9133.25,20208.33,5
Management & Consulting,Research & Development,9763.102222,4330.00,19791.67,9
Information Technology,Internet & Web Services,11709.809000,6441.67,19460.00,50
...,...,...,...,...,...
Education,Primary & Secondary Schools,6379.750000,6379.75,6379.75,1
Healthcare,Medical Testing & Clinical Laboratories,5303.920000,5303.92,5303.92,1
Management & Consulting,Membership Organizations,,,,0
Retail & Wholesale,Consumer Electronics & Appliances Stores,,,,0


Dari analisis diatas kita menemukan bahwa walaupun dalam `company_type` terdapat 'Self-employeed' merupakan tertinggi, namun perlu dilihat karena jumlah countnya hanyalah berjumlah 3. Lebih akurat jika kita lihat bahwa 'Company Public' dan 'Company Private' merupakan rata-rata tertinggi.

Fenomena ini kita lihat lagi dalam `company_sector` dan `company_industry`. Dimana dipuncaki oleh 'telecomunications' dan 'hotel & travel acommodation', tetapi berjumlah sedikit. Kita bisa melihat 'information technology' sebagai kategori yang memiliki rata-rata `monthly_salary` tertinggi. Hal ini didukung dengan melihat `company_industy` yang memiliki `company_sector` banyak dalam 'information technology'.