In [40]:
import pandas as pd

### Reading Raw Data

In [41]:
nodeflair_data = pd.read_csv("../data/raw/nodeflair/nodeflair_jobpostings.csv", index_col=0)

nodeflair_data.head(1)

Unnamed: 0,URL,Date posted,data,Job Title,Company Name,Seniority,Salary,Job Type,Years of Experience,Tech Stack,Job Desc,cleaned_date
0,/jobs/53907,about 11 hours ago,"['ReactJS Developer (Full Stack)', 'Apar Techn...",ReactJS Developer (Full Stack),Apar Technologies,"['Mid', 'Junior']","$6,419 - $8,819 SGD / Monthly",Permanent,2-3 years,"['Docker', 'CloudFoundry', 'Spring', 'SonarQub...",We are looking for a candidate to fill in the ...,2022-03-10


### Functions to clean Years of Experience and Salary to a standardised format

In [42]:
def clean_years_of_experience(text):
    if text == "Information not provided":
        return "NaN"
    elif "At least" in text:
        text = text.replace("At least","")
        return [int(text.split()[0])]
    else:
        years = text.split()[0]
        range = years.split("-")
        lower_limit = int(range[0])
        upper_limit = int(range[1])
        return [lower_limit, upper_limit]

In [43]:
def clean_salary(text):
    if text != "-":
        salary = text.split('SGD')[0]
        range = salary.split("-")
        lower_limit = range[0][1:-1]
        upper_limit = range[1][2:-1]
        return [lower_limit, upper_limit]
    else:
        return "NaN"

In [44]:
def clean_url(text):
    text = 'https://www.nodeflair.com/' + text

    return text

In [45]:
nodeflair_data["URL"] = nodeflair_data["URL"].apply(clean_url)
nodeflair_data["Salary"] = nodeflair_data["Salary"].apply(clean_salary)
nodeflair_data["Years of Experience"] = nodeflair_data["Years of Experience"].apply(clean_years_of_experience)

nodeflair_data.head(1)

Unnamed: 0,URL,Date posted,data,Job Title,Company Name,Seniority,Salary,Job Type,Years of Experience,Tech Stack,Job Desc,cleaned_date
0,https://www.nodeflair.com//jobs/53907,about 11 hours ago,"['ReactJS Developer (Full Stack)', 'Apar Techn...",ReactJS Developer (Full Stack),Apar Technologies,"['Mid', 'Junior']","[6,419, 8,819]",Permanent,"[2, 3]","['Docker', 'CloudFoundry', 'Spring', 'SonarQub...",We are looking for a candidate to fill in the ...,2022-03-10


### Rename Columns and drop irrelevant ones

In [46]:
nodeflair_data.rename(columns={'URL':'url', 
                        'cleaned_date':'date_posted',
                        'Job Title':'job_title',
                        'Company Name':'company_name',
                        'Salary':'salary',
                        'Job Type':'job_type',
                        'Years of Experience': 'years_of_experience',
                        'Tech Stack':'tech_stack',
                        'Job Desc':'job_desc'}, inplace=True)

nodeflair_data.head(1)

Unnamed: 0,url,Date posted,data,job_title,company_name,Seniority,salary,job_type,years_of_experience,tech_stack,job_desc,date_posted
0,https://www.nodeflair.com//jobs/53907,about 11 hours ago,"['ReactJS Developer (Full Stack)', 'Apar Techn...",ReactJS Developer (Full Stack),Apar Technologies,"['Mid', 'Junior']","[6,419, 8,819]",Permanent,"[2, 3]","['Docker', 'CloudFoundry', 'Spring', 'SonarQub...",We are looking for a candidate to fill in the ...,2022-03-10


In [47]:
nodeflair_data.drop(["Date posted", "data", "Seniority"], axis=1, inplace=True)

### Check for duplicates

In [48]:
nodeflair_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2901 entries, 0 to 2900
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   url                  2901 non-null   object
 1   job_title            2901 non-null   object
 2   company_name         2901 non-null   object
 3   salary               2901 non-null   object
 4   job_type             2901 non-null   object
 5   years_of_experience  2901 non-null   object
 6   tech_stack           2901 non-null   object
 7   job_desc             2901 non-null   object
 8   date_posted          2143 non-null   object
dtypes: object(9)
memory usage: 226.6+ KB


In [49]:
nodeflair_data.drop_duplicates(subset=['company_name', 'job_title','job_desc'], inplace=True)
nodeflair_data

Unnamed: 0,url,job_title,company_name,salary,job_type,years_of_experience,tech_stack,job_desc,date_posted
0,https://www.nodeflair.com//jobs/53907,ReactJS Developer (Full Stack),Apar Technologies,"[6,419, 8,819]",Permanent,"[2, 3]","['Docker', 'CloudFoundry', 'Spring', 'SonarQub...",We are looking for a candidate to fill in the ...,2022-03-10
1,https://www.nodeflair.com//jobs/53898,"Manager, SRE",Rakuten Viki,,Permanent,,"['Docker', 'API', 'PagerDuty', 'GKE', 'ELK', '...",The SRE team at Viki is responsible for buildi...,2022-03-10
2,https://www.nodeflair.com//jobs/53894,DevOps Engineer,GovTech,"[5,800, 9,600]",Permanent,[2],"['Docker', 'DockerCompose', 'Fluentd', 'Clair'...",Our team in GovTech works on highly impactful ...,2022-03-10
3,https://www.nodeflair.com//jobs/53893,"VP, System Analyst",United Overseas Bank Limited (UOB),,Permanent,"[10, 15]","['ETL', 'Oracle', 'Experian', 'Strategy', 'Ter...",The Technology and Operations function is comp...,2022-03-10
4,https://www.nodeflair.com//jobs/53891,DevOps & Lab Manager (SG - Edge),Dell Technologies,"[11,000, 22,000]",Permanent,[12],"['Docker', 'Strategy', 'Container', 'Microsoft...",Dell Technologies is seeking an entrepreneuria...,2022-03-10
...,...,...,...,...,...,...,...,...,...
2896,https://www.nodeflair.com//jobs/46156,DevOps Engineer,2C2P,"[7,000, 10,000]",Permanent,[3],"['GitLab', 'HTTP', 'UDP', 'TCP', 'ShellScript'...",2C2P is looking for a .NET DevOps Engineer to ...,2022-01-11
2897,https://www.nodeflair.com//jobs/46153,DevOps Engineer,FINXFLO,,Permanent,,"['Strategy', 'ShellScript', 'CI', 'Shell', 'UN...",Alpha Stone Capital is looking for an amazing ...,2022-01-11
2898,https://www.nodeflair.com//jobs/46151,DevOps Engineer,Quilt.AI,,Permanent,[3],"['Next.js', 'Docker', 'Cloudflare', 'DockerSwa...",As part of a growing team consisting of ML exp...,2022-01-11
2899,https://www.nodeflair.com//jobs/46149,Software Engineer,Zoku Integrated Commerce,,Permanent,[3],"['API', 'Magento', 'CI', 'DOM', 'Node.js', 'No...",We are hiring software engineers with expertis...,2022-01-11


In [50]:
nodeflair_data.to_csv("../data/cleaned/cleaned_nodeflair_jobpostings.csv")