In [1]:
import os

import numpy as np
import pandas as pd
# Load the raw job-postings CSV; the path is relative to the working directory.
DATASET_PATH = 'dataset/marketing_sample_for_naukri_com-jobs__20190701_20190830__30k_data.csv'
df = pd.read_csv(DATASET_PATH)


# Dataset Preparation


### Drop unused columns

In [2]:
# Columns that are not used anywhere in the rest of this notebook.
unused_columns = [
    'Uniq Id',
    'Crawl Timestamp',
    'Job Salary',
    'Job Experience Required',
    'Location',
    'Functional Area',
    'Industry',
]
df = df.drop(columns=unused_columns)


### Drop rows that are not in the "Programming & Design" category
Note: "category" here means "role" in the dataset.


In [3]:
# Keep only the postings whose role category is "Programming & Design".
is_programming_design = df['Role Category'] == 'Programming & Design'
df = df[is_programming_design]

# Every remaining row has the same "Role Category", so the column is removed;
# rows that still contain a missing value in any column are then discarded.
df = df.drop(columns='Role Category').dropna()


### Trim and split Job Title, Key Skills


In [4]:
# Remove leading/trailing whitespace from each title.
df['Job Title'] = df['Job Title'].str.strip()

# Trim the raw skills string, then break it on "|" into a list of skills.
df['Key Skills'] = df['Key Skills'].str.strip().str.split("|")

In [5]:
### Show every distinct value of the "Role" column
pd.unique(df['Role'])

array(['Testing Engineer', 'System Analyst', 'Technical Architect',
       'Software Developer', 'Graphic/Web Designer', 'Project Lead',
       'Team Lead/Technical Lead', 'Release Manager', 'Product Manager',
       'Database Architect/Designer'], dtype=object)

In [6]:
### Renumber the rows from 0 now that all filtering is finished (old labels are discarded)
df = df.reset_index(drop=True)

# Preview the first five rows of the prepared dataframe.
df.head(n=5)

Unnamed: 0,Job Title,Key Skills,Role
0,Software Test Engineer -hyderabad,"[manual testing, test engineering, test case...",Testing Engineer
1,Opening For Adobe Analytics Specialist,"[adobe experience manager, digital, digital ...",System Analyst
2,Opportunity For Azure Devops Architect For Hyd...,"[TFS, Azure, Git, VSTS, Docker, DynaTrace...",Technical Architect
3,Oracle IDAM,"[Oracle IDAM, OIM, OAM]",Software Developer
4,Magento Developer,"[Copyright, Email, jQuery, XML, Javascript...",Software Developer



## Initial values


In [7]:
# Role names as they appear in the dataset's "Role" column.
roles = [
    'Testing Engineer',
    'System Analyst',
    'Technical Architect',
    'Software Developer',
    'Graphic/Web Designer',
    'Project Lead',
    'Team Lead/Technical Lead',
    # 'Release Manager',   (left disabled)
    'Product Manager',
    'Database Architect/Designer',
]

# Names of the job-title lists defined below. These are variable *names*,
# not the lists themselves; the export cell resolves each name to its list.
categories = [
    'cate_data',
    'cate_developer',
    'cate_tester',
    'cate_design',
    'cate_manager',
    'cate_analyst',
]

# Job titles per category; each title is used as a fuzzy-match query.
cate_data = [
    'Data Engineer',
    'Data Scientist',
    'Data Architect',
    'Data Analyst',
    'Database Administrator',
    'Machine Learning',
]

cate_developer = [
    'Software Engineer',
    'Devops',
    'Backend',
    'Frontend',
    'Full stack',
    'iOS',
    'Android',
]

cate_tester = [
    'Software Tester',
    'Quality Assurance Engineer',
]

cate_design = [
    'UX/UI Designer',
    'Graphic Designer',
]

cate_manager = [
    'Product Owner',
    'Project Manager',
]

cate_analyst = [
    'System Analyst',
    'Business Analyst',
]

# Lower-cased skill keywords that are filtered out of every skill ranking.
forbidden_skills = [
    'development',
    'design',
    'web',
    'css3',
    'html5',
    'operations',
    'management',
    'project leader',
    'architect',
    'architecture',
    'testing',
    'software',
    'tools',
    'quality',
    'support',
    'application',
    'applications',
    'developer',
    'technical',
    'automation',
    'graphics',
    'phd',
    'email',
    'apple',
    'research',
    'iphone',
    'ipad',
]

## Extract Key Skills function
Counts how often each skill appears in a given "Key Skills" column of the jobs dataframe.

In [8]:
#
##
### input like this -> extractSkill(dataframe['Key Skills'])
##
#

def extractSkill(df):
    """Count how often each skill occurs across a collection of skill lists.

    Parameters
    ----------
    df : iterable of sequences of str
        One sequence of skill strings per job, e.g. ``dataframe['Key Skills']``.

    Returns
    -------
    dict
        Maps each skill (whitespace-trimmed and lower-cased) to its number of
        occurrences, ordered from most to least frequent. Skills with equal
        counts keep the order in which they were first seen.
    """
    # Normalise every skill so that " Java" and "java" are counted together.
    normalized = [skill.strip().lower() for skills in df for skill in skills]

    # Tally the occurrences of each normalised skill.
    tally = {}
    for skill in normalized:
        tally[skill] = tally.get(skill, 0) + 1

    # Highest count first; sorted() is stable, so ties keep first-seen order.
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return dict(ranked)


## Replace and Remove Skills function
input: dataframe with "skill" and "sum" columns

In [22]:
# Maps a raw skill spelling to the canonical name it is merged into.
# No canonical name is itself a key, so a single lookup pass is sufficient.
_SKILL_SYNONYMS = {
    # extractSkill lower-cases every skill, so the lower-case key is the one
    # that actually occurs; the capitalised key is kept for callers that pass
    # un-normalised data.
    "programming": "coding",
    "Programming": "coding",
    "test automation": "automation testing",
    "data scientist": "data science",
    "asp": ".net",
    "asp.net": ".net",
    "asp.net mvc": ".net",
    "natural language processing": "nlp",
    "advanced analytics": "analytics",
    "analytical": "analytics",
    "analyst": "analytics",
    "front end": "frontend",
    "web technologies": "web development",
    "web application development": "web development",
    "web application": "web development",
    "ios": "ios development",
    "android": "android development",
    "android application development": "android development",
    "android application": "android development",
    "mobile": "mobile application development",
    "mobile application": "mobile application development",
    "mobile development": "mobile application development",
    "mobile applications": "mobile application development",
    "user interface designing": "ui designing",
    "ui": "ui designing",
    "ui designer": "ui designing",
    "user interface": "ui designing",
    "user interface designer": "ui designing",
    "ux designer": "ux designing",
    "ux": "ux designing",
    "user experience": "ux designing",
    "graphic designer": "graphic designing",
    "product manager": "product management",
}


def cleanSkillKeywords(df):
    """Normalise, filter and re-aggregate a skill-count table.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a "skill" column (skill names) and a "sum" column (counts).
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Columns "skill" and "sum", one row per canonical skill, sorted by
        "sum" in descending order, with a fresh 0..n-1 index.

    Steps: rows whose count is exactly 1 are removed, synonymous spellings are
    merged into one canonical name, skills listed in the notebook-level
    ``forbidden_skills`` are removed, and the counts of merged skills are
    added together.
    """
    # Skills that occur exactly once are removed before any merging.
    kept = df[df['sum'] != 1]

    # Replace every known synonym with its canonical name; other skills pass through.
    kept = kept.assign(
        skill=kept['skill'].map(lambda name: _SKILL_SYNONYMS.get(name, name))
    )

    # Remove the keywords listed in forbidden_skills.
    kept = kept[~kept['skill'].isin(forbidden_skills)]

    # Merging can leave several rows with the same skill: add their counts.
    return (
        kept.groupby('skill')['sum']
        .agg(['sum'])
        .sort_values(by=['sum'], ascending=False)
        .reset_index()
    )

## Match between Jobs function
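method: `set` or `sort` (selects fuzzywuzzy's `token_set_ratio` or `token_sort_ratio`)<br>
percent: integer from 0 to 100 (minimum match score a job title must reach)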

In [10]:
from fuzzywuzzy import fuzz

def matchJobs(title, method, percent):
    """Select the jobs whose "Job Title" fuzzily matches ``title``.

    The search runs over the notebook-level dataframe ``df``.

    Parameters
    ----------
    title : str
        Job title to search for.
    method : str
        ``"set"`` scores with ``fuzz.token_set_ratio``;
        ``"sort"`` scores with ``fuzz.token_sort_ratio``.
    percent : int
        Minimum score (inclusive) a job title must reach to be kept.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` whose score is at least ``percent``.

    Raises
    ------
    ValueError
        If ``method`` is neither ``"set"`` nor ``"sort"``. Previously this
        case failed with an UnboundLocalError on the return statement.
    """
    scorers = {
        "set": fuzz.token_set_ratio,
        "sort": fuzz.token_sort_ratio,
    }
    if method not in scorers:
        raise ValueError(
            'method must be "set" or "sort", got {!r}'.format(method)
        )
    scorer = scorers[method]

    # Score only the title column. Row-wise DataFrame.apply returns a
    # DataFrame instead of a Series when `df` has no rows, which would break
    # the boolean mask below.
    scores = df['Job Title'].apply(lambda job_title: scorer(title, job_title))

    return df[scores >= percent]


# Job Skills


In [23]:
# Rank the skills requested by postings whose title closely matches "Product Owner".
df_match = matchJobs("Product Owner", "set", 90)
list_match = extractSkill(df_match['Key Skills'])
df_recommend = cleanSkillKeywords(
    pd.DataFrame(list(list_match.items()), columns=['skill', 'sum'])
)

# Lift the row/column display limits so that the whole table is rendered.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_recommend)

Unnamed: 0,skill,sum
0,agile,6
1,scrum,4
2,product management,4
3,jira,3
4,monitoring,3
5,analytics,3
6,computer science,3
7,telecom,2
8,product strategy,2
9,pdf,2


### Export every job to .csv

In [14]:
# Write one ranked-skill CSV per job title: <category>_<job title>.csv
EXPORT_DIR = 'exports/jobskill/'

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(EXPORT_DIR, exist_ok=True)

for category in categories:
    # `categories` stores the *names* of the job-title lists; look each list
    # up in the notebook namespace rather than evaluating the name as code.
    for job in globals()[category]:

        df_match = matchJobs(title=job, method="set", percent=90)

        list_match = extractSkill(df_match['Key Skills'])
        df_recommend = pd.DataFrame(list_match.items(), columns=['skill', 'sum'])
        df_recommend = cleanSkillKeywords(df_recommend)

        # "/" in a title (e.g. "UX/UI Designer") would be read as a path
        # separator, so it is replaced with ":".
        # NOTE(review): ":" is not a valid file-name character on Windows.
        job_path = job.replace("/", ":")
        df_recommend.to_csv(EXPORT_DIR + category + '_' + job_path + '.csv', index=True)
        
        


# Category Skill (New)


In [15]:
# Skill ranking for the whole developer category:
#   sum   = total occurrences of the skill over all developer job titles
#   count = number of developer job titles whose table contains the skill
category_frames = []

for job in cate_developer:

    df_match = matchJobs(title=job, method="set", percent=90)

    list_match = extractSkill(df_match['Key Skills'])
    df_cat = pd.DataFrame(list_match.items(), columns=['skill', 'sum'])
    # cleanSkillKeywords already removes skills seen once and returns one row
    # per skill, so no extra drop/groupby is needed here.
    df_cat = cleanSkillKeywords(df_cat)
    if not df_cat.empty:
        category_frames.append(df_cat)

# DataFrame.append was removed in pandas 2.0; concatenate the per-job tables
# once instead of growing a frame inside the loop.
if category_frames:
    df_category_skill = pd.concat(category_frames, ignore_index=True)
else:
    df_category_skill = pd.DataFrame(columns=['skill', 'sum'])

df_category_skill = (
    df_category_skill
    .groupby('skill')['sum']
    .agg(['sum', 'count'])
    .sort_values(by=['count', 'sum'], ascending=False)
    .reset_index()
)
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(df_category_skill)

# df_category_skill.to_csv('out.csv', index=True)


Unnamed: 0,skill,sum,count
0,javascript,398,7
1,java,286,7
2,sql,198,7
3,mysql,175,7
4,agile,123,7
5,git,115,7
6,debugging,99,7
7,rest,82,7
8,api,46,7
9,python,236,6



# Standard Skills (old)


In [14]:

# #
# ## Count Categories in matched job
# #

# df_category_count = df_match['Role'].value_counts().rename_axis('unique_values').reset_index(name='counts')
# df_category_count = df_category_count.head(3)
# df_category_count

In [15]:
# #
# ##
# ### Find Standard Skills for job
# ##
# #

# import math 

# df_standard_skill = pd.DataFrame(columns = ['Skill', 'Count'])
# total = 30
# if(len(df_category_count.index) == 1):
#     x = df_category_count['counts'][0]
#     y = 0
#     z = 0
        
# if(len(df_category_count.index) == 2):
#     x = df_category_count['counts'][0]
#     y = df_category_count['counts'][1]
#     z = 0
    
# if(len(df_category_count.index) == 3):
#     x = df_category_count['counts'][0]
#     y = df_category_count['counts'][1]
#     z = df_category_count['counts'][2]
    
# summation = x+y+z

# count = 0

# for x in df_category_count['unique_values']:
#     cat_input = df_category_count['counts'][count]
#     path = x.replace("/", ":")
#     df_cat = pd.read_csv('data/key skills count/role_' + path + '.csv')
#     df_cat = df_cat.rename(columns={'Unnamed: 0': 'Skill', '0': 'Count'})
    
#     selected = int(math.ceil((cat_input*total)/summation))
    
#     df_cat = df_cat.head(selected)
#     df_standard_skill = df_standard_skill.append(df_cat)
#     count = count + 1

        
# df_standard_skill = df_standard_skill.drop_duplicates(subset=['Skill'])
# df_standard_skill = df_standard_skill[~df_standard_skill['Skill'].isin(forbidden_category_skills)]
# df_standard_skill = df_standard_skill.reset_index(drop=True)
# df_standard_skill


# Common Skills (unused)


In [16]:
# df_common_skill = pd.DataFrame(columns = ['Skill', 'Count'])

# for x in roles:
#     path = x.replace("/", ":")
#     df_cat = pd.read_csv('data/key skills count/role_' + path + '.csv')
#     df_cat = df_cat.rename(columns={'Unnamed: 0': 'Skill', '0': 'Count'})
#     df_cat = df_cat.head(200)
#     df_common_skill = df_common_skill.append(df_cat)
    
# df_common_skill = df_common_skill['Skill'].value_counts().rename_axis('unique_values').reset_index(name='counts')
# df_common_skill = df_common_skill.drop_duplicates(subset=['unique_values'])
# df_common_skill = df_common_skill[~df_common_skill['unique_values'].isin(forbidden_category_skills)]
# df_common_skill = df_common_skill.reset_index(drop=True)
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(df_common_skill)

In [17]:
# dash = extractSkill(df_match['Key Skills'])
# df_new = pd.DataFrame(dash.items(), columns = ['Job Title', 'Count']) 
# with pd.option_context('display.max_rows', None, 'display.max_columns', None):
#     display(df_new)

In [18]:
# #
# ## exports counted elements to csv
# #
# df_count = pd.DataFrame.from_dict(elements_count, orient='index')
# df_count.to_csv('out.csv', index=True)