In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import numpy as np
import pandas as pd

In [5]:
# Read the freelance-project postings and preview the first three rows.
df = pd.read_csv(filepath_or_buffer='./datasets/freelance-projects.csv')
df.head(n=3)

Unnamed: 0,Title,Category Name,Experience,Sub Category Name,Currency,Budget,Location,Freelancer Preferred From,Type,Date Posted,Description,Duration,Client Registration Date,Client City,Client Country,Client Currency,Client Job Title
0,Banner images for web desgin websites,Design,Entry ($),Graphic Design,EUR,60.0,remote,ALL,fixed_price,2023-04-29 18:06:39,We are looking to improve the banner images on...,,2010-11-03,Dublin,Ireland,EUR,PPC Management
1,Make my picture a solid silhouette,"Video, Photo & Image",Entry ($),Image Editing,GBP,20.0,remote,ALL,fixed_price,2023-04-29 17:40:28,Hello \n\nI need a quick designer to make 4 pi...,,2017-02-21,London,United Kingdom,GBP,Office manager
2,Bookkeeper needed,Business,Entry ($),Finance & Accounting,GBP,12.0,remote,ALL,fixed_price,2023-04-29 17:40:06,Hi - I need a bookkeeper to assist with bookke...,,2023-04-09,London,United Kingdom,GBP,Paralegal


In [6]:
# Ordinal rank for each experience-tier label.
experience_mapping = {'Entry ($)': 1, 'Intermediate ($$)': 2, 'Expert ($$$)': 3}

# Drop the unused posting columns, remove every column whose name contains
# "Client", then replace the experience labels with their numeric rank.
df = (
    df
    .drop(columns=['Date Posted', 'Freelancer Preferred From', 'Duration'])
    .filter(regex=r'^(?!.*Client).*$', axis=1)
    .assign(Experience=lambda frame: frame['Experience'].map(experience_mapping))
)

df.head(n=3)

Unnamed: 0,Title,Category Name,Experience,Sub Category Name,Currency,Budget,Location,Type,Description
0,Banner images for web desgin websites,Design,1,Graphic Design,EUR,60.0,remote,fixed_price,We are looking to improve the banner images on...
1,Make my picture a solid silhouette,"Video, Photo & Image",1,Image Editing,GBP,20.0,remote,fixed_price,Hello \n\nI need a quick designer to make 4 pi...
2,Bookkeeper needed,Business,1,Finance & Accounting,GBP,12.0,remote,fixed_price,Hi - I need a bookkeeper to assist with bookke...


In [7]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

_STOP_WORD_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, building it only on first use."""
    if 'english' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORD_CACHE['english']


def preprocess_text(text):
    """Normalise a free-text description into a space-separated token string.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is removed, the remainder is tokenised with NLTK's
    ``word_tokenize`` and English stop words are dropped.

    Parameters
    ----------
    text : str
        Raw description. Any non-string value (for example the ``NaN`` that
        pandas uses for a missing cell, or ``None``) is treated as missing.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when ``text``
        is missing.
    """
    # Missing cells arrive as float NaN / None and have no .lower();
    # return an empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    # Punctuation is deleted, not replaced by a space, so "e-learning"
    # becomes "elearning" and URLs collapse into a single token.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

    # The stop-word set is loop-invariant; fetch the cached copy rather than
    # rebuilding it from the corpus on every call.
    stop_words = _english_stop_words()

    return ' '.join(
        token for token in word_tokenize(text) if token not in stop_words
    )


# Clean every description and keep the result alongside the raw text.
df['Clean Description'] = df['Description'].map(preprocess_text)
df['Clean Description']


0        looking improve banner images web design pages...
1        hello need quick designer make 4 pictures soli...
2             hi need bookkeeper assist bookkeeping thanks
3        hi need accountant assist understanding regula...
4        hi currently running project require 100 guest...
                               ...                        
12217    looking published travel writer write articles...
12218    website wwwjuicebitzcouk added filters usb cab...
12219       need someone write quick sql query basic table
12220    seeking full stack web developer specializes c...
12221    hi following administrative task would appreci...
Name: Clean Description, Length: 12222, dtype: object

In [8]:
# Preview the frame now that it carries the cleaned description column.
df.head(n=3)

Unnamed: 0,Title,Category Name,Experience,Sub Category Name,Currency,Budget,Location,Type,Description,Clean Description
0,Banner images for web desgin websites,Design,1,Graphic Design,EUR,60.0,remote,fixed_price,We are looking to improve the banner images on...,looking improve banner images web design pages...
1,Make my picture a solid silhouette,"Video, Photo & Image",1,Image Editing,GBP,20.0,remote,fixed_price,Hello \n\nI need a quick designer to make 4 pi...,hello need quick designer make 4 pictures soli...
2,Bookkeeper needed,Business,1,Finance & Accounting,GBP,12.0,remote,fixed_price,Hi - I need a bookkeeper to assist with bookke...,hi need bookkeeper assist bookkeeping thanks


In [12]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

# Load the pretrained token-classification checkpoint and its tokenizer, then
# wrap both in a "ner" pipeline that the extraction helpers below call as `nlp`.
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")

nlp = pipeline(task="ner", tokenizer=tokenizer, model=model)

Some weights of the model checkpoint at dslim/bert-base-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [13]:
def merge_subword_tokens(ner_results):
    """Glue WordPiece continuation pieces (``##xyz``) onto the preceding token.

    Parameters
    ----------
    ner_results : list of dict
        Token-level NER results. Each dict must provide ``'word'``; a
        continuation piece must also provide ``'end'``.

    Returns
    -------
    list of dict
        New dicts, one per merged word. For a merged word, ``'word'`` is the
        concatenation of its pieces and ``'end'`` is taken from its last
        piece; all other keys come from the first piece. The input list and
        its dicts are left unmodified.
    """
    merged = []
    for item in ner_results:
        word = item['word']
        if not word.startswith('##'):
            # Copy so that extending this entry later never writes back into
            # the caller's dict.
            merged.append(dict(item))
        elif merged:
            # NOTE(review): the piece is attached to the previous kept token
            # without checking that the two are adjacent in the text; confirm
            # whether a start/end adjacency check is wanted.
            merged[-1]['word'] += word[2:]
            merged[-1]['end'] = item['end']
        # A continuation piece with nothing before it is dropped.
    return merged

In [18]:
def extract_entities(text):
    """Run the NER pipeline on ``text`` and return the tagged words as one string.

    Parameters
    ----------
    text : str
        Text to analyse. Non-string values (``None``, pandas ``NaN``) and
        blank strings are treated as having no entities.

    Returns
    -------
    str
        The words of the tagged tokens, after sub-word merging, joined by
        ``', '`` in the order returned by the pipeline (duplicates are kept);
        ``''`` when nothing is found or the input is missing/blank.
    """
    # Skip the model call for missing or blank input; passing a float NaN to
    # the pipeline is not a valid text input.
    if not isinstance(text, str) or not text.strip():
        return ''

    merged_results = merge_subword_tokens(nlp(text))
    return ', '.join(entity['word'] for entity in merged_results)

In [19]:
# Smoke-test the extractor on a hand-written job posting that names several
# tools; the recorded output shows multi-word terms come back as separate words.
text = """We are looking for a Data Scientist with expertise in machine learning, deep learning, and statistics. 
Skills required: Python, TensorFlow, SQL, and experience with cloud services like AWS or Azure."""

extract_entities(text)

'Data, Scientist, Python, TensorFlow, SQL, AWS, Azure'

In [26]:
# Look at the cleaned description of the first posting.
text = df.at[0, 'Clean Description']
print(text)

looking improve banner images web design pages web design shopify woo commence redesign ecommerce wwwleapdigitalie attaching two images like create images similar design


In [28]:
import pandas as pd

# Inspect the rows of a previously saved run that have a non-null Skills value.
# NOTE(review): 'output.csv' is not written by any cell visible in this
# notebook (only 'output_2.csv' is) - confirm where this file comes from.
df2 = pd.read_csv(filepath_or_buffer='output.csv')
non_empty_skills = df2.dropna(subset=['Skills'])
non_empty_skills['Skills']


137           httpsgonppcom
218              ohdoshcouk
334                      sa
402      httpsbapasmealscom
511                   angel
                ...        
11685              medicare
11691        httpsmosinecom
11851           justhostcom
12043                xtemos
12201       densbergensberg
Name: Skills, Length: 124, dtype: object

In [29]:
# Spot-check the extractor on the raw descriptions of the first ten postings.
for index, text in df['Description'].head(10).items():
    entities = extract_entities(text)
    print(f"Row {index}: {entities}")


Row 0: Shopify
Row 1: 
Row 2: 
Row 3: UK
Row 4: D, Doow, No, AI, AI, AI, Macworld, CNBC, Wondersha, Cnet, Tribune, India, Digital, Trends, Techno, B, PC, Mag, tech, Techtarget, The, Windows, Club, MSN, Daily, Star, CO, UK, Forbes, Council, Post, FAST, Company, YEC, Council, Post, Moz, DA, AREFS, DR, SA
Row 5: Database, Google, Drive
Row 6: 
Row 7: UK, UK
Row 8: LGBTQ
Row 9: US, UK, CA


In [30]:
from tqdm import tqdm

# Start with an empty result column, then fill it one posting at a time so a
# progress bar can be shown (the recorded run took about 53 minutes).
df['Ex-Skills'] = ''

for index, text in tqdm(df['Description'].items(), total=len(df), desc='Processing'):
    df.at[index, 'Ex-Skills'] = extract_entities(text)


Processing: 100%|██████████| 12222/12222 [52:58<00:00,  3.84it/s] 


In [31]:
# Persist the enriched frame without writing the index as a column.
df.to_csv(path_or_buf='output_2.csv', index=False)

In [33]:
# Preview the first ten postings with their extracted entities.
# NOTE(review): the recorded output lists 'Skills' and 'Entities' columns that
# no cell visible in this notebook creates - likely leftover kernel state;
# confirm the notebook still reproduces under Restart & Run All.
df.head(n=10)

Unnamed: 0,Title,Category Name,Experience,Sub Category Name,Currency,Budget,Location,Type,Description,Clean Description,Skills,Entities,Ex-Skills
0,Banner images for web desgin websites,Design,1,Graphic Design,EUR,60.0,remote,fixed_price,We are looking to improve the banner images on...,looking improve banner images web design pages...,,,Shopify
1,Make my picture a solid silhouette,"Video, Photo & Image",1,Image Editing,GBP,20.0,remote,fixed_price,Hello \n\nI need a quick designer to make 4 pi...,hello need quick designer make 4 pictures soli...,,,
2,Bookkeeper needed,Business,1,Finance & Accounting,GBP,12.0,remote,fixed_price,Hi - I need a bookkeeper to assist with bookke...,hi need bookkeeper assist bookkeeping thanks,,,
3,Accountant needed,Business,1,Tax Consulting & Advising,GBP,14.0,remote,fixed_price,Hi - I need an accountant to assist me with un...,hi need accountant assist understanding regula...,,,UK
4,Guest Post on High DA Website,Digital Marketing,3,SEO,USD,10000.0,remote,fixed_price,"Hi, I am currently running a project where I w...",hi currently running project require 100 guest...,,,"D, Doow, No, AI, AI, AI, Macworld, CNBC, Wonde..."
5,Content Database Project for Travel Company,Technology & Programming,3,Databases,EUR,500.0,remote,fixed_price,Brief\nThe requirements of this brief is to fi...,brief requirements brief find solution manage ...,,,"Database, Google, Drive"
6,Make web site for Tutoring company,Design,1,Web Design,USD,10.0,remote,fixed_price,I need to build web site for my tutoring compa...,need build web site tutoring company simple de...,,,
7,Seeking Highly Skilled Freelancers,Technology & Programming,1,Website Development,GBP,10.0,remote,fixed_price,I am currently working on a new freelancer com...,currently working new freelancer community pro...,,,"UK, UK"
8,E-learning,Design,2,Illustration & Drawing,GBP,0.0,remote,fixed_price,Looking for a quote for an introductory e-lear...,looking quote introductory elearniing course c...,,,LGBTQ
9,19 sentences recording. native english speaker...,Music & Audio,1,Voice-Over,USD,10.0,remote,fixed_price,1. need native speaker from US or UK or CA\n2...,1 need native speaker us uk ca 2 one read 19 s...,,,"US, UK, CA"
