In [13]:
import string
import json
import re

import pandas as pd
import numpy as np



## First, let's read all the data and create dataframes from it

In [42]:
# Read the crawled company records. The file is JSON Lines: one JSON object per line.
companies_info = []
with open('/home/huynhhao/Desktop/job_recommender/crawl_data/companies_info.jl', 'r', encoding = 'utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if line.strip():
            companies_info.append(json.loads(line))

# Dataframe column -> key in the crawled record.
# Note that the crawler's 'company_name' field is what this notebook uses as the
# company id, while the crawler's 'name' field becomes the 'company_name' column.
company_source_keys = {
    'company_id': 'company_name',
    'company_name': 'name',
    'average_rating': 'average_rating',
    'num_review': 'num_review',
    'city': 'city',
    'type': 'type',
    'num_employee': 'num_employee',
    'country': 'country',
    'working_day': 'working_day',
    'OT': 'OT',
    'overview': 'overview',
    'expertise': 'expertise',
    'benifit': 'benifit',
    'logo_link': 'logo',
}

# Build the dataframe in a single call instead of growing it row by row:
# DataFrame.append copied the whole frame on every iteration and no longer
# exists in pandas >= 2.0. dtype=object keeps every column as plain Python
# objects, matching what the row-by-row construction produced.
companies_df = pd.DataFrame(
    [
        {column: company[key] for column, key in company_source_keys.items()}
        for company in companies_info
    ],
    columns = list(company_source_keys),
    dtype = object,
)
    
    

# Read the crawled job postings. The file is JSON Lines: one JSON object per line.
jobs_info = []
with open('/home/huynhhao/Desktop/job_recommender/crawl_data/job_info.jl', 'r', encoding = 'utf-8') as f:
    for line in f:
        # Skip blank lines (e.g. a trailing newline at the end of the file),
        # which json.loads would otherwise reject.
        if line.strip():
            jobs_info.append(json.loads(line))

# Dataframe column -> key in the crawled record.
# The crawler's 'company_name' field is used as the company id, and its
# 'tag_list' field becomes the 'taglist' column.
job_source_keys = {
    'company_id': 'company_name',
    'job_name': 'job_name',
    'taglist': 'tag_list',
    'location': 'location',
    'three_reasons': 'three_reasons',
    'description': 'description',
}

# job_info dataframe, built in a single call instead of row-by-row
# DataFrame.append (quadratic, and removed in pandas >= 2.0).
# dtype=object keeps every column as plain Python objects.
jobs_df = pd.DataFrame(
    [
        {column: job[key] for column, key in job_source_keys.items()}
        for job in jobs_info
    ],
    columns = list(job_source_keys),
    dtype = object,
)

In [46]:
# Replace missing values with empty strings in both dataframes.
# The result must be bound back to `companies_df`: the previous spelling
# `compaines_df` stored the filled copy under an unused name, so the
# companies dataframe was never actually filled.
companies_df = companies_df.fillna('')
jobs_df = jobs_df.fillna('')

### Now we'll preprocess all text data

In [52]:
def remove_tags(text: str, replaced: str = '\n') -> str:
    """Strip HTML tags from ``text``.

    Every ``<...>`` span is replaced by ``replaced`` and leading/trailing
    whitespace is trimmed from the result.

    Args:
        text: The raw text, possibly containing HTML tags. Missing values
            (``None``, ``NaN``, ``pd.NA``) are accepted.
        replaced: The string substituted for each tag (a newline by default).

    Returns:
        The text without tags, or ``''`` when ``text`` is a missing value.
    """
    # pandas represents empty cells as NaN (a float), not None, so a plain
    # `is None` check lets NaN through to re.sub, which raises TypeError.
    if pd.isna(text):
        return ''
    # Non-greedy match so that each tag is replaced individually.
    return re.sub(r'<.*?>', replaced, text).strip()
# Strip HTML tags from the free-text company columns.
# Working column-wise avoids the per-cell `.loc[i, ...]` loop, which is slow
# and only correct while the index is exactly 0..n-1.
for text_column in ('overview', 'expertise', 'benifit'):
    companies_df[text_column] = companies_df[text_column].map(remove_tags)
    


In [55]:
# preprocess jobs_df
# Strip HTML tags from the free-text job columns, column-wise rather than with
# a per-cell `.loc[i, ...]` loop (slow, and only correct for a 0..n-1 index).
for html_column in ('three_reasons', 'description'):
    jobs_df[html_column] = jobs_df[html_column].map(remove_tags)

# Put every tag on one line by turning newlines into spaces. This is exactly
# what the former nested split/join did: its outer split(' ')/join(' ') pair
# was a no-op, so runs of spaces are (still) not collapsed.
jobs_df['taglist'] = jobs_df['taglist'].str.replace('\n', ' ', regex = False)

In [72]:
# Sanity check: every company id referenced by a job must also be present in
# companies_df, i.e. no job is posted by a company we have no record of.
set(jobs_df['company_id']) <= set(companies_df['company_id'])

True

**So far so good: the preprocessing of companies_df and jobs_df is done here, and any remaining work is left to the users of these dataframes. Let's save them and move on to preprocessing the CV dataset.**

In [76]:
# Persist both cleaned dataframes as UTF-8 CSV files, without the row index.
companies_df_path = '/home/huynhhao/Desktop/job_recommender/data/companies.csv'
companies_df.to_csv(path_or_buf = companies_df_path, encoding = 'utf-8', index = False)

jobs_df_path = '/home/huynhhao/Desktop/job_recommender/data/jobs.csv'
jobs_df.to_csv(path_or_buf = jobs_df_path, encoding = 'utf-8', index = False)

**Now we'll add a unique id for each job (companies are already keyed by `company_id`)**

In [32]:
# Load the two CSV files back into dataframes.
# NOTE(review): these are Windows paths while the cells above write to a Linux
# home directory — confirm both point at the same data before a full re-run.
jobs_df = pd.read_csv(filepath_or_buffer = r'C:\Users\ASUS\Desktop\job_recommender\data\jobs.csv')
companies_df = pd.read_csv(filepath_or_buffer = r'C:\Users\ASUS\Desktop\job_recommender\data\companies.csv')

In [33]:
# Add an all-NaN placeholder column for the job id, then reorder the columns
# so that job_id comes first.
jobs_df['job_id'] = np.nan
jobs_df = jobs_df.loc[:, ['job_id', 'company_id', 'job_name', 'taglist', 'location', 'three_reasons', 'description']]


In [34]:
# Build each job id as "<company_id>:<slug>", where the slug is the job name
# lower-cased, stripped of ASCII punctuation, with its words joined by '_'.
# The translation table is built once rather than on every row, and the whole
# column is assigned in one step instead of writing strings cell-by-cell into
# the float NaN placeholder column. Rows with a missing job_name or company_id
# get a missing job_id instead of raising.
# NOTE(review): the id depends only on company_id and job_name, so two postings
# by the same company with the same title receive the same id.
punctuation_table = str.maketrans('', '', string.punctuation)
job_slugs = (
    jobs_df['job_name']
    .str.lower()
    .str.translate(punctuation_table)
    .str.split()
    .str.join('_')
)
jobs_df['job_id'] = jobs_df['company_id'] + ':' + job_slugs


In [39]:
# Write jobs_df (now including job_id) back to its CSV file, UTF-8 encoded and
# without the row index.
jobs_df_path = r'C:\Users\ASUS\Desktop\job_recommender\data\jobs.csv'
jobs_df.to_csv(path_or_buf = jobs_df_path, encoding = 'utf-8', index = False)

### Now we'll preprocess the CV dataset

In [42]:
# Load the resume (CV) dataset into a dataframe.
cvs = pd.read_csv(filepath_or_buffer = r'C:\Users\ASUS\Desktop\job_recommender\data\cvdata\ResumeDataSet.csv')

In [45]:
# Show the raw text of the first resume to see what needs cleaning.
cvs['Resume'].loc[0]

'Skills * Programming Languages: Python (pandas, numpy, scipy, scikit-learn, matplotlib), Sql, Java, JavaScript/JQuery. * Machine learning: Regression, SVM, NaÃ¯ve Bayes, KNN, Random Forest, Decision Trees, Boosting techniques, Cluster Analysis, Word Embedding, Sentiment Analysis, Natural Language processing, Dimensionality reduction, Topic Modelling (LDA, NMF), PCA & Neural Nets. * Database Visualizations: Mysql, SqlServer, Cassandra, Hbase, ElasticSearch D3.js, DC.js, Plotly, kibana, matplotlib, ggplot, Tableau. * Others: Regular Expression, HTML, CSS, Angular 6, Logstash, Kafka, Python Flask, Git, Docker, computer vision - Open CV and understanding of Deep learning.Education Details \r\n\r\nData Science Assurance Associate \r\n\r\nData Science Assurance Associate - Ernst & Young LLP\r\nSkill Details \r\nJAVASCRIPT- Exprience - 24 months\r\njQuery- Exprience - 24 months\r\nPython- Exprience - 24 monthsCompany Details \r\ncompany - Ernst & Young LLP\r\ndescription - Fraud Investigatio

In [46]:
# Number of resumes (rows) in the dataset.
cvs.shape[0]

962