# Transfer-learning-based resume classification to find the best position to apply for

## Exploratory Data Analysis

### Data classes

In [12]:
import pathlib
# Resolve the dataset folder relative to the notebook's working directory.
cwd = pathlib.Path('.')
data_dir = cwd/'data'

# Every sub-folder of `data/` is treated as one class. `.name` is used instead
# of `.stem` so a folder name containing a dot is not cut at its last dot, and
# the result is sorted because `iterdir()` yields entries in arbitrary order.
sub_dirs = sorted(d.name for d in data_dir.iterdir() if d.is_dir())
print(f'Total {len(sub_dirs)} classes: {sub_dirs}')

Total 15 classes: ['Administration', 'Audit', 'Compliance', 'Corp Accounting, finance', 'Cosec', 'Fund Accounting', 'Investment', 'Investor Relations', 'Legal', 'Operations', 'Personal', 'Random', 'Sales', 'Strategy', 'Trust']


We have a total of 15 classes:
- 'Administration'
- 'Audit'
- 'Compliance'
- 'Corp Accounting, finance'
- 'Cosec'
- 'Fund Accounting'
- 'Investment'
- 'Investor Relations'
- 'Legal'
- 'Operations'
- 'Personal'
- 'Random'
- 'Sales'
- 'Strategy'
- 'Trust'

### Number of files in each class

In [13]:
import pandas as pd

# Map each class folder to the number of files it contains.
# Entries of `data_dir` that are not directories are skipped so a stray file
# does not show up as an empty "class"; only regular files inside a class
# folder are counted, and `.name` keeps folder names containing a dot intact.
classes = {
    d.name: sum(1 for entry in d.iterdir() if entry.is_file())
    for d in data_dir.iterdir()
    if d.is_dir()
}

# One row per class (index = class name) with a single 'count' column.
class_df = pd.DataFrame.from_dict(classes, orient='index', columns=['count'])

In [19]:
import plotly.express as px
# Bar chart of the number of files per class, coloured by the same count.
# NOTE(review): when `x` is the frame's own unnamed index, plotly express
# appears to name that column 'index' rather than 'x'. Both keys are mapped so
# the axis title reads 'Job position' either way - confirm on the rendered figure.
fig = px.bar(class_df, x=class_df.index, y='count',
             hover_data=['count'], color='count',
             labels={'count':'Number of files in each class',
                     'x' : 'Job position', 'index': 'Job position'},
             height=400)
fig.show()

### File formats

In [15]:
# Collect the distinct file extensions found in every class folder (not just
# one of them) so the result describes the whole dataset. Suffixes are
# lower-cased so that e.g. '.PDF' and '.pdf' count as a single format.
file_extensions = {
    file.suffix.lower()
    for file in data_dir.glob('*/*')
    if file.is_file()
}

print(file_extensions)

{'.pdf'}


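There are three file formats in our data: `.doc`, `.docx`, and `.pdf`. This is expected, because these are the three resume formats that companies accept during the application process. (Note: the output shown above lists only `.pdf`, presumably because the cell was last run after the Word files had already been converted to PDF; see *Data preprocessing* below.)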

### Data annotation

There was no need to annotate the data, because each resume is already labeled with the position the candidate applied for.

## Data preprocessing

Since we have three file formats, we converted the `.doc` and `.docx` files into `.pdf` files.

In [16]:
import win32com.client
def wordToPdf(files):
    """
    Convert Word documents to PDF next to the originals, then delete the originals.

    Input:
        files - iterable of pathlib.Path objects pointing at doc/docx files
                (a list, or a lazy generator such as Path.glob(...)).

    Each document is opened in Microsoft Word through COM automation, saved as
    '<same folder>/<same name>.pdf' and closed. The source file is deleted only
    once that PDF exists on disk. A PDF that already exists under the same name
    is presumably overwritten by Word - verify if that matters for your data.

    Word is started only when there is something to convert, and it is always
    shut down again, even if a conversion raises. Requires Windows with
    Microsoft Word and pywin32 (win32com) available.
    """
    WD_FORMAT_PDF = 17  # Word's WdSaveFormat value for PDF output

    # Snapshot the inputs first: sources are deleted while we iterate, and a
    # generator cannot otherwise be checked for emptiness.
    sources = [file.resolve() for file in files]
    if not sources:
        return  # nothing to convert - do not launch Word at all

    word = win32com.client.DispatchEx('Word.Application')
    try:
        for source in sources:
            file_path = str(source)
            print(file_path)
            # with_suffix() swaps only the final extension. Splitting the path
            # on its first '.' would cut it short whenever a folder or file
            # name contains another dot (e.g. 'J.P. Morgan_...').
            pdf_path = source.with_suffix('.pdf')
            doc = word.Documents.Open(file_path)
            try:
                doc.SaveAs(str(pdf_path), FileFormat=WD_FORMAT_PDF)
            finally:
                # Release the document even when SaveAs fails.
                doc.Close()
            # Never delete the original unless its PDF was really written.
            if pdf_path.exists():
                source.unlink()
    finally:
        # Do not leave the Word instance running after an error.
        word.Quit()

In [17]:
# Convert the Word files of every entry under `data_dir`, one extension at a
# time: the '*.doc' matches first, then the '*.docx' matches.
word_patterns = ("*.doc", "*.docx")
for d in data_dir.iterdir():
    for pattern in word_patterns:
        wordToPdf(d.glob(pattern))

E:\Course works\IS 725\resume-classification\data\Fund Accounting\Acc Coy_Esther Hong Lei_Sr Accountant.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\ADF_John Ong_FC.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Adval_Clarence Ku_CFO.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\AIA IM_Tan Chia Peay_AM.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Alpha Inv_Lia Ngeow_Accountan.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Alpha Inv_Wong Shekfong_Accountant.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Ananda_Mohamed Asif_Financial Assistant.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Apex_Fern Siang Gan_Sr Fund Acc.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Apex_Koh Pi Hann_Senior Fund Accountant.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Apex_Koh Pi Hann_SFA_Hedge.

E:\Course works\IS 725\resume-classification\data\Fund Accounting\Deloitte_Jason Lam_Audit Senior.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\EC World_Ann Wang_Senior Accountant.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\EC World_Jane Huang_Finance Head.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Ecovis Assurance_Berenice_Phang_Audit Associate.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Equinoxe_WeiYang Seto_Account Mgr.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Equis_Tan Huey Jen_FM.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Ewart_Chee Yong Ling_FM.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\EY_Alice Wong_Advisory Manager.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\EY_Bernice Cheung_Audit Senior.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\EY_Gareth Tse

E:\Course works\IS 725\resume-classification\data\Fund Accounting\Nexia_Darryl Davidson_Audit Associate.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Nezu Asia Capital_Ranee Yao_Fund Accounting.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Nezu Asia_Ranee Yao_Senior FA.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Noah_Wai Yan Wong_FA AM.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Noble Group_Darrell Lee_Treasury Manager.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\NYK Group_Stanley Ong_Accounts Exec.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\OCBC Property_Mavis Goh_Sr FM.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Ogilvy_ShiHui Low_Finance Exec.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Orange Grove_Mei Foong Chean_Finance Manager.doc
E:\Course works\IS 725\resume-classification\data\F

E:\Course works\IS 725\resume-classification\data\Fund Accounting\Tricor_Kelvin Fung_Acc Officer.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Tricor_Ken Choi_Supervisor.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Trusted Source_EkJun Goh_Finance Officer.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Tsang Kuen Lee_Jenny Tsang_Financial Reporting Analyst.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Tung Tai_Kitty Han_Acc Officer.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\UBS_Ng Wei Hao_Hedge Fund Accountant.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\UOB AM_Marco Chan_Ops VP.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\UOB_Sally Tan_VP.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Verdant Capital_Michelle Wong_FM_PE.doc
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Vist

E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Hiral Zaveri_Fund Acc Supervisor.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Isa Razaleigh_FA_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Jencus Lin_Fund Acc.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Jonathan Yeo_FA_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Kelly Low_SFA_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Nicholas Chuang_TL_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Nicholas Yuen_FA_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Yeo Jun Wei_AVP_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\CITCO_Yeow Yu You_Hedge Fund Accountant.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Citi_Jasmine Tan_SF

E:\Course works\IS 725\resume-classification\data\Fund Accounting\KPMG_Lee Siao Wah_AM.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\KPMG_Mabel Lee_Audit Senior.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\KPMG_Matthias Schuenhoff_SM.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\KPMG_Mui Yi Chua_Audit Manager.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\KPMG_Mui Yi Chua_Manager.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\KPMG_Siok Kuan Lee_Audit AM.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\L Catterton_Yeo Kang Nian_VP Finance.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Lane & Associates_Rachel Lin_Asst Accountant.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Langham_Jonathan Tam_FA_PERE.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Leung&So_Lui Wing Chan_

E:\Course works\IS 725\resume-classification\data\Fund Accounting\SIA_Danny Hoon_Cabin Crew.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Silverdale_Avinash Kothari_Senior Manager.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\SIM_Keith Teo_Fresh Grad.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Sinotec_ELENA WAI_Asst Accountant.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Southern Capital_Jazz Wee_Fund Acc_PE.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\Southwest Securities_Xidan Zhang_Asst Accountant.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\SS&C_Glenn Goh_Fund Acc AM.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\SS&C_Goh Lee Mien_Mgr_PE.docx
E:\Course works\IS 725\resume-classification\data\Fund Accounting\SS&C_Leong Kit Lu_Fund Accountant.docx
E:\Course works\IS 725\resume-classification\data\Fund Accoun

E:\Course works\IS 725\resume-classification\data\Investor Relations\Vistra_Kelvin Peck_AIS Sr Ass.docx
E:\Course works\IS 725\resume-classification\data\Legal\AZB_Tanvi Sinha_Senior Associate.doc
E:\Course works\IS 725\resume-classification\data\Legal\Bunge_Colin Tan_Senior Legal Counsel.doc
E:\Course works\IS 725\resume-classification\data\Legal\CIMB_Chen Weiquan_Legal AVP.doc
E:\Course works\IS 725\resume-classification\data\Legal\Drew & Napier_Eddie Chua_Support Lawyer.doc
E:\Course works\IS 725\resume-classification\data\Legal\Lawhub_Dianne Loke_Lawyer.doc
E:\Course works\IS 725\resume-classification\data\Legal\Maybank_Valerie Lai_Legal Counsel.doc
E:\Course works\IS 725\resume-classification\data\Legal\RBS_Ulric 0Yeo_Legal Counsel.doc
E:\Course works\IS 725\resume-classification\data\Legal\RGE Group_Yaping Chen_Legal Counsel.doc
E:\Course works\IS 725\resume-classification\data\Legal\RHT_James Loh_Senior Legal Manager.doc
E:\Course works\IS 725\resume-classification\data\Legal\Ro

E:\Course works\IS 725\resume-classification\data\Operations\SCB_Sergey Kasyan_Quantitative Analyst.docx
E:\Course works\IS 725\resume-classification\data\Operations\State Street_Ronald Cheng_MO Associate_Hedge.docx
E:\Course works\IS 725\resume-classification\data\Operations\Syniverse Technologies_Karen Sim_Exec Dir.docx
E:\Course works\IS 725\resume-classification\data\Operations\UOB_Jean Tan_Margin Credit.docx
E:\Course works\IS 725\resume-classification\data\Operations\Ward Ferry_Thomas Lee_Ops Manager.docx
E:\Course works\IS 725\resume-classification\data\Operations\WPP_Teo Qiu Gui_Transaction Service Mgr.docx
E:\Course works\IS 725\resume-classification\data\Personal\BOCOM_Andy Chan_SM Inv Ops.doc
E:\Course works\IS 725\resume-classification\data\Personal\BTMU_Ruchita Shah_Compliance Deputy Manager.doc
E:\Course works\IS 725\resume-classification\data\Personal\EY_Harmony Tee_Audit Associate.doc
E:\Course works\IS 725\resume-classification\data\Personal\Hypermarket_Nqobile Mthobi_

E:\Course works\IS 725\resume-classification\data\Trust\Henderson_Antoneu Tan_Ops Head.doc
E:\Course works\IS 725\resume-classification\data\Trust\ICBC_Glenn Lee_Fund Accountant.doc
E:\Course works\IS 725\resume-classification\data\Trust\J B Boda_Terence Lee_Acc Exec.doc
E:\Course works\IS 725\resume-classification\data\Trust\Lim & Tan Securities_Angeline Tee_Finance Officer.doc
E:\Course works\IS 725\resume-classification\data\Trust\McCabe_Jasmine Tang_Cosec.doc
E:\Course works\IS 725\resume-classification\data\Trust\Offshore_Ramanis_Corp Svcs Manager.doc
E:\Course works\IS 725\resume-classification\data\Trust\Origins Corporate_Emlyn Goh_Accountant.doc
E:\Course works\IS 725\resume-classification\data\Trust\RBS_Anna Nowotarski_RM.doc
E:\Course works\IS 725\resume-classification\data\Trust\Rockwills_JingNeng Lim_Drafter.doc
E:\Course works\IS 725\resume-classification\data\Trust\Rockwills_Z CHEONG_Legal Exec.doc
E:\Course works\IS 725\resume-classification\data\Trust\Standchart_Dorothy

### Excluded data

- E:\Course works\IS 725\resume-classification\data\Corp Accounting, finance\Yacht Club_Wendy Chan_AP Supervisor.doc