## Importing Libraries 

In [1]:
import spacy
import pdfminer
import re
import os
import pandas as pd
import pdf2txt

## Load the Language Model

In [2]:
nlp = spacy.load("en_core_web_sm")

## Making a function to read pdf files(converting pdf to text file)

In [3]:
def convert_pdf(f):
    output_filename = os.path.basename(os.path.splitext(f)[0]) + ".txt"
    output_filepath = os.path.join("output/txt/", output_filename)
    pdf2txt.main(args=[f, "--outfile", output_filepath])
    print(output_filepath + " saved successfully!!!")
    return open(output_filepath).read()

In [4]:
result_dict = {'name': [], 'phone': [], 'email': [], 'skills': []} 
names = []
phones = []
emails = []
skills = []

## Extracting all the pdf text data

In [5]:
def parse_content(text):
    skillset = re.compile("python|java|sql|hadoop|tableau")
    phone_num = re.compile(
        "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    )
    doc = nlp(text)
    name = [entity.text for entity in doc.ents if entity.label_ == "PERSON"][0]
    print(name)
    email = [word for word in doc if word.like_email == True][0]
    print(email)
    phone = str(re.findall(phone_num, text.lower()))
    skills_list = re.findall(skillset, text.lower())
    unique_skills_list = str(set(skills_list))
    names.append(name)
    emails.append(email)
    phones.append(phone)
    skills.append(unique_skills_list)
    print("Extraction completed successfully!!!")

## Creating and writing the scraped data from pdf

In [6]:
for file in os.listdir('resumes/'):
    if file.endswith('.pdf'):
        print('Reading.....' + file)
        txt = convert_pdf(os.path.join('resumes/',file))
        parse_content(txt)

Reading.....Alisson ParkerCV.pdf
output/txt/Alisson ParkerCV.txt saved successfully!!!
Alisson Parker-Wright                                                                                                 10/10/1974 

223
alli1414parks@mail.com
Extraction completed successfully!!!
Reading.....Angelica Astrom.pdf
output/txt/Angelica Astrom.txt saved successfully!!!
Fill
someone@example.com
Extraction completed successfully!!!
Reading.....AshleyMilesCV (1).pdf
output/txt/AshleyMilesCV (1).txt saved successfully!!!
Ashley Miles
ashleymiles@memail.com
Extraction completed successfully!!!
Reading.....AshleyMilesCV.pdf
output/txt/AshleyMilesCV.txt saved successfully!!!
Ashley Miles
ashleymiles@memail.com
Extraction completed successfully!!!
Reading.....John DominicCV.pdf
output/txt/John DominicCV.txt saved successfully!!!
John Dominic
johndominic@mail.com
Extraction completed successfully!!!


In [7]:
names

['Alisson Parker-Wright                                                                                                 10/10/1974 \n\n223',
 'Fill',
 'Ashley Miles',
 'Ashley Miles',
 'John Dominic']

In [8]:
phones

["['8569878511']",
 "['(212) 555-1234']",
 "['6592251422']",
 "['6592251422']",
 "['7877756411']"]

In [9]:
emails

[alli1414parks@mail.com,
 someone@example.com,
 ashleymiles@memail.com,
 ashleymiles@memail.com,
 johndominic@mail.com]

In [10]:
skills

["{'java', 'tableau', 'python'}",
 'set()',
 "{'sql', 'tableau'}",
 "{'sql', 'tableau'}",
 "{'hadoop', 'java', 'python'}"]

## Creating Dataframes

In [11]:
result_dict['name'] = names
result_dict['phone'] = phones
result_dict['email'] = emails
result_dict['skills'] = skills

In [12]:
result_dict

{'name': ['Alisson Parker-Wright                                                                                                 10/10/1974 \n\n223',
  'Fill',
  'Ashley Miles',
  'Ashley Miles',
  'John Dominic'],
 'phone': ["['8569878511']",
  "['(212) 555-1234']",
  "['6592251422']",
  "['6592251422']",
  "['7877756411']"],
 'email': [alli1414parks@mail.com,
  someone@example.com,
  ashleymiles@memail.com,
  ashleymiles@memail.com,
  johndominic@mail.com],
 'skills': ["{'java', 'tableau', 'python'}",
  'set()',
  "{'sql', 'tableau'}",
  "{'sql', 'tableau'}",
  "{'hadoop', 'java', 'python'}"]}

In [13]:
result_df = pd.DataFrame(result_dict)
result_df

Unnamed: 0,name,phone,email,skills
0,Alisson Parker-Wright ...,['8569878511'],alli1414parks@mail.com,"{'java', 'tableau', 'python'}"
1,Fill,['(212) 555-1234'],someone@example.com,set()
2,Ashley Miles,['6592251422'],ashleymiles@memail.com,"{'sql', 'tableau'}"
3,Ashley Miles,['6592251422'],ashleymiles@memail.com,"{'sql', 'tableau'}"
4,John Dominic,['7877756411'],johndominic@mail.com,"{'hadoop', 'java', 'python'}"


## Creating a CSV file

In [14]:
result_df.to_csv("Parsed_Data.csv")

## Reading created CSV file

In [15]:
df = pd.read_csv("Parsed_Data.csv")
df

Unnamed: 0.1,Unnamed: 0,name,phone,email,skills
0,0,Alisson Parker-Wright ...,['8569878511'],alli1414parks@mail.com,"{'java', 'tableau', 'python'}"
1,1,Fill,['(212) 555-1234'],someone@example.com,set()
2,2,Ashley Miles,['6592251422'],ashleymiles@memail.com,"{'sql', 'tableau'}"
3,3,Ashley Miles,['6592251422'],ashleymiles@memail.com,"{'sql', 'tableau'}"
4,4,John Dominic,['7877756411'],johndominic@mail.com,"{'hadoop', 'java', 'python'}"
