In [1]:
import io
import os
import re
import string
import nltk
import pandas as pd
import docx2txt
from datetime import datetime
from dateutil import relativedelta
from pdfminer.converter import TextConverter
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.layout import LAParams
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFSyntaxError
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from spacy import displacy

In [2]:
import spacy
nlp = spacy.load('en_core_web_lg')

In [3]:
def extract_text_from_pdf(pdf_data):
    '''
    Helper function to extract the plain text from .pdf files
    :return: iterator of string of extracted text
    '''
    try:
        for page in PDFPage.get_pages(
                pdf_data,
                caching=True,
                check_extractable=True
        ):
            resource_manager = PDFResourceManager()
            handle = io.StringIO()
            converter = TextConverter(
                resource_manager,
                handle,
                codec='utf-8',
                laparams=LAParams()
            )
            page_interpreter = PDFPageInterpreter(
                resource_manager,
                converter
            )
            page_interpreter.process_page(page)

            text = handle.getvalue()
            yield text

            # close open handles
            converter.close()
            handle.close()
    except PDFSyntaxError:
        return

In [4]:
def convert_to_utf_8(text_data):
    retval = ""
    for line in text_data:
        line = line.strip()
        line = bytes(line, 'utf-8').decode('utf-8', 'ignore')
        retval += line
    retval = retval.replace("\n", " ").replace("\uf0e0", " ").replace("\uf08c", " ").replace("\uf015", " ")   
    return retval.strip()

In [5]:
data = None
with open("OmkarResume.pdf", "rb") as infile:
    data = list(extract_text_from_pdf(infile))

In [6]:
text = convert_to_utf_8(data)
doc = nlp(text)

In [7]:
text

'Omkar Pathak  SOFTWARE ENGINEER · FULL STACK PYTHON DEVELOPER  \uf10b (+91) 8087996634  Pune, Maharashtra, India |   omkarpathak27@gmail.com |   www.omkarpathak.in  | \uf092 OmkarPathak  |   omkar-pathak-94473811b  “Make the change that you want to see in the world.”  Experience  Schlumberger Pune, Maharashtra, India DATA ENGINEER July 2018 - Present • Responsible for implementing and managing an end-to-end CI/CD Pipeline with custom validations for Informatica migrations which  brought migration time to 1.5 hours from 9 hours without any manual intervention  • Enhancing, auditing and maintaining custom data ingestion framework that ingest around 1TB of data each day to over 70 business  units  • Working with L3 developer team to ensure the discussed Scrum PBI’s are delivered on time for data ingestions • Planning and Executing QA and Production Release Cycle activities Truso FULL STACK DEVELOPER INTERN • Created RESTful apis • Tried my hands on Angular 5/6 • Was responsible for Djang

In [12]:
displacy.render(doc, style="ent")

In [9]:
sents = list(doc.sents)

In [12]:
people = [ee for ee in doc.ents if ee.label_ == 'PERSON']

In [13]:
people

[Omkar Pathak,
 Schlumberger,
 Propeluss,
 Kleiber,
 Raspberry Pi,
 Quora,
 Raspberry Pi,
 Quora]

In [100]:
vocab = list(sents[0].vocab)

In [101]:
dir(vocab[0])

['__class__',
 '__delattr__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__ne__',
 '__new__',
 '__pyx_vtable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'check_flag',
 'cluster',
 'flags',
 'from_bytes',
 'has_vector',
 'is_alpha',
 'is_ascii',
 'is_bracket',
 'is_currency',
 'is_digit',
 'is_left_punct',
 'is_lower',
 'is_oov',
 'is_punct',
 'is_quote',
 'is_right_punct',
 'is_space',
 'is_stop',
 'is_title',
 'is_upper',
 'lang',
 'lang_',
 'like_email',
 'like_num',
 'like_url',
 'lower',
 'lower_',
 'norm',
 'norm_',
 'orth',
 'orth_',
 'prefix',
 'prefix_',
 'prob',
 'rank',
 'sentiment',
 'set_attrs',
 'set_flag',
 'shape',
 'shape_',
 'similarity',
 'suffix',
 'suffix_',
 'text',
 'to_bytes',
 'vector',
 'vector_norm',
 'vocab']