# NLTK and spaCy basics

**Reference**
* https://myrepublica.nagariknetwork.com/news/mining-data-from-social-media-for-good/
* https://github.com/kgyanwal/nltk_spacy


## String formatting 

In [1]:
# Fill the `{}` placeholder through str.format().
person = 'ram'
greeting = 'name = {}'.format(person)
print(greeting)

name = ram


In [2]:
# Rows of (author, book, year); the first tuple acts as the header row.
lib = [
    ('author', 'book', 'Year'),
    ('Krishana', 'NLP', 2021),
    ('Jeremiah', 'CCA150', 2019),
    ('Joe', 'Economy', 2020),
    ('Raj', 'Sweet Home', 2022),
]

# Pad the first two columns to a width of 10 so the rows line up.
for author, book, year in lib:
    print(f"{author:10} {book:10} {year}")

author     book       Year
Krishana   NLP        2021
Jeremiah   CCA150     2019
Joe        Economy    2020
Raj        Sweet Home 2022


## Date and Time Formatting 

In [3]:
from datetime import datetime
# A fixed date keeps the printed output identical on every run.
today = datetime(2020, 3, 18)
print(today)  # default str() rendering
print(f"{today:%B %d, %Y}")  # strftime directives used as the format spec
# Bare string as the last expression: the notebook echoes it as the cell result.
'Check strftime for datetime formatting'

2020-03-18 00:00:00
March 18, 2020


'Check strftime for datetime formatting'

In [4]:
# Fixed date first, so the first two printed lines never change between runs.
today = datetime(year=2020, month=3, day=18)
print(today)
print(f"{today:%B %d, %Y}")
# Check strftime for datetime formatting.

# From here on `today` holds the current local date and time.
today = datetime.today()
print("Today's date:", today)

# The same moment rendered with several strftime patterns.
print(today.strftime("%d/%m/%Y"))  # dd/mm/YYYY
print(today.strftime("%B %d, %Y"))  # full month name, day, 4-digit year
print(today.strftime("%m/%d/%y"))  # mm/dd/yy (2-digit year)
print(today.strftime("%b-%d-%Y"))  # abbreviated month name

# Date and time of day together.
now = datetime.now()
print(now.strftime("%d-%m-%Y %H:%M:%S"))

2020-03-18 00:00:00
March 18, 2020
Today's date: 2020-03-29 19:05:47.848100
29/03/2020
March 29, 2020
03/29/20
Mar-29-2020
29-03-2020 19:05:47


## Working with text files
* Write a file using: `%%writefile [options] filename`
    * options = `-a`    # append to the file instead of overwriting it.

* View the file using: `%less filename`
* Delete the file using: `%rm filename` (for example, `%rm ex_file.txt`)

In [5]:
%%writefile -a test.txt
Hello, this is a quick test file.
this is a second line of the text file.
this is another line

Appending to test.txt


In [6]:
%less test.txt
##%rm ex_file.txt


In [7]:
# Open test.txt with open()'s default mode and keep the handle in `myfile`.
# NOTE(review): the handle is never closed in this cell — confirm whether a
# later cell still needs it before switching to a `with` block.
myfile = open('test.txt')
# The value returned by read() is dropped here: it is not the last expression
# of the cell, so the notebook does not echo it.
myfile.read()
# The string below is the cell's last expression and is what gets echoed.
'''you can not call multiple read for the same file, 
if you want to re-read the file you have to use 
seek myfile.seek(0)'''

'you can not call multiple read for the same file, \nif you want to re-read the file you have to use \nseek myfile.seek(0)'

In [8]:
%less test.txt

# Working with pdf file

In [9]:
import PyPDF2
# Absolute path of the PDF to inspect.
pdf_filename = '/Users/gshyam/projects/work_projects/machine_learning/ANPA_dataScience/QuantumComputing2020/Quantum-Internet-A-vision-for-the-road-ahead.pdf'

# Open in binary mode and hand the stream to the reader.
pdf1 = open(pdf_filename, mode='rb')
pdf_file = PyPDF2.PdfFileReader(pdf1)

# Query the page count and the document metadata, then report both.
numpage = pdf_file.getNumPages()
info = pdf_file.getDocumentInfo()
print("total # of pages:", numpage)
print("pdf info: \n ", info)

total # of pages: 11
pdf info: 
  {'/CreationDate': "D:20181015122325+08'00'", '/Creator': 'Arbortext Advanced Print Publisher 9.1.440/W Unicode', '/ModDate': "D:20191219094152-08'00'", '/Producer': 'Acrobat Distiller 10.1.16 (Windows)', '/Title': 'Science Journals — AAAS'}


## Merge PDFs

In [10]:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter

sample_dir = './sample'


def MergePdfs(fils, outfile):
    """Concatenate several PDFs from ``sample_dir`` into one output PDF.

    Args:
        fils: Iterable of PDF file names, relative to ``sample_dir``; their
            pages are appended in the order given.
        outfile: File name of the merged PDF, written inside ``sample_dir``.
    """
    pdf_writer = PdfFileWriter()
    for fil in fils:
        pdf_reader = PdfFileReader(os.path.join(sample_dir, fil))
        for page_num in range(pdf_reader.getNumPages()):
            pdf_writer.addPage(pdf_reader.getPage(page_num))

    # The context manager closes the output file even if write() raises.
    with open(os.path.join(sample_dir, outfile), 'wb') as fout:
        pdf_writer.write(fout)


paths = ['pdf1.pdf', 'pdf2.pdf']
MergePdfs(paths, 'merged_pdf12.pdf')


## Rotate PDFs

In [11]:
sample_dir = './sample'


def RotatePages(pdffile, outfile):
    """Write a 3-page PDF whose first two pages are rotated.

    Page 0 is rotated 90 degrees clockwise, page 1 is rotated 90 degrees
    counter-clockwise and page 2 is copied as-is. The input therefore needs
    at least three pages; pages after the third are not copied.

    Args:
        pdffile: Input PDF file name, relative to ``sample_dir``.
        outfile: Output PDF file name, written inside ``sample_dir``.
    """
    pdf_reader = PdfFileReader(os.path.join(sample_dir, pdffile))
    pdf_writer = PdfFileWriter()

    page_1 = pdf_reader.getPage(0).rotateClockwise(90)  # rotated page
    pdf_writer.addPage(page_1)

    page_2 = pdf_reader.getPage(1).rotateCounterClockwise(90)  # rotated page
    pdf_writer.addPage(page_2)

    pdf_writer.addPage(pdf_reader.getPage(2))  # unrotated page

    # The context manager closes the output file even if write() raises.
    with open(os.path.join(sample_dir, outfile), 'wb') as fout:
        pdf_writer.write(fout)


pdf_in = 'pdf2.pdf'
pdf_out = 'pdf2_rotated.pdf'

RotatePages(pdf_in, pdf_out)
    

## Add Watermark

### Add a separate picture as a watermark

In [12]:
def addWatermark(pdffile, outfile, watermark):
    """Merge the first page of a watermark PDF into every page of a PDF.

    Args:
        pdffile: Input PDF file name, relative to ``sample_dir``.
        outfile: Output PDF file name, written inside ``sample_dir``.
        watermark: Watermark PDF file name, relative to ``sample_dir``; only
            its first page is used.
    """
    f_in = os.path.join(sample_dir, pdffile)
    f_out = os.path.join(sample_dir, outfile)
    fwatermark = os.path.join(sample_dir, watermark)

    watermark_page = PdfFileReader(fwatermark).getPage(0)

    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(f_in)

    # Separate names for the index and the page object (the loop variable
    # used to be rebound from one to the other).
    for page_num in range(pdf_reader.getNumPages()):
        page = pdf_reader.getPage(page_num)
        page.mergePage(watermark_page)
        pdf_writer.addPage(page)

    # The context manager closes the output file even if write() raises.
    with open(f_out, 'wb') as fout:
        pdf_writer.write(fout)


pdf_in = 'pdf2.pdf'
pdf_out = 'pdf2_watermark.pdf'
pdf_watermark = 'watermark.pdf'

addWatermark(pdf_in, pdf_out, pdf_watermark)
    

### Add one of the pages as a watermark

In [13]:
def addWatermark2(pdffile, outfile):
    """Use the last page of a PDF as the watermark for all its other pages.

    The last page is merged into each preceding page; the last page itself
    is not added to the output.

    Args:
        pdffile: Input PDF file name, relative to ``sample_dir``.
        outfile: Output PDF file name, written inside ``sample_dir``.
    """
    f_in = os.path.join(sample_dir, pdffile)
    f_out = os.path.join(sample_dir, outfile)

    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(f_in)

    Npage = pdf_reader.getNumPages()
    watermark = pdf_reader.getPage(Npage - 1)

    # Every page except the last one, which serves as the watermark.
    for page_num in range(Npage - 1):
        page = pdf_reader.getPage(page_num)
        page.mergePage(watermark)
        pdf_writer.addPage(page)

    # The context manager closes the output file even if write() raises.
    with open(f_out, 'wb') as fout:
        pdf_writer.write(fout)


pdf_in = 'pdf2.pdf'
pdf_out = 'pdf2_watermark2.pdf'

addWatermark2(pdf_in, pdf_out)

## Encrypt a PDF

In [14]:
def EncryptPDF(pdffile, outfile, password):
    """Copy every page of a PDF into a new, password-protected PDF.

    Args:
        pdffile: Input PDF file name, relative to ``sample_dir``.
        outfile: Output PDF file name, written inside ``sample_dir``.
        password: Passed to the writer as the user password; no separate
            owner password is supplied (``owner_pwd=None``).
    """
    f_in = os.path.join(sample_dir, pdffile)
    f_out = os.path.join(sample_dir, outfile)

    pdf_writer = PdfFileWriter()
    pdf_reader = PdfFileReader(f_in)

    for page_num in range(pdf_reader.getNumPages()):
        pdf_writer.addPage(pdf_reader.getPage(page_num))

    pdf_writer.encrypt(user_pwd=password, owner_pwd=None, use_128bit=True)

    # The context manager closes the output file even if write() raises.
    with open(f_out, 'wb') as fout:
        pdf_writer.write(fout)


pdf_in = 'pdf2.pdf'
pdf_out = 'pdf2_encrypted.pdf'
password = '1234'  # demo value only

EncryptPDF(pdf_in, pdf_out, password)

# Regular Expression

**Syntax**: re.match(pattern, string, flags=0)

Source: https://www.tutorialspoint.com/python/python_reg_expressions.htm

* \d : digit : [0-9]
* \D : non-digit [^0-9]
* \w : alphanumeric : [A-Za-z0-9_]
* \W : Non-alphanumeric  : [^A-Za-z0-9_]
* \s : whitespace : [ \t\n\r\f\v]
* \S : Non-whitespace : [^ \t\n\r\f\v]
* re.I : case-insensitive matching.

In [15]:
# Sample sentence; the regex cells that follow search this same string.
text = "The phone number of the agent is 408-099-0102. call soon in that phone number!"

# Plain substring tests with `in`: one number that is present, one that is not.
for candidate in ("408-099-0102", "408-099-0103"):
    print(candidate in text)

True
False


In [16]:
import re

pattern = 'phone'

# search() scans the whole string, while match() only tries the very start,
# so for this text the first finds a hit and the second returns None.
found_anywhere = re.search(pattern, text)
found_at_start = re.match(pattern, text)
print(found_anywhere)
print(found_at_start)

<re.Match object; span=(4, 9), match='phone'>
None


In [17]:
# Collect every occurrence of the literal word, then show the list and its size.
all_matches = re.findall("phone", text)
match_count = len(all_matches)
print(all_matches, match_count)

['phone', 'phone'] 2


In [18]:
# Walk the match objects one by one and show where each hit sits in `text`.
for match in re.finditer("phone", text):
    start_end = match.span()  # (start, end) offsets of this hit
    print(start_end)

(4, 9)
(65, 70)


In [19]:
# Three digits, three digits, four digits, joined by hyphens.
pattern = r'\d\d\d-\d\d\d-\d\d\d\d'

phone_num = re.search(pattern, text)
# Group 0 is the whole matched substring; as the last expression it is echoed.
phone_num.group(0)

'408-099-0102'

In [20]:
# A sentence holding several phone numbers.
text1 = "phone number1 is 408-099-2222, phone number2 is 408-199-0000,  phone number3 is 408-299-1111,"

pattern = r'\d\d\d-\d\d\d-\d\d\d\d'
# findall() returns every non-overlapping match as a list of strings.
group_phone = re.compile(pattern).findall(text1)
print(group_phone)


['408-099-2222', '408-199-0000', '408-299-1111']


## Quantifiers

**Repetition Cases**
* ruby+ : Match "rub" followed by one or more "y"s
* \d{3} : Match exactly 3 digits
* \d{3,} : Match 3 or more digits
* \d{3,5} : Match 3, 4, or 5 digits


In [21]:
phone = '123-456-0989'
# {n} repeats the preceding token exactly n times.
quantifiers = re.search(r'\d{3}-\d{3}-\d{4}', phone)
# Group 0 is the whole match; as the last expression it is echoed by the notebook.
quantifiers.group(0)

'123-456-0989'

In [22]:
phrase = "There are 3 numbers 34 inside this 5 sentences"
# [^...] negates the class: grab each run of characters that are not digits.
non_digit_runs = re.findall(r"[^\d]+", phrase)
non_digit_runs

['There are ', ' numbers ', ' inside this ', ' sentences']

In [23]:
test_phrase = "This is a string! but it has a puncuation. How to remove it?"

# Keep every run of characters that contains none of '!', '.' or '?'.
mylist = re.findall(r"[^!.?]+", test_phrase)
print(mylist)

# Glue the pieces back together, now without those three characters.
list2 = "".join(mylist)
print(list2)

['This is a string', ' but it has a puncuation', ' How to remove it']
This is a string but it has a puncuation How to remove it


In [24]:
text = "only find the hyphon-words. Where are the long-ish dash words?"
# One or more word characters, a hyphen, then one or more word characters.
hyphenated = re.findall(r"[\w]+-[\w]+", text)
hyphenated

['hyphon-words', 'long-ish']