In [3]:
# !pip install pytesseract
# !pip install tesseract
# !pip install google-cloud-vision
# !pip install gtts
import nltk
import re
from textblob import TextBlob
import spacy
from gtts import gTTS
import os
from spacy import displacy 
import cv2
import pytesseract
import numpy as np
from nltk.stem import WordNetLemmatizer
# Fetch the NLTK data packages for the tokenizer ('punkt'), the POS tagger
# ('averaged_perceptron_tagger') and the WordNet corpus used by the
# WordNetLemmatizer further down in this notebook.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

Collecting gtts
  Downloading gTTS-2.2.4-py3-none-any.whl (26 kB)
Installing collected packages: gtts
Successfully installed gtts-2.2.4


You should consider upgrading via the 'c:\users\ppetropo\appdata\local\programs\python\python39\python.exe -m pip install --upgrade pip' command.


# Digital Image

In [4]:
# Read the two sample images (digital screenshot and handwritten note).
img = cv2.imread('Instagram.jpg')
img2 = cv2.imread('handwritten1.jpg')

# cv2.imread signals a missing/unreadable file by returning None rather than
# raising, which otherwise surfaces later as a confusing error inside
# OpenCV or Tesseract. Fail here with a clear message instead.
for _path, _image in (('Instagram.jpg', img), ('handwritten1.jpg', img2)):
    if _image is None:
        raise FileNotFoundError(f"cv2.imread could not read {_path!r}")

# Tesseract command-line options passed to pytesseract:
# --oem selects the OCR engine mode, --psm the page segmentation mode.
config = r'--oem 3 --psm 6'

# get grayscale image
def get_grayscale(image):
    """Return ``image`` converted from BGR colour to grayscale.

    The conversion is delegated to ``cv2.cvtColor`` with the
    ``cv2.COLOR_BGR2GRAY`` code.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return grayscale

# noise removal
def remove_noise(image, ksize=5):
    """Smooth ``image`` with a median filter.

    Parameters
    ----------
    image : image array accepted by ``cv2.medianBlur``.
    ksize : int, optional
        Aperture size forwarded to ``cv2.medianBlur``. Defaults to 5, the
        value this notebook originally hard-coded. OpenCV requires an odd
        value greater than 1.

    Returns
    -------
    The blurred image returned by ``cv2.medianBlur``.
    """
    return cv2.medianBlur(image, ksize)
 
#thresholding
def thresholding(image):
    """Binarise ``image`` with Otsu's automatically chosen threshold.

    ``cv2.threshold`` returns a ``(threshold_value, image)`` pair; only the
    thresholded image is returned to the caller.
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    _, binary = cv2.threshold(image, 0, 255, mode)
    return binary

#dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate ``image`` with a square all-ones structuring element.

    Parameters
    ----------
    image : image array accepted by ``cv2.dilate``.
    kernel_size : int, optional
        Side length of the square kernel. Defaults to 5 (the 5x5 kernel the
        notebook originally hard-coded).
    iterations : int, optional
        Number of times the dilation is applied. Defaults to 1.

    Returns
    -------
    The dilated image returned by ``cv2.dilate``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
#erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode ``image`` with a square all-ones structuring element.

    Parameters
    ----------
    image : image array accepted by ``cv2.erode``.
    kernel_size : int, optional
        Side length of the square kernel. Defaults to 5 (the 5x5 kernel the
        notebook originally hard-coded).
    iterations : int, optional
        Number of times the erosion is applied. Defaults to 1.

    Returns
    -------
    The eroded image returned by ``cv2.erode``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

#opening - erosion followed by dilation
def opening(image, kernel_size=5):
    """Apply a morphological opening (erosion followed by dilation).

    Parameters
    ----------
    image : image array accepted by ``cv2.morphologyEx``.
    kernel_size : int, optional
        Side length of the square all-ones kernel. Defaults to 5 (the 5x5
        kernel the notebook originally hard-coded).

    Returns
    -------
    The opened image returned by ``cv2.morphologyEx`` with
    ``cv2.MORPH_OPEN``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image, threshold1=100, threshold2=200):
    """Run Canny edge detection on ``image``.

    Parameters
    ----------
    image : image array accepted by ``cv2.Canny``.
    threshold1, threshold2 : int, optional
        The two hysteresis thresholds forwarded to ``cv2.Canny``. They
        default to 100 and 200, the values the notebook originally
        hard-coded.

    Returns
    -------
    The edge map returned by ``cv2.Canny``.
    """
    return cv2.Canny(image, threshold1, threshold2)

#skew correction
def deskew(image):
    """Rotate ``image`` about its centre to undo the estimated text skew.

    The skew is estimated from the minimum-area rectangle enclosing every
    pixel whose value is greater than zero. The result has the same width
    and height as the input; border pixels are replicated to fill the
    corners exposed by the rotation.
    """
    # Coordinates of all non-zero pixels, one (row, col) pair per row.
    foreground = np.column_stack(np.where(image > 0))

    # The last element of the minAreaRect result is the rectangle's angle.
    # NOTE(review): the sign/range handling below appears to assume the angle
    # convention of older OpenCV releases — confirm against the installed
    # version before relying on this function.
    rect_angle = cv2.minAreaRect(foreground)[-1]
    angle = -(90 + rect_angle) if rect_angle < -45 else -rect_angle

    height, width = image.shape[:2]
    rotation = cv2.getRotationMatrix2D((width // 2, height // 2), angle, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

#template matching
def match_template(image, template):
    """Return the match map of ``template`` slid over ``image``.

    Uses ``cv2.matchTemplate`` with the ``cv2.TM_CCOEFF_NORMED`` comparison
    method and returns its result unchanged.
    """
    scores = cv2.matchTemplate(image, template, cv2.TM_CCOEFF_NORMED)
    return scores


# Lemmatization

In [None]:
def funclemmatize(tokens):
    """Print every token next to its WordNet lemma as ``token -> lemma``.

    ``tokens`` is any iterable of strings. Nothing is returned; the pairs
    are written to standard output one per line.
    """
    wordnet = WordNetLemmatizer()
    for word in tokens:
        lemma = wordnet.lemmatize(word)
        print(word, '->', lemma)

# Stemming with PorterStemmer

In [None]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

def porterStem(tokens):
    """Print every token next to its Porter stem as ``token -> stem``.

    ``tokens`` is any iterable of strings. Nothing is returned; the pairs
    are written to standard output one per line.
    """
    stem = PorterStemmer().stem
    # The generator is lazy, so each token is stemmed right before it is printed.
    for token, stemmed in ((tok, stem(tok)) for tok in tokens):
        print(token, '->', stemmed)

# Stemming with SnowballStemmer

In [None]:
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize

def snowballStem(tokens, lng):
    """Print every token next to its Snowball stem as ``token -> stem``.

    ``tokens`` is any iterable of strings and ``lng`` is the language name
    handed to ``SnowballStemmer``. Nothing is returned; the pairs are
    written to standard output one per line.
    """
    stemmer = SnowballStemmer(lng)
    for token in tokens:
        stemmed = stemmer.stem(token)
        print(token, '->', stemmed)

In [20]:
# Run the preprocessing helpers on the digital image.
gray = get_grayscale(img)
thresh = thresholding(gray)
# Keep the results under their own names. Assigning them to `opening` and
# `canny` would replace the helper functions with arrays, so this cell could
# not be re-run and any later call to the helpers would fail.
opening_img = opening(gray)
canny_img = canny(gray)

In [5]:
# Location of the Tesseract executable. Set the TESSERACT_CMD environment
# variable to override the machine-specific default below.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:/Users/ppetropo/AppData/Local/Programs/Tesseract-OCR/tesseract.exe',
)

# OCR the digital image, show the recognised lines, then flatten them into a
# single space-separated string for the NLP cells that follow.
text = pytesseract.image_to_string(img, config=config)
ocr_lines = text.split('\n')
print(ocr_lines)
text = ' '.join(ocr_lines)
print(text)

['L ive in Luv.', 'Fvery day [ go', 'to work ly Lis', 'Also I would', 'like £0 visit Hars', '']
L ive in Luv. Fvery day [ go to work ly Lis Also I would like £0 visit Hars 


# Text Analysis using NLTK

In [6]:
# Split the OCR text into tokens with NLTK's word tokenizer and show them.
t = nltk.tokenize.word_tokenize(text)
print(t)
# Tag every token with its part of speech. As the last expression of the cell
# the list of (token, tag) pairs is displayed as the cell output.
nltk.pos_tag(t)

['L', 'ive', 'in', 'Luv', '.', 'Fvery', 'day', '[', 'go', 'to', 'work', 'ly', 'Lis', 'Also', 'I', 'would', 'like', '£0', 'visit', 'Hars']


[('L', 'NNP'),
 ('ive', 'NN'),
 ('in', 'IN'),
 ('Luv', 'NNP'),
 ('.', '.'),
 ('Fvery', 'NNP'),
 ('day', 'NN'),
 ('[', 'NNP'),
 ('go', 'VBP'),
 ('to', 'TO'),
 ('work', 'VB'),
 ('ly', 'NN'),
 ('Lis', 'NNP'),
 ('Also', 'RB'),
 ('I', 'PRP'),
 ('would', 'MD'),
 ('like', 'VB'),
 ('£0', 'NNP'),
 ('visit', 'NN'),
 ('Hars', 'NNP')]

# Text Analysis using SpaCy

In [7]:

# Load the small English spaCy model; the 'ner' and 'textcat' components are
# disabled because this notebook only prints POS tags and the dependency parse.
nlp = spacy.load('en_core_web_sm',disable=['ner','textcat'])
# Run the pipeline on the OCR text to obtain a spaCy Doc.
doc = nlp(text)

# Print each token together with its coarse part-of-speech tag.
for token in doc:
    print(token.text,'->',token.pos_)

L -> NOUN
i -> PRON
ve -> VERB
in -> ADP
Luv -> PROPN
. -> PUNCT
Fvery -> NOUN
day -> NOUN
[ -> PUNCT
go -> VERB
to -> PART
work -> VERB
ly -> ADP
Lis -> PROPN
Also -> ADV
I -> PRON
would -> AUX
like -> VERB
£ -> SYM
0 -> NUM
visit -> NOUN
Hars -> NOUN


In [8]:
# List only the tokens that spaCy tagged as nouns.
for token in doc:
    if token.pos_ != 'NOUN':
        continue
    print(token.text)

L
Fvery
day
visit
Hars


In [10]:

# Draw the dependency parse of the OCR text inline in the notebook.
displacy.render(doc, style='dep',jupyter=True)

# Text Analysis using TextBlob

In [12]:
# Wrap the OCR text in a TextBlob and print TextBlob's spelling-corrected
# version of it.
tb = TextBlob(text)
corrected = tb.correct()
print(corrected)

L give in But. Very day [ go to work ly His Also I would like £0 visit Wars 


# Text Analysis using Custom Tokenizer

In [None]:
def customTokenizer(text):
    """Split ``text`` into tokens with a hand-written regular expression.

    The alternatives of the pattern are tried in order: dotted capital-letter
    abbreviations, a word followed by a period, numbers joined by ``.`` or
    ``/``, the clitic ``n't``, plain words, hyphenated words, single
    punctuation characters and a double hyphen. Whitespace between matches
    is dropped.

    Every ``/`` inside a token is escaped as ``\\/`` before the token is
    returned.

    Returns a list of token strings (empty for empty input).
    """
    token_re = re.compile(r'(?:[A-Z]\.)+|[A-Za-z]+\.|\d+(?:[\./]\d+)|n\'t|\b\w+(?!\'t)|\w+(?:-\w+)*|[!\"#$%&\'()*+,./:;<=>?@[\]^_`{|}~]|-{2}')
    return [match.strip().replace('/', '\\/') for match in token_re.findall(text)]

# Tokenize the OCR text, as the NLTK / spaCy / TextBlob sections do. The
# previous argument `sentence` is never defined in this notebook, so the cell
# raised NameError on a fresh kernel.
x = customTokenizer(text)
print(x)

# Handwritten Image

In [None]:
# OCR the handwritten sample. `img2` holds 'handwritten1.jpg'; the cell
# previously re-processed the digital image `img` by mistake.
text = pytesseract.image_to_string(img2, config=config)
handwritten_lines = text.split('\n')
print(handwritten_lines)
text = ' '.join(handwritten_lines)

# NLTK: tokenize and POS-tag. The tags are printed explicitly because an
# expression in the middle of a cell is not displayed.
t = nltk.tokenize.word_tokenize(text)
print("With NLTK: ", t)
print(nltk.pos_tag(t))

# spaCy: load the English model and parse the handwritten text.
nlp = spacy.load('en_core_web_sm', disable=['ner', 'textcat'])
doc2 = nlp(text)
print("With SpaCy")
# Iterate over doc2 (the handwritten text), not the earlier `doc`.
for token in doc2:
    print(token.text, '->', token.pos_)

displacy.render(doc2, style='dep', jupyter=True)

# Text From Image - Text to Audio

In [13]:

# Reload the digital image; cv2.imread returns None when the file cannot be
# read, so fail early with a clear message.
img = cv2.imread('Instagram.jpg')
if img is None:
    raise FileNotFoundError("cv2.imread could not read 'Instagram.jpg'")

img = cv2.resize(img, (600, 360))
hImg, wImg = img.shape[:2]

# Per-character boxes (one "char x1 y1 x2 y2 ..." line each) and the plain
# recognised text that will be spoken.
boxes = pytesseract.image_to_boxes(img)
xy = pytesseract.image_to_string(img)

# Draw every detected character box. The drawing statements used to sit
# outside the loop, so only the last box was drawn and an empty result raised
# NameError on `b`.
for b in boxes.splitlines():
    b = b.split(' ')
    x, y, w, h = int(b[1]), int(b[2]), int(b[3]), int(b[4])
    # Box coordinates are flipped vertically (hImg - y) to match OpenCV's
    # top-left image origin.
    cv2.rectangle(img, (x, hImg - y), (w, hImg - h), (50, 50, 255), 1)
    cv2.putText(img, b[0], (x, hImg - y + 13), cv2.FONT_HERSHEY_SIMPLEX, 0.4, (50, 205, 50), 1)

# imshow only paints the window once the HighGUI event loop runs: wait for a
# key press, then close the window so the kernel is not left hanging.
cv2.imshow('Detected text', img)
cv2.waitKey(0)
cv2.destroyAllWindows()

# Convert the recognised text to speech and open the result with the
# operating system's default handler.
# NOTE(review): gTTS is documented to write MP3 data, so the '.wav' extension
# is probably misleading — confirm before renaming the file.
audio = gTTS(text = xy, lang = 'en', slow = False)
audio.save("saved_audio.wav")
os.system("saved_audio.wav")

0