In [6]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
file_docs = []

with open ('sample1.txt') as f:
    tokens = f.read()
    file_docs.append(tokens)

print(file_docs)

['Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1] It is seen as a subset of artificial intelligence. Machine learning algorithms build a model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.\n\nA subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.[4][5] In its applicati

In [3]:
documents=file_docs
search_terms = 'machine learning'
# search_terms = 'tomato'
# search_terms = 'sewing machine'

vectorizer = TfidfVectorizer(stop_words=stop_words)
vectors = vectorizer.fit_transform([search_terms] + documents)

# Calculate the word frequency, and calculate the cosine similarity of the search terms to the documents
cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()
document_scores = [item.item() for item in cosine_similarities[1:]]  # convert back to native Python dtypes



In [4]:
# Print the top-scoring results and their titles
scores = [(score) for score in zip(document_scores)]
scores


[(0.5842832127691175,)]

### Tryout 2

In [7]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

class LemmaTokenizer:
    """
    Interface to the WordNet lemmatizer from nltk
    """
    ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, doc):
        return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]

In [8]:
tokenizer=LemmaTokenizer()

tokenizer('It was raining cats and dogs in FooBar')

['It', 'wa', 'raining', 'cat', 'and', 'dog', 'in', 'FooBar']

In [15]:
with open ('sample1.txt') as f:
    tokens=f.read()
tokens

'Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1] It is seen as a subset of artificial intelligence. Machine learning algorithms build a model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.\n\nA subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.[4][5] In its applicatio

In [17]:
documents=[tokens]
search_terms = 'machine learning is a study that improves automatically such as email filtering'
# search_terms = 'sewing machine'

# Initialise TfidfVectorizer with the LemmaTokenizer. Also need to lemmatize the stop words as well
token_stop = tokenizer(' '.join(stop_words))
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)

# Calculate the word frequency, and calculate the cosine similarity of the search terms to the documents
vectors = vectorizer.fit_transform([search_terms] + documents)
cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()

document_scores = [item.item() for item in cosine_similarities[1:]]  # convert back to native Python dtypes

scores = [(score) for score in zip(document_scores)]
perc_scores=scores[0][0]*100
perc_scores

36.557067649429456

In [93]:
search_terms = ['machine learning is a study that improves automatically','machine learning study email']
search_terms[0]

'machine learning is a study that improves automatically'

## multiple queries

In [18]:
search_terms = ['machine learning is a study that improves automatically such as email filtering','machine learning is a study that improves automatically such as email filtering','machine learning','Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1] It is seen as a subset of artificial intelligence. Machine learning algorithms build a model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.\n\nA subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.[4][5] In its application across business problems, machine learning is also referred to as predictive analytics.']
# search_terms = 'sewing machine'

# Initialise TfidfVectorizer with the LemmaTokenizer. Also need to lemmatize the stop words as well
token_stop = tokenizer(' '.join(stop_words))
vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)

# Calculate the word frequency, and calculate the cosine similarity of the search terms to the documents

for i in range(len(search_terms)):
    vectors = vectorizer.fit_transform([search_terms[i]] + documents)
    cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()

    document_scores = [item.item() for item in cosine_similarities[1:]]  # convert back to native Python dtypes

    scores = [(score) for score in zip(document_scores)]
    perc_scores=scores[0][0]*100
    print(i+1," Corupus :",round(perc_scores,2),"%")
    
#perc_scores







actual='sample1.txt'
query=['machine learning is a study that improves automatically such as email filtering','machine learning is a study that improves automatically such as email filtering','machine learning','Machine learning (ML) is the study of computer algorithms that improve automatically through experience.[1] It is seen as a subset of artificial intelligence. Machine learning algorithms build a model based on sample data, known as "training data", in order to make predictions or decisions without being explicitly programmed to do so.[2] Machine learning algorithms are used in a wide variety of applications, such as email filtering and computer vision, where it is difficult or unfeasible to develop conventional algorithms to perform the needed tasks.\n\nA subset of machine learning is closely related to computational statistics, which focuses on making predictions using computers; but not all machine learning is statistical learning. The study of mathematical optimization delivers methods, theory and application domains to the field of machine learning. Data mining is a related field of study, focusing on exploratory data analysis through unsupervised learning.[4][5] In its application across business problems, machine learning is also referred to as predictive analytics.']
    

1  Corupus : 36.56 %
2  Corupus : 36.56 %
3  Corupus : 52.42 %
4  Corupus : 100.0 %


In [6]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def get_similarity_score(actual,query):
    class LemmaTokenizer:
        
   
        ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
            return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]

    tokenizer=LemmaTokenizer()
    with open (actual) as f:
        tokens=f.read()
    #tokenizer(tokens)
    
    documents=tokens
    search_terms = query
    # search_terms = 'sewing machine'

    # Initialise TfidfVectorizer with the LemmaTokenizer. Also need to lemmatize the stop words as well
    token_stop = tokenizer(' '.join(stop_words))
    vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)

    # Calculate the word frequency, and calculate the cosine similarity of the search terms to the documents

    for i in range(len(search_terms)):
        
        vectors = vectorizer.fit_transform([search_terms[i]] + [documents])
        cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()

        document_scores = [item.item() for item in cosine_similarities[1:]]  # convert back to native Python dtypes

        scores = [(score) for score in zip(document_scores)]
        perc_scores=scores[0][0]*100
        print(i+1," Corupus :",round(perc_scores,2),"%")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [21]:
get_similarity_score(actual,query)

1  Corupus : 36.56 %
2  Corupus : 36.56 %
3  Corupus : 52.42 %
4  Corupus : 100.0 %


In [1]:
from Scripts_Tryouts.extract_text_from_pdf import get_text_ocr_pdf

In [2]:
actual_path=r"D:\Hackoff Mod\Machine Learning.pdf"

query_path=r"D:\Hackoff Mod\Deep Learning.pdf"


In [3]:
act=get_text_ocr_pdf(actual_path)
act

'Machine Learning   Machine Learning is the field of study that gives computers the capability to learn without being explicitly programmed. ML is one of the most exciting technologies that one would have ever come across. As it is evident from the name, it gives the computer that makes it more similar to humans: The ability to learn. Machine learning is actively being used today, perhaps in many more places than one would expect. Getting started with Machine Learning Last Updated: 11-05-2020 This article discusses the categories of machine learning problems, and terminologies used in the field of machine learning. Types of machine learning problems There are various ways to classify machine learning problems. Here, we discuss the most obvious ones. 1. On basis of the nature of the learning “signal” or “feedback” available to a learning system •  Supervised learning: The computer is presented with example inputs and their desired outputs, given by a “teacher”, and the goal is to learn 

In [4]:
que=get_text_ocr_pdf(query_path)
que

'Deep learning (also known as deep structured learning) is part of a broader family of machine learning methods based on artificial neural networks with representation learning. Learning can be supervised, semi-supervised or unsupervised.[1][2][3] Deep-learning architectures such as deep neural networks, deep belief networks, recurrent neural networks and convolutional neural networks have been applied to fields including computer vision, machine vision, speech recognition, natural language processing, audio recognition, social network filtering, machine translation, bioinformatics, drug design, medical image analysis, material inspection and board game programs, where they have produced results comparable to and in some cases surpassing human expert performance.[4][5][6] Artificial neural networks (ANNs) were inspired by information processing and distributed communication nodes in biological systems. ANNs have various differences from biological brains. Specifically, neural networks 

In [11]:
from nltk import word_tokenize          
from nltk.stem import WordNetLemmatizer

import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
import numpy as np
import nltk
from nltk.corpus import stopwords

nltk.download('punkt')
stop_words = set(stopwords.words('english'))

def get_similarity_score(actual,query):
    class LemmaTokenizer:
        
   
        ignore_tokens = [',', '.', ';', ':', '"', '``', "''", '`']
        def __init__(self):
            self.wnl = WordNetLemmatizer()
        def __call__(self, doc):
            return [self.wnl.lemmatize(t) for t in word_tokenize(doc) if t not in self.ignore_tokens]

    tokenizer=LemmaTokenizer()
    
    tokens=str(actual)
    #tokenizer(tokens)
    
    documents=tokens
    search_terms = str(query)
    # search_terms = 'sewing machine'

    # Initialise TfidfVectorizer with the LemmaTokenizer. Also need to lemmatize the stop words as well
    token_stop = tokenizer(' '.join(stop_words))
    vectorizer = TfidfVectorizer(stop_words=token_stop, tokenizer=tokenizer)

    # Calculate the word frequency, and calculate the cosine similarity of the search terms to the documents

    
        
    vectors = vectorizer.fit_transform([search_terms] + [documents])
    cosine_similarities = linear_kernel(vectors[0:1], vectors).flatten()
    document_scores = [item.item() for item in cosine_similarities[1:]]  # convert back to native Python dtypes

    scores = [(score) for score in zip(document_scores)]
    perc_scores=scores[0][0]*100
    print(" Corupus Similarity Ratio:",round(perc_scores,2),"%")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Harsh\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [15]:
get_similarity_score(act,que)

 Corupus Similarity Ratio: 16.71 %


In [13]:
quer=que+'machine learning algorithms functions'

In [14]:
get_similarity_score(act,quer)

 Corupus Similarity Ratio: 18.61 %
