In [1]:
from main import PlagarismChecker

In [2]:
# Reference passage that is compared against every extracted article further down.
# Written as adjacent string literals (one sentence each) that Python joins into a
# single string; each piece except the last ends with the separating space.
text = (
    "Dogs are one of the most popular domestic pets and have been bred for thousands of years for various purposes, such as hunting, herding, and guarding. "
    "They belong to the Canidae family and are descendants of wolves. "
    "Today, dogs come in a wide range of breeds, each with its own unique physical appearance and personality traits. "
    "They are known for their loyalty, affection, and obedience to their owners. "
    "Dogs are also highly trainable and can be taught various commands and tricks, which makes them useful for various tasks, such as assistance for people with disabilities and law enforcement. "
    "They also make great companions for children and adults alike, providing comfort and emotional support."
)

In [3]:
# Run the search helper for the keyword 'dog' (num_pages=3); the result is
# handed to PlagarismChecker.extract_content in the next cell.
relevant_links = PlagarismChecker.google_search('dog', num_pages=3)

In [4]:
# Pull the page content for the search results. Presumably yields one text
# string per link (later cells index and iterate it) — verify in main.py.
extracted_content = PlagarismChecker.extract_content(relevant_links)

ERROR:trafilatura.downloads:not a 200 response: 403 for URL https://www.dog.com/


In [6]:
import nltk
def get_part_of_speech(provided_word: str) -> str:
    """Reduce the NLTK tag of a single word to a one-letter part-of-speech code.

    The word is tagged on its own with ``nltk.pos_tag`` and the tag is mapped
    by substring match, checked in this order:

    * contains ``NN`` -> ``'n'``
    * contains ``VB`` -> ``'v'``
    * contains ``JJ`` -> ``'a'``
    * contains ``RB`` -> ``'r'``

    Any other tag falls back to ``'n'``.
    """
    tag = nltk.pos_tag([provided_word])[0][1]
    # Ordered table: the first marker found inside the tag decides the code.
    for marker, code in (('NN', 'n'), ('VB', 'v'), ('JJ', 'a'), ('RB', 'r')):
        if marker in tag:
            return code
    return 'n'

In [7]:
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

def text_preparation(text: str) -> list:
    """Turn raw text into a list of normalized, lemmatized word tokens.

    Processing steps, applied to each token in document order:

    1. tokenize with ``word_tokenize`` and lowercase;
    2. drop tokens found in NLTK's English stop-word list;
    3. strip every character that is not an ASCII letter;
    4. discard tokens that are one character or shorter after stripping;
    5. lemmatize with ``WordNetLemmatizer`` using the part of speech reported
       by ``get_part_of_speech``.

    Args:
        text: The raw text to normalize.

    Returns:
        The surviving lemmas as a list of strings. Order follows the input and
        repeated words are kept (the result is not de-duplicated).
    """
    # Build the stop-word set once: calling stopwords.words() inside the
    # per-token test would rebuild and linearly scan the list for every token.
    english_stop_words = set(stopwords.words('english'))
    non_letters = re.compile(r"[^A-Za-z]")
    lemmatizer = WordNetLemmatizer()

    words = []
    for token in word_tokenize(text):
        token = token.lower()
        if token in english_stop_words:
            continue
        # Stripping can only shorten a token, so a single length check after
        # cleaning covers tokens that were already too short beforehand.
        token = non_letters.sub("", token)
        if len(token) > 1:
            words.append(lemmatizer.lemmatize(token, get_part_of_speech(token)))
    return words

Measuring similarity

In [60]:
# from sklearn.feature_extraction.text import TfidfVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# # Initialize an instance of tf-idf Vectorizer
# tfidf_vectorizer = TfidfVectorizer()

# # Generate the tf-idf vectors for the corpus
# tfidf_matrix = tfidf_vectorizer.fit_transform(extracted_content)

# # compute and print the cosine similarity matrix
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
# print(cosine_sim)

In [8]:
# Spot-check the preprocessing pipeline on the first extracted article.
text_preparation(extracted_content[0])

['dog',
 'domesticate',
 'scientific',
 'classification',
 'kingdom',
 'animalia',
 'phylum',
 'chordata',
 'class',
 'mammalia',
 'order',
 'carnivora',
 'family',
 'canidae',
 'genus',
 'canis',
 'specie',
 'familiaris',
 'binomial',
 'name',
 'canis',
 'familiaris',
 'synonym',
 'list',
 'dog',
 'canis',
 'familiaris',
 'canis',
 'lupus',
 'familiaris',
 'domesticate',
 'descendant',
 'wolf',
 'also',
 'call',
 'domestic',
 'dog',
 'derive',
 'extinct',
 'pleistocene',
 'wolf',
 'modern',
 'wolf',
 'dog',
 'near',
 'living',
 'relative',
 'dog',
 'first',
 'specie',
 'domesticate',
 'huntergatherers',
 'year',
 'ago',
 'development',
 'agriculture',
 'due',
 'long',
 'association',
 'human',
 'dog',
 'expand',
 'large',
 'number',
 'domestic',
 'individual',
 'gain',
 'ability',
 'thrive',
 'starchrich',
 'diet',
 'would',
 'inadequate',
 'canid',
 'dog',
 'selectively',
 'breed',
 'millennium',
 'various',
 'behavior',
 'sensory',
 'capability',
 'physical',
 'attribute',
 'dog',
 

In [8]:
# jaccard similarity
def jaccard_similarity(text, compare_to) -> float:
    """Return the Jaccard similarity of two collections.

    Both arguments are converted to sets and the score is
    ``len(intersection) / len(union)``, a value between 0.0 and 1.0.

    Args:
        text: Any iterable of hashable items. In this notebook it is a list of
            tokens; a plain string also works but is compared character by
            character.
        compare_to: The iterable to compare ``text`` against, same rules.

    Returns:
        The similarity score. When both inputs are empty the union is empty
        and 0.0 is returned instead of raising ``ZeroDivisionError``.
    """
    a, b = set(text), set(compare_to)
    union = a | b
    if not union:
        # Nothing to compare on either side: report "no overlap".
        return 0.0
    return len(a & b) / len(union)

In [9]:
# Normalize the reference passage once; it is identical for every article,
# so there is no need to re-tokenize and re-lemmatize it on each iteration.
text_tokens = text_preparation(text)
for article in extracted_content:
    # One Jaccard score per extracted article, printed in search-result order.
    print(jaccard_similarity(text_tokens, text_preparation(article)))

0.015078821110349555
0.058315334773218146
0.11180124223602485
0.007575757575757576
0.0136986301369863
0.0625
0.022916666666666665
0.007462686567164179
