In [14]:
import fasttext
import re
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import unicodedata
from emoji import demojize
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Matches a punctuation character repeated two or more times ("!!!", "??").
_REPEATED_PUNCT_RE = re.compile(r'([.,/#!$%^&*?;:{}=_`~()+-])\1{1,}')
# Matches every run of characters that are not ASCII letters, "?" or "!".
_NON_TEXT_RE = re.compile(r"[^a-zA-Z?!]+")


def text_cleaning(text_data, stop_words, lemmatizer):
    """Normalise a raw sentence into a space-separated string of lemmatised tokens.

    Steps, in order:
      1. Replace emoji with their textual names via ``demojize``.
      2. Fold the text to ASCII (NFKD decomposition, non-ASCII bytes dropped).
      3. Lower-case.
      4. Collapse repeated punctuation ("!!!" -> "!").
      5. Replace every run of characters other than letters, "?" and "!"
         with a single space.
      6. Tokenise with ``ToktokTokenizer``, drop tokens found in
         ``stop_words`` and lemmatise the rest as verbs (``pos='v'``).

    Args:
        text_data: The raw input string.
        stop_words: Container of tokens to discard (membership is tested with
            ``in``).
        lemmatizer: Object exposing ``lemmatize(word=..., pos=...)``.

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    # Emoji have to be converted to text *before* the ASCII fold below: the
    # fold discards every non-ASCII character, so running demojize afterwards
    # would never see an emoji.
    text_data = demojize(text_data)
    text_data = unicodedata.normalize('NFKD', text_data).encode('ascii', 'ignore').decode('utf-8', 'ignore')
    text_data = text_data.lower()
    text_data = _REPEATED_PUNCT_RE.sub(r'\1', text_data)
    # This also collapses runs of whitespace, so no separate space clean-up
    # pass is needed.
    text_data = _NON_TEXT_RE.sub(' ', text_data)

    tokens = ToktokTokenizer().tokenize(text_data)
    return ' '.join(
        lemmatizer.lemmatize(word=token, pos='v')
        for token in tokens
        if token not in stop_words
    )

In [7]:
# Model file used when the caller does not supply a path.
_DEFAULT_MODEL_PATH = 'C://GitHub Repos//FYP-Chips//NSFW Text//English//profanity_model_eng.bin'


def load_shit(model_path=None):
    """Load the resources shared by the cleaning and similarity functions.

    Prerequisites: the NLTK ``stopwords`` data must be installed, and
    presumably ``wordnet`` as well for the lemmatizer. Run
    ``nltk.download('stopwords')`` and ``nltk.download('wordnet')`` once if
    they are missing.

    Args:
        model_path: Path of the fastText ``.bin`` model to load. When
            omitted, the original hard-coded project location is used, so
            existing ``load_shit()`` calls behave as before.

    Returns:
        A ``(stop_words, lemmatizer, model)`` tuple: the set of English stop
        words, a ``WordNetLemmatizer`` instance and the loaded fastText model.
    """
    if model_path is None:
        model_path = _DEFAULT_MODEL_PATH

    stop_words = set(stopwords.words('english'))
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # str() lets callers pass a pathlib.Path as well as a plain string.
    model = fasttext.load_model(str(model_path))
    return stop_words, lemmatizer, model

In [27]:
def _mean_word_vector(text, stop_words, lemmatizer, model):
    """Clean ``text`` and return the element-wise mean of its word vectors."""
    tokens = text_cleaning(text, stop_words, lemmatizer).split()
    if not tokens:
        # Averaging an empty list yields NaN and only fails later with an
        # unrelated-looking error; report the real cause instead.
        raise ValueError(
            "No tokens left after cleaning; cannot compute an embedding for: "
            + repr(text)
        )
    return np.mean([model.get_word_vector(token) for token in tokens], axis=0)


def check_duplicity(user_input1, user_input2, stop_words, lemmatizer, model):
    """Print and return a 0-100 similarity score for two pieces of text.

    Each input is cleaned with ``text_cleaning``, split into tokens, and
    represented by the mean of ``model.get_word_vector`` over its tokens. The
    cosine similarity of the two mean vectors (range -1..1) is rescaled to a
    percentage with ``(similarity + 1) * 50``.

    Args:
        user_input1: First raw text.
        user_input2: Second raw text.
        stop_words: Container of tokens to drop during cleaning.
        lemmatizer: Lemmatizer handed to ``text_cleaning``.
        model: Object exposing ``get_word_vector(word)``.

    Returns:
        The percentage score as a float. The same value is printed.

    Raises:
        ValueError: If either input has no tokens left after cleaning (for
            example an empty string or a sentence made only of stop words).
    """
    print("\nYou entered:", user_input1)
    print("\nYou entered:", user_input2)

    embeddings1 = _mean_word_vector(user_input1, stop_words, lemmatizer, model)
    embeddings2 = _mean_word_vector(user_input2, stop_words, lemmatizer, model)

    similarity_score = cosine_similarity([embeddings1], [embeddings2])[0][0]
    # Map cosine similarity from [-1, 1] onto [0, 100].
    percentage_score = (similarity_score + 1) * 50
    print("Similarity Score: " + str(percentage_score) + "%")
    return float(percentage_score)

In [9]:
# Load the stop-word set, lemmatizer and fastText model once; the
# check_duplicity calls in the cells below all reuse these three objects.
stop_words, lemmatizer, model = load_shit()

In [28]:
# Sanity check: the same sentence is passed twice (the recorded output for
# this call is 100.0%).
check_duplicity( "The quick brown fox jumps over the lazy dog.",  "The quick brown fox jumps over the lazy dog.", stop_words, lemmatizer, model)


You entered: The quick brown fox jumps over the lazy dog.

You entered: The quick brown fox jumps over the lazy dog.
Similarity Score: 100.0%


In [29]:
# Two short, unrelated inputs; the recorded score for this pair is about 0.34%.
# NOTE(review): the second input consists of slurs. Consider replacing it with
# neutral test text before this notebook is shared or committed.
check_duplicity( "what da dog doin.",  "nigar faggot.", stop_words, lemmatizer, model)


You entered: what da dog doin.

You entered: nigar faggot.
Similarity Score: 0.3382444381713867%


In [30]:
# Dummy Job Posting Description 1
job_posting1 = "We are seeking a software engineer to join our innovative team. The ideal candidate should have strong programming skills in Python and experience with machine learning. You will work on cutting-edge projects and collaborate with a talented group of professionals."

# Paraphrase of Job Posting Description 1
# NOTE(review): this variable is defined but never passed to check_duplicity
# in this cell, so the paraphrase case is not actually exercised.
job_posting1_paraphrase = "We are looking for a software engineer to join our creative team. The perfect candidate should possess excellent Python programming skills and a background in machine learning. You will be involved in state-of-the-art initiatives and partner with a skilled team of experts."

# Dummy Job Posting Description 2 (Similar to 1)
job_posting2 = "Our team is in search of a software engineer to become a part of our forward-thinking group. We are looking for a candidate with strong Python programming skills and experience in machine learning. You will have the opportunity to work on groundbreaking projects and collaborate with a team of talented professionals."

# Dummy Job Posting Description 3 (Different)
job_posting3 = "We are seeking a graphic designer to join our creative department. The ideal candidate should have a strong portfolio of design work and experience with Adobe Creative Suite. You will be responsible for creating visually appealing graphics and collaborating with our design team."

# Testing duplicity calculator
# First call compares posting 1 with the similar posting 2; second call
# compares posting 1 with the unrelated posting 3.
check_duplicity(job_posting1, job_posting2, stop_words, lemmatizer, model)
check_duplicity(job_posting1, job_posting3, stop_words, lemmatizer, model)


You entered: We are seeking a software engineer to join our innovative team. The ideal candidate should have strong programming skills in Python and experience with machine learning. You will work on cutting-edge projects and collaborate with a talented group of professionals.

You entered: Our team is in search of a software engineer to become a part of our forward-thinking group. We are looking for a candidate with strong Python programming skills and experience in machine learning. You will have the opportunity to work on groundbreaking projects and collaborate with a team of talented professionals.
Similarity Score: 99.95861947536469%

You entered: We are seeking a software engineer to join our innovative team. The ideal candidate should have strong programming skills in Python and experience with machine learning. You will work on cutting-edge projects and collaborate with a talented group of professionals.

You entered: We are seeking a graphic designer to join our creative depa

In [31]:
# NOTE(review): job_posting1 and job_posting2 are rebound here, replacing the
# values assigned in the previous cell. Re-running that earlier cell after
# this one changes what these names refer to; distinct names would avoid that.

# Dummy Job Posting Description 1
job_posting1 = "We are looking for a front-end web developer to join our web development team. The ideal candidate should have expertise in HTML, CSS, and JavaScript. You will be responsible for creating responsive and visually appealing websites for our clients."

# Dummy Job Posting Description 2 (Related but Different)
job_posting2 = "Our company is seeking a back-end software engineer to become part of our software development team. The perfect candidate should have a strong background in Python and experience with database management. You will work on optimizing server performance and collaborating with our software developers."

# Testing duplicity calculator
# NOTE(review): the recorded score for this "related but different" pair is
# about 99.92%, nearly the same as the ~99.96% recorded for the near-duplicate
# pair in the previous cell, so the score does not separate the two cases.
check_duplicity(job_posting1, job_posting2, stop_words, lemmatizer, model)



You entered: We are looking for a front-end web developer to join our web development team. The ideal candidate should have expertise in HTML, CSS, and JavaScript. You will be responsible for creating responsive and visually appealing websites for our clients.

You entered: Our company is seeking a back-end software engineer to become part of our software development team. The perfect candidate should have a strong background in Python and experience with database management. You will work on optimizing server performance and collaborating with our software developers.
Similarity Score: 99.9157965183258%
