In [1]:
import ctypes
import hashlib

import numpy as np
from scipy.spatial.distance import hamming, euclidean
from scipy.stats import pearsonr
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Feature extraction: keep only the meaningful words of each document (English stop words are dropped).
# TF-IDF is used instead of CountVectorizer because it also weights every word by its importance.

def extract_features(documents):
    """Turn raw documents into TF-IDF weight vectors.

    English stop words are removed by the vectorizer before weighting.
    The learned vocabulary is printed as a side effect.

    Parameters
    ----------
    documents : iterable of str
        The texts to vectorize.

    Returns
    -------
    tuple
        ``(weights, vocabulary)`` where ``weights`` is the dense array
        produced by ``fit_transform(documents).toarray()`` and
        ``vocabulary`` is the array returned by ``get_feature_names_out()``.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    weight_matrix = tfidf.fit_transform(documents).toarray()
    vocabulary = tfidf.get_feature_names_out()

    # Show the extracted vocabulary, followed by a blank line.
    print(vocabulary, "\n")

    return weight_matrix, vocabulary

In [3]:
#Calculating simhash fingerprints

def simhash_fingerprint(features, weights, f=64):
    """Compute the f-bit SimHash fingerprint of one weighted feature vector.

    Every feature is hashed to an f-bit value. For each bit position the
    feature's weight is added to an accumulator when the hash bit is 1 and
    subtracted when it is 0. The fingerprint carries a '1' wherever the
    accumulated value is non-negative and a '0' elsewhere.

    The per-feature hash comes from SHAKE-128 instead of the built-in
    ``hash()``: string hashes from ``hash()`` are salted per interpreter
    process, so fingerprints would change on every kernel restart, and
    their width depends on the platform. SHAKE-128 also yields exactly as
    many bits as ``f`` asks for, so ``f`` is honoured for any positive size.

    Parameters
    ----------
    features : iterable
        Feature labels; each one is converted with ``str()`` and encoded
        as UTF-8 before hashing.
    weights : iterable of float
        Weight of each feature, paired positionally with ``features``
        (pairing stops at the shorter of the two).
    f : int, optional
        Number of bits in the fingerprint (default 64).

    Returns
    -------
    str
        A string of ``f`` characters, each '0' or '1'. With no features
        (or only zero weights) every accumulator stays at 0, which yields
        a fingerprint of all '1's.

    Raises
    ------
    ValueError
        If ``f`` is smaller than 1.
    """
    if f < 1:
        raise ValueError("f must be a positive number of bits")

    V = np.zeros(f)
    digest_size = (f + 7) // 8          # whole bytes needed to hold f bits
    surplus_bits = digest_size * 8 - f  # low bits to drop when f is not a multiple of 8

    for feature, weight in zip(features, weights):
        # A zero weight adds nothing to any accumulator, so skip the hashing.
        if weight == 0:
            continue

        # Deterministic f-bit hash of the feature
        digest = hashlib.shake_128(str(feature).encode("utf-8")).digest(digest_size)
        h = int.from_bytes(digest, "big") >> surplus_bits

        # Left-pad with zeros so every hash is exactly f bits long
        bits = format(h, "b").zfill(f)

        # +weight where the bit is 1, -weight where the bit is 0
        signs = np.array([1.0 if bit == "1" else -1.0 for bit in bits])
        V += weight * signs

    # Collapse the accumulators into the final fingerprint
    return ''.join('1' if v >= 0 else '0' for v in V)

In [4]:
def hamming_distance(fingerprint1, fingerprint2):
    """Return the fraction of positions at which two fingerprints differ.

    Both fingerprints are split into their individual characters and
    compared with SciPy's ``hamming``; the result is rounded to three
    decimal places.
    """
    symbols_a = [*fingerprint1]
    symbols_b = [*fingerprint2]
    mismatch_fraction = hamming(symbols_a, symbols_b)
    return round(mismatch_fraction, 3)

def euclidean_distance(counts1, counts2):
    """Return the Euclidean distance between two vectors, rounded to three decimals."""
    straight_line = euclidean(counts1, counts2)
    return round(straight_line, 3)

def compute_pearson_correlation(distances1, distances2):
    """Return the Pearson correlation coefficient of two sequences, rounded to three decimals.

    Only the coefficient is returned; the p-value reported by ``pearsonr``
    is discarded.
    """
    coefficient, _p_value = pearsonr(distances1, distances2)
    return round(coefficient, 3)

In [5]:
if __name__ == "__main__":

    # Test corpus taken from the sample code provided with the question:
    # 15 tweets retrieved with "Omicron" as the query.
    documents = ["DrCChambers RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19,\
    CIHR is extending the registration and application de…",
    "sealsoftheend Japan's Kowa says Ivermectin showed 'antiviral effect' against Omicron https://t.co/mKKY24WeQV",
    "SVictor70973566 RT @EricTopol: Why is Omicron so hyper-transmissible? It's not related to high viral load in\
    the upper airway, as shown by 2 recent studies…",
    "freethinkfacts RT @yaneerbaryam: Actual cases of reinfection by Omicron are so widespread they are manifest to\
    anyone who is not closing their eyes: 10/…",
    "lsoril RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending\
    the registration and application de…",
    "pompey1977 RT @AllisonPearson: How can people not get it? Omicron’s advantage over Delta is it evades the vaccine.\
    Everyone is going to get Omicron…",
    "freethinkfacts RT @yaneerbaryam: Taken together, our results suggest that Omicron-induced immunity may not be\
    sufficient to prevent infection from anothe…",
    "SteveBennett15 RT @EricTopol: Anyone who thinks that vaccines aren't working against Omicron might want to review\
    the data https://t.co/9bHYdKxz8u https:/…",
    "wasohope RT @ASTERHealthcare: Omicron covid-19 variant was reported from South Africa on November 2021.\
    This variant has had many mutations that aff…",
    "SVictor70973566 RT @MdFacep: @EricTopol @maxdkozlov Omicron's impact is in its ability to evade our immune system:\
    Our 'older' vaccine produced NABS fail…",
    "Wildantlers @melulater Of course it did, Omicron spreads far quicker. But as a % of people who die from omicron\
    it is far milde… https://t.co/eDFuW5qjAH",
    "peterandann RT @EnemyInAState: Omicron, London: Babies and toddlers continue to surge, and 1 in 9 admitted\
    is a child. Over 668 babies and toddlers hav…",
    "DrMroz RT @CIHR_IRSC: In light of the disruptions caused by the Omicron variant of #Covid19, CIHR is extending\
    the registration and application de…",
    "Deis85208721 CORRECTED-Japan's Kowa says ivermectin showed 'antiviral effect' against Omicron in research\
    https://t.co/VEoQyz5x6F",
    "freethinkfacts RT @yaneerbaryam: Thus, breakthrough infection from Omicron may enhance cross-protection\
    against Delta, and vice-versa, [only] inasmuch as…"]

    # TF-IDF weights for every document, plus the vocabulary they refer to
    counts, features = extract_features(documents)

    # One simhash fingerprint per document, built from that document's weights
    fingerprints = [simhash_fingerprint(features, doc_weights) for doc_weights in counts]

    # Every unordered pair of distinct documents, in (first, second) index order
    doc_pairs = [
        (first, second)
        for first in range(len(documents))
        for second in range(first + 1, len(documents))
    ]

    # Hamming distance between fingerprints and Euclidean distance between
    # TF-IDF vectors, computed for the same sequence of document pairs
    hamming_distances = [
        hamming_distance(fingerprints[first], fingerprints[second])
        for first, second in doc_pairs
    ]
    euclidean_distances = [
        euclidean_distance(counts[first], counts[second])
        for first, second in doc_pairs
    ]

    print("Hamming Distances: \n", hamming_distances, '\n')
    print("Euclidean Distances: \n", euclidean_distances, '\n')

    # How closely the fingerprint distances track the vector distances
    correlation = compute_pearson_correlation(hamming_distances, euclidean_distances)
    print("Pearson Correlation:", correlation)
    

['10' '19' '2021' '668' '9bhydkxz8u' 'ability' 'actual' 'admitted'
 'advantage' 'aff' 'africa' 'airway' 'allisonpearson' 'anothe' 'antiviral'
 'application' 'aren' 'asterhealthcare' 'babies' 'breakthrough' 'cases'
 'caused' 'child' 'cihr' 'cihr_irsc' 'closing' 'continue' 'corrected'
 'course' 'covid' 'covid19' 'cross' 'data' 'deis85208721' 'delta' 'did'
 'die' 'disruptions' 'drcchambers' 'drmroz' 'edfuw5qjah' 'effect'
 'enemyinastate' 'enhance' 'erictopol' 'evade' 'evades' 'extending' 'eyes'
 'fail' 'far' 'freethinkfacts' 'going' 'hav' 'high' 'https' 'hyper'
 'immune' 'immunity' 'impact' 'inasmuch' 'induced' 'infection'
 'ivermectin' 'japan' 'kowa' 'light' 'load' 'london' 'lsoril' 'manifest'
 'maxdkozlov' 'mdfacep' 'melulater' 'milde' 'mkky24weqv' 'mutations'
 'nabs' 'november' 'older' 'omicron' 'people' 'peterandann' 'pompey1977'
 'prevent' 'produced' 'protection' 'quicker' 'recent' 'registration'
 'reinfection' 'related' 'reported' 'research' 'results' 'review' 'rt'
 'says' 'sealsoft