In [1]:
!pip install textstat
!pip install bert_score

Collecting textstat
  Downloading textstat-0.7.5-py3-none-any.whl.metadata (15 kB)
Collecting pyphen (from textstat)
  Downloading pyphen-0.17.2-py3-none-any.whl.metadata (3.2 kB)
Collecting cmudict (from textstat)
  Downloading cmudict-1.0.32-py3-none-any.whl.metadata (3.6 kB)
Downloading textstat-0.7.5-py3-none-any.whl (105 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m105.3/105.3 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading cmudict-1.0.32-py3-none-any.whl (939 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m939.4/939.4 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyphen-0.17.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m17.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyphen, cmudict, textstat
Successfully installed cmudict-1.0.32 pyphen-0.17.2 textstat-0.7.5


In [24]:
import nltk
nltk.download('punkt_tab')
import json
import spacy
import textstat

def unique_tokens(text, tokenizer=nltk.TweetTokenizer().tokenize):
    """
    Return the distinct tokens found in ``text``.

    Args:
        text: The string to tokenize.
        tokenizer: Callable that maps a string to an iterable of hashable
            tokens. The default is the ``tokenize`` method of an
            ``nltk.TweetTokenizer`` instance created once, when this
            function is defined.

    Returns:
        set: Every token produced by ``tokenizer`` with duplicates removed.
        No case folding is applied, so "To" and "to" are kept as two
        separate entries.
    """
    return {tok for tok in tokenizer(text)}

# Loaded spaCy pipelines keyed by model name, so the slow model load happens
# once per kernel session instead of once per calculate_metrics() call.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it only on first use."""
    if model_name not in _SPACY_PIPELINES:
        _SPACY_PIPELINES[model_name] = spacy.load(model_name)
    return _SPACY_PIPELINES[model_name]


def calculate_metrics(text, nlp=None):
    """
    Calculates WC (Word Count), TTR (Type-Token Ratio),
    ADD (Average Dependency Distance), and FKG (Flesch-Kincaid Grade Level).

    Args:
        text: The string to analyse.
        nlp: Optional, already-loaded spaCy pipeline used for the dependency
            parse. When omitted, "en_core_web_sm" is loaded once and reused
            on later calls.

    Returns:
        dict with the keys:
            "WC":  number of tokens returned by ``nltk.word_tokenize``.
            "TTR": number of distinct lower-cased tokens divided by WC
                   (0 when there are no tokens).
            "ADD": mean of ``abs(token.i - token.head.i)`` over every spaCy
                   token that is not its own head (0 when there are none).
            "FKG": value returned by ``textstat.flesch_kincaid_grade``.

    Note:
        ``nltk.word_tokenize`` emits punctuation marks and split clitics
        (e.g. "it's" -> "it", "'s") as separate tokens, so WC and TTR count
        those as well as ordinary words.
        NOTE(review): confirm this is the intended definition of "word".
    """
    if nlp is None:
        nlp = _get_spacy_pipeline()

    # Lexical metrics: token count and type-token ratio (case-insensitive types).
    tokens = nltk.word_tokenize(text)
    word_count = len(tokens)
    unique_word_count = len({token.lower() for token in tokens})
    ttr = unique_word_count / word_count if word_count > 0 else 0

    # Dependency parsing: distance in token positions between each token and
    # its syntactic head. Tokens that are their own head (roots) are skipped.
    doc = nlp(text)
    dependencies = [abs(token.i - token.head.i) for token in doc if token.head.i != token.i]
    add = sum(dependencies) / len(dependencies) if dependencies else 0

    # Readability score
    fkg = textstat.flesch_kincaid_grade(text)

    return {"WC": word_count, "TTR": ttr, "ADD": add, "FKG": fkg}

# Example usage
if __name__ == "__main__":
    # Demo: run both helpers on one sample paragraph and show what they return.
    text = "To make the Physical Internet work worldwide, it's important to set global standards and build the needed infrastructure. Governments should support the use of digital and automation technologies, working with the logistics industry to create efficient networks. At the same time, policies that focus on the environment and sustainability should be put in place, along with rewards to encourage businesses to invest."

    token_set = unique_tokens(text)
    print("Unique Tokens:", token_set)

    metrics = calculate_metrics(text)
    print("Metrics:", metrics)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Unique Tokens: {'working', 'efficient', ',', 'policies', 'At', 'To', 'Physical', 'Governments', 'make', 'same', 'the', 'with', 'needed', 'digital', 'networks', "it's", 'automation', 'environment', 'businesses', 'on', 'worldwide', 'build', 'create', 'work', 'be', 'should', 'standards', 'global', 'that', 'infrastructure', 'support', 'put', 'time', 'in', 'industry', 'focus', '.', 'along', 'important', 'of', 'to', 'set', 'sustainability', 'rewards', 'use', 'invest', 'technologies', 'Internet', 'encourage', 'and', 'place', 'logistics'}
Metrics: {'WC': 70, 'TTR': 0.7428571428571429, 'ADD': 3.1044776119402986, 'FKG': 13.7}


In [None]:
# Cosine Similarity
# Embeds two answer texts with a sentence-embedding model and prints the
# cosine similarity between the two embeddings.
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Text
# NOTE(review): both inputs are multi-sentence paragraphs; confirm they fit
# within the model's maximum sequence length, otherwise the tail of each text
# would not contribute to the embedding.
sentence1 = "While the ALICE and Japan roadmaps lay a solid foundation for the Physical Internet (PI), several research gaps remain. Key areas include the development of universal standards for interoperability, secure data-sharing mechanisms, and more robust sustainability metrics. There is also limited research on decentralized logistics models, which could improve resilience. Emerging technologies like IoT, AI, and blockchain need further exploration for integration into the PI framework. Additionally, workforce impacts, including job displacement and retraining, are insufficiently addressed. Lastly, economic models and funding mechanisms for PI adoption require more in-depth study."
sentence2 = "The areas lacking in research focusing on the physical internet, compared to the ALICE and Japan roadmaps, include insufficient exploration of container development, hub inventory management, and the integration of Internet of Things (IoT) technologies in risk management within prefabricated construction. Additionally, there is a need for deeper analysis of protocols and methods for resilience and efficiency in transport concepts, as well as a more comprehensive understanding of the functionalities and attributes of road-based physical internet systems."

embeddings1 = model.encode(sentence1, convert_to_tensor=True)
embeddings2 = model.encode(sentence2, convert_to_tensor=True)

# util.cos_sim is the current name of the pairwise cosine-similarity helper
# (pytorch_cos_sim is its legacy alias). The result is indexed as a matrix;
# with one text on each side the single score sits at [0][0].
cosine_score = util.cos_sim(embeddings1, embeddings2)[0][0]

print(f"Similarity between sentence 1 and sentence 2: {cosine_score}")

Similarity between sentence 1 and sentence 2: 0.6708195209503174


In [34]:
# BERTScore & SentenceBERT
# Compares two answer texts twice: token-level BERTScore (precision / recall /
# F1) and cosine similarity of whole-text SentenceBERT embeddings.
from sentence_transformers import SentenceTransformer
from bert_score import score
from sklearn.metrics.pairwise import cosine_similarity

# Text
sentence1 = "The Physical Internet (PI) is a global system to make logistics work like the digital internet. However, more research is needed in areas like standardization, sustainability, security, and using drones and autonomous vehicles. It's also important to study how it will affect society, the economy, and what rules are needed to make it work."
sentence2 = "The areas of research that are missing in the context of the Physical Internet include a comprehensive exploration of the strengths, risks, challenges, and potential barriers to implementation. Additionally, there is a lack of studies focusing on the integration of hyperconnected logistics networks, the operationalization of the Physical Internet using frameworks similar to the digital internet, and the development of robust business models to support its adoption. Further research is also needed on the stochasticity and resilience of hub location and network design in large-scale optimization problems."

# --- BERTScore ---
# sentence1 is passed as the candidate and sentence2 as the reference.
# NOTE(review): precision and recall swap if the roles are reversed — confirm
# which text is meant to be the reference.
P, R, F1 = score(cands=[sentence1], refs=[sentence2], lang='en')

# Each result holds one value per candidate/reference pair; only one pair here.
print(f"BERTScore Precision: {P[0]:.4f}")
print(f"BERTScore Recall: {R[0]:.4f}")
print(f"BERTScore F1: {F1[0]:.4f}")

# --- SentenceBERT cosine similarity ---
model_sentenceBERT = SentenceTransformer('all-MiniLM-L6-v2')

embedding1_sentenceBERT = model_sentenceBERT.encode(sentence1)
embedding2_sentenceBERT = model_sentenceBERT.encode(sentence2)

# cosine_similarity works on 2-D inputs (rows = samples), so each embedding is
# presented as a single-row matrix; the lone score is at position [0][0].
similarity_matrix = cosine_similarity(
    embedding1_sentenceBERT.reshape(1, -1),
    embedding2_sentenceBERT.reshape(1, -1),
)
cosine_sim_sentenceBERT = similarity_matrix[0][0]

print(f"Cosine Similarity (SentenceBERT): {cosine_sim_sentenceBERT:.4f}")

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore Precision: 0.8842
BERTScore Recall: 0.8681
BERTScore F1: 0.8761
Cosine Similarity (SentenceBERT): 0.5252
