In [52]:
# !conda info || true
# !python --version
# # install and add to requirements.txt
# !touch requirements.txt
# !echo -n > requirements.txt

# # add dependency in requirements.txt
# !echo typing >> requirements.txt
# !echo pydantic >> requirements.txt
# !echo stringcase >> requirements.txt
# !echo contractions >> requirements.txt
# !echo sentence-transformers >> requirements.txt # https://sbert.net/docs/quickstart.html
# !echo langchain[all] >> requirements.txt # https://langchain.readthedocs.io/en/latest/getting_started/getting_started.html

# !time pip install -r requirements.txt
# !pip freeze > requirements.frozen.tmp.txt
# !cat requirements.frozen.tmp.txt | grep -v "file://" > requirements.frozen.txt 
# !rm -f requirements.frozen.tmp.txt


     active environment : None
       user config file : /home/jovyan/.condarc
 populated config files : /opt/conda/.condarc
          conda version : 22.11.1
    conda-build version : not installed
         python version : 3.10.8.final.0
       virtual packages : __archspec=1=x86_64
                          __glibc=2.35=0
                          __linux=5.15.49=0
                          __unix=0=0
       base environment : /opt/conda  (writable)
      conda av data dir : /opt/conda/etc/conda
  conda av metadata url : None
           channel URLs : https://conda.anaconda.org/conda-forge/linux-64
                          https://conda.anaconda.org/conda-forge/noarch
          package cache : /opt/conda/pkgs
                          /home/jovyan/.conda/pkgs
       envs directories : /opt/conda/envs
                          /home/jovyan/.conda/envs
               platform : linux-64
             user-agent : conda/22.11.1 requests/2.28.2 CPython/3.10.8 Linux/5.15.49-linuxkit ubu

In [2]:
# Create the data_processing/ directory (mkdir -p: no error if it already exists)
# and make sure it contains an __init__.py, so the modules written by the
# %%writefile cells below can be imported as `data_processing.<module>`.
!mkdir -p data_processing
!touch data_processing/__init__.py

In [3]:
%%writefile data_processing/data_processing.py


def hello():
    """Placeholder function: takes no arguments, does nothing, returns ``None``."""
    return None

Overwriting data_processing/data_processing.py


In [53]:
%%writefile data_processing/tokanizer.py
import nltk
from typing import List, Union, NewType

# Request the 'punkt' resource from NLTK's downloader every time this module
# is imported (presumably the data the tokenizers below rely on - TODO confirm).
nltk.download('punkt')

# Static-typing aliases for a single token and for a list of tokens.
# NOTE: typing.NewType is only meaningful to type checkers; at runtime
# TextToken(x) / TextTokenList(x) simply return x unchanged, without validation.
TextToken = NewType('TextToken', str)
TextTokenList = NewType('TextTokenList', List[TextToken])

class Tokanizer:
    """Static convenience wrappers around NLTK's word and sentence tokenizers.

    The public names (``Tokanizer``, ``word_tokanize``, ``sentance_tokenize``)
    keep their existing spelling because other cells import and call them.
    """

    @staticmethod
    def word_tokanize(data: str, preserve_line: bool = False) -> List[str]:
        """Split ``data`` into word tokens via ``nltk.tokenize.word_tokenize``.

        Args:
            data: The text to tokenize.
            preserve_line: Passed through unchanged as NLTK's ``preserve_line``
                keyword argument.

        Returns:
            The token list returned by NLTK.
        """
        return nltk.tokenize.word_tokenize(data, preserve_line=preserve_line)

    @staticmethod
    def sentance_tokenize(data: str) -> List[str]:
        """Split ``data`` into sentences and normalise their whitespace.

        Args:
            data: The text to split with ``nltk.sent_tokenize``.

        Returns:
            One string per sentence, with leading/trailing whitespace removed
            and every internal whitespace run (spaces, tabs, newlines)
            collapsed to a single space.
        """
        # str.split() without arguments already splits on any whitespace run,
        # newlines included, so no separate newline-replacement pass is needed.
        return [" ".join(sentence.split()) for sentence in nltk.sent_tokenize(data)]

Overwriting data_processing/tokanizer.py


In [49]:
%%writefile data_processing/case_converter.py
from typing import List, Union
import stringcase


class CaseConverter:
    """Case conversion helpers that accept either one string or a list of strings."""

    @staticmethod
    def to_lowercase(data: Union[List[str], str]) -> Union[List[str], str]:
        """Lower-case ``data`` with ``stringcase.lowercase``.

        A list input is converted element by element and returned as a new
        list; any other input is converted directly.
        """
        if not isinstance(data, list):
            return stringcase.lowercase(data)
        return list(map(stringcase.lowercase, data))

    @staticmethod
    def to_uppercase(data: Union[List[str], str]) -> Union[List[str], str]:
        """Upper-case ``data`` with ``stringcase.uppercase``.

        A list input is converted element by element and returned as a new
        list; any other input is converted directly.
        """
        if not isinstance(data, list):
            return stringcase.uppercase(data)
        return list(map(stringcase.uppercase, data))


Writing data_processing/case_converter.py


In [49]:
%%writefile data_processing/stop_word_remover.py
from typing import List, Union
import nltk
# The stop-word corpus has to be available locally before it can be read;
# request it at import time, the same way tokanizer.py requests 'punkt'.
nltk.download('stopwords', quiet=True)

# English stop words, minus the negations 'no' and 'not', which therefore stay
# in the text (presumably because dropping them would invert a sentence's meaning).
stopword_list = nltk.corpus.stopwords.words('english')
stopword_list.remove('no')
stopword_list.remove('not')


class StopWordRemover:
    """Removes the words listed in ``stopword_list`` from tokens or raw text."""

    @staticmethod
    def strip(data: Union[List[str], str]) -> Union[List[str], str]:
        """Drop stop words from ``data``.

        Args:
            data: Either a list of tokens, or a string. A string is split on
                whitespace into words before filtering.

        Returns:
            For a list: a new list holding the tokens that are not stop words,
            in their original order and original casing.
            For a string: the remaining words joined with single spaces
            (so whitespace runs in the input are normalised).

        Note:
            A token is compared in lower case against ``stopword_list`` as a
            whole, so a word with punctuation attached (e.g. ``"the,"``) is not
            recognised as a stop word; tokenize the text first if that matters.
        """
        # Built per call so later edits to the module-level list are honoured,
        # while still giving constant-time membership tests inside the loop.
        stopwords = set(stopword_list)

        if isinstance(data, list):
            return [token for token in data if token.lower() not in stopwords]

        return " ".join(word for word in data.split() if word.lower() not in stopwords)


Writing data_processing/case_converter.py


In [57]:
%reload_ext autoreload
%autoreload 2
from data_processing.data_processing import hello
from data_processing.tokanizer import Tokanizer, TextToken, TextTokenList
from data_processing.case_converter import CaseConverter

from sentence_transformers import SentenceTransformer
# Instantiate the sentence-embedding model identified by 'all-MiniLM-L6-v2'.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Sample inputs used by the tokenizer and embedding demonstrations below.
# NOTE(review): sentence_text is not referenced again in this cell.
sentence_text = 'This page documents the properties and methods when you load a SentenceTransformer model'

# Three single-sentence strings; these are the inputs passed to model.encode().
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.', 
    'The quick brown fox jumps over the lazy dog.'
]


# One text containing both "\n" escapes and a literal line break, used to show
# how the tokenizers treat newlines and repeated spaces.
multiline_text ='''Good muffins cost $3.88\nin New York.  Please buy me 
two of them.\n\nThanks.'''

# Multi-sentence samples; the continuation lines inside each triple-quoted
# string are indented, so that indentation is part of the text itself.
multiline_data = [
    '''Good muffins cost $3.88\nin New York.  Please buy me 
    two of them.\n\nThanks.''',
    
    '''This method allows to run encode() on multiple GPUs . 
    The sentences are chunked into smaller   packages \n\nand sent to individual processes , 
    which encode these on the different GPUs. 
    This method is only suitable for encoding large sets of sentences .
    '''
]


# Encode the sample sentences; the result is later zipped with `sentences`,
# i.e. it is consumed as one embedding per input sentence.
sentence_embeddings = model.encode(sentences)

# --- Word tokens: the first plain sentence (default line handling), then the
# --- multi-line sample with preserve_line=True.
print("===== Word Tokens")
for sample_text, keep_lines in ((sentences[0], False), (multiline_text, True)):
    word_tokens = Tokanizer.word_tokanize(sample_text, preserve_line=keep_lines)
    print("Word Tokens:", word_tokens)
    print("Lowercase Word Tokens:", CaseConverter.to_lowercase(word_tokens))

print("")

# --- Sentence tokens for the same two samples.
print("===== Sentence Tokens")
for sample_text in (sentences[0], multiline_text):
    print("Sentence Tokens:", Tokanizer.sentance_tokenize(sample_text))
print("")

# --- Word and sentence tokens for every multi-sentence sample.
print("===== Data Tokens")
for paragraph in multiline_data:
    print("Word Tokens:", Tokanizer.word_tokanize(paragraph))
    print("Sentence Tokens:", Tokanizer.sentance_tokenize(paragraph))
    print("")

# --- Each sentence next to the length of its embedding and its tokens.
print("===== Sentence Embeddings")
for sentence, embedding in zip(sentences, sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", len(embedding))
    print("Tokens:", Tokanizer.word_tokanize(sentence))
    print("Tokens:", Tokanizer.sentance_tokenize(sentence))
    print("")

# --- NewType aliases: print the runtime types of the wrapped values.
# NOTE(review): the list below contains the integer 1, which is not a
# TextToken; NewType performs no runtime validation, so no error is raised here.
a = TextToken("hello")
b = TextTokenList(["hello", "there", 1])
print(type(a), type(b))

===== Word Tokens
Word Tokens: ['This', 'framework', 'generates', 'embeddings', 'for', 'each', 'input', 'sentence']
Lowercase Word Tokens: ['this', 'framework', 'generates', 'embeddings', 'for', 'each', 'input', 'sentence']
Word Tokens: ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York.', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']
Lowercase Word Tokens: ['good', 'muffins', 'cost', '$', '3.88', 'in', 'new', 'york.', 'please', 'buy', 'me', 'two', 'of', 'them.', 'thanks', '.']

===== Sentence Tokens
Sentence Tokens: ['This framework generates embeddings for each input sentence']
Sentence Tokens: ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.', 'Thanks.']

===== Data Tokens
Word Tokens: ['Good', 'muffins', 'cost', '$', '3.88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']
Sentence Tokens: ['Good muffins cost $3.88 in New York.', 'Please buy me two of them.', 'Thanks.']

Word Tokens: ['This', 'metho

In [6]:
from sentence_transformers import SentenceTransformer, util
# Instantiate the 'all-MiniLM-L6-v2' model for this stand-alone example.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Encode the two sentences one at a time, in order.
sentence_pair = ("This is a red cat with a hat.", "Have you seen my red cat?")
emb1, emb2 = (model.encode(text) for text in sentence_pair)

# Compare the two embeddings and show the score.
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.6153]])
