## Finding Semantic Similarity

In [None]:
%%loadFromPOM
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-text</artifactId>
    <version>1.6</version>
</dependency>

In [None]:
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.text.similarity.CosineSimilarity;

In [None]:
// Shared scorer and sample sentences; later cells reuse these variables.
CosineSimilarity cosineSimilarity = new CosineSimilarity();
String firstSample = "A simple sentence";
String secondSample = "One simple sentence";

// Character-level vectors: split("") breaks each sentence into its pieces,
// and every piece is mapped to the number of times it occurs
// (Integer::sum adds up the counts of repeated pieces).
Map<CharSequence, Integer> vectorA = Arrays.stream(firstSample.split(""))
        .collect(Collectors.toMap(symbol -> symbol, symbol -> 1, Integer::sum));
Map<CharSequence, Integer> vectorB = Arrays.stream(secondSample.split(""))
        .collect(Collectors.toMap(symbol -> symbol, symbol -> 1, Integer::sum));

// Report the cosine similarity of the two count vectors with four decimals.
System.out.format("%5.4f\n", cosineSimilarity.cosineSimilarity(vectorA, vectorB));

In [None]:
// Without lambda: build the same kind of per-character count vector with a
// plain loop. vectorC is derived from secondSample and compared against the
// vectorA defined in an earlier cell.
HashMap<CharSequence, Integer> vectorC = new HashMap<>();
for (char character : secondSample.toCharArray()) {
    // Build the one-character key once instead of re-concatenating it for
    // every map access.
    String key = String.valueOf(character);
    // getOrDefault replaces the double lookup plus manual null check.
    vectorC.put(key, vectorC.getOrDefault(key, 0) + 1);
}

System.out.printf("%5.4f\n",cosineSimilarity.cosineSimilarity(vectorA, vectorC));


In [None]:
// Word-level comparison: rebuild both vectors from the space-separated tokens
// of the sample sentences, mapping each token to its occurrence count.
// This overwrites the vectorA/vectorB created in the earlier cell.
vectorA = Arrays.stream(firstSample.split(" "))
        .collect(Collectors.toMap(token -> token, token -> 1, Integer::sum));
vectorB = Arrays.stream(secondSample.split(" "))
        .collect(Collectors.toMap(token -> token, token -> 1, Integer::sum));

System.out.format("%5.4f\n", cosineSimilarity.cosineSimilarity(vectorA, vectorB));

In [None]:
/**
 * Prints the cosine similarity of two phrases, formatted with four decimals.
 *
 * Each phrase is broken up with {@code split("")}, so the comparison is made on
 * the individual pieces of the text rather than on whole words. Every piece is
 * counted, and the two count maps are passed to the {@code cosineSimilarity}
 * instance created in an earlier cell.
 *
 * @param phrase1 first phrase to compare
 * @param phrase2 second phrase to compare
 */
void calculateTextSimilarity(String phrase1, String phrase2) {
    // Occurrence counts for the first phrase.
    Map<CharSequence, Integer> firstCounts = new HashMap<>();
    for (String piece : phrase1.split("")) {
        firstCounts.merge(piece, 1, Integer::sum);
    }

    // Occurrence counts for the second phrase.
    Map<CharSequence, Integer> secondCounts = new HashMap<>();
    for (String piece : phrase2.split("")) {
        secondCounts.merge(piece, 1, Integer::sum);
    }

    System.out.format("%5.4f\n", cosineSimilarity.cosineSimilarity(firstCounts, secondCounts));
}

In [None]:
// Three sample phrases to compare against one another.
String first = "I kid you not this is amazing, right?";
String second = "I can understand this is awesome!";
String third = "Is this outright not nice?";

// Score every pairing, in the order (first, second), (first, third), (second, third).
for (String[] pair : new String[][] {{first, second}, {first, third}, {second, third}}) {
    calculateTextSimilarity(pair[0], pair[1]);
}

### Finding the Distance Between Texts


In [None]:
%%loadFromPOM
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-text</artifactId>
    <version>1.6</version>
</dependency>

In [None]:
import org.apache.commons.text.similarity.HammingDistance;
import org.apache.commons.text.similarity.LevenshteinDetailedDistance;
import org.apache.commons.text.similarity.LevenshteinDistance;
import org.apache.commons.text.similarity.LevenshteinResults;

In [None]:
// Hamming distance of "bat" against each candidate; all inputs here have the
// same length.
HammingDistance hammingDistance = new HammingDistance();
for (String candidate : new String[] {"bat", "cat", "rut"}) {
    System.out.println("Hamming Distance: " + hammingDistance.apply("bat", candidate));
}

In [None]:
// Levenshtein distance of "bat" against each candidate; unlike the Hamming
// example, the last candidate ("battle") has a different length.
LevenshteinDistance levenshteinDistance = new LevenshteinDistance();
for (String candidate : new String[] {"bat", "rat", "rut", "battle"}) {
    System.out.println("Levenshtein Distance: " + levenshteinDistance.apply("bat", candidate));
}

In [None]:
// The detailed variant returns a LevenshteinResults object that breaks the
// comparison of "similar" and "simulator" down by kind of edit.
LevenshteinDetailedDistance levenshteinDetailedDistance = new LevenshteinDetailedDistance();
LevenshteinResults levenshteinResults =
        levenshteinDetailedDistance.apply("similar", "simulator");

// Print each label followed by its count on the same line.
System.out.print("Number of deletions: ");
System.out.println(levenshteinResults.getDeleteCount());
System.out.print("Number of insertions: ");
System.out.println(levenshteinResults.getInsertCount());
System.out.print("Number of substitutions: ");
System.out.println(levenshteinResults.getSubstituteCount());