## Finding Semantic Similarity

In [1]:
%%loadFromPOM
<!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text -->
<dependency>
    <groupId>org.apache.commons</groupId>
    <artifactId>commons-text</artifactId>
    <version>1.6</version>
</dependency>

In [2]:
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.commons.text.similarity.CosineSimilarity;

In [4]:
// Two short sample sentences to compare.
String firstSample = "A simple sentence";
String secondSample = "One simple sentence";

// Scorer used by every comparison in this notebook.
CosineSimilarity cosineSimilarity = new CosineSimilarity();

// Create sentence vectors with lambda expressions: split("") yields one
// single-character string per element, and the grouping collector adds 1
// for every occurrence, giving a character -> count map.
Map<CharSequence, Integer> vectorA = Arrays.stream(firstSample.split(""))
        .collect(Collectors.groupingBy(symbol -> symbol, Collectors.summingInt(symbol -> 1)));
Map<CharSequence, Integer> vectorB = Arrays.stream(secondSample.split(""))
        .collect(Collectors.groupingBy(symbol -> symbol, Collectors.summingInt(symbol -> 1)));

// Print the cosine similarity of the two character-count vectors
System.out.printf("%5.4f\n", cosineSimilarity.cosineSimilarity(vectorA, vectorB));

0.9659


java.io.PrintStream@6fb30523

In [5]:
// Without lambda: count the characters of secondSample with a plain loop
HashMap<CharSequence, Integer> vectorC = new HashMap<>();
for (char character : secondSample.toCharArray()) {
    // Build the single-character key once instead of re-concatenating it for each map call.
    String key = String.valueOf(character);
    // getOrDefault supplies 0 for an unseen character, replacing the null check and second lookup.
    vectorC.put(key, vectorC.getOrDefault(key, 0) + 1);
}

// Compare against vectorA as currently defined in the notebook session
System.out.printf("%5.4f\n", cosineSimilarity.cosineSimilarity(vectorA, vectorC));


0.9659


java.io.PrintStream@6fb30523

In [7]:
// Rebuild both vectors at word granularity: the samples are split on a single
// space and each distinct token is mapped to its number of occurrences.
vectorA = Arrays.stream(firstSample.split(" "))
        .collect(Collectors.groupingBy(token -> token, Collectors.summingInt(token -> 1)));
vectorB = Arrays.stream(secondSample.split(" "))
        .collect(Collectors.groupingBy(token -> token, Collectors.summingInt(token -> 1)));

// Print the cosine similarity of the word-count vectors
System.out.printf("%5.4f\n", cosineSimilarity.cosineSimilarity(vectorA, vectorB));

0.6667


java.io.PrintStream@6fb30523

In [13]:
/**
 * Prints the cosine similarity of two phrases using the format "%5.4f" followed by a newline.
 *
 * Each phrase is broken up with split(""), so the vectors being compared hold
 * per-character occurrence counts (spaces and punctuation included), not word counts.
 * Scoring is delegated to the cosineSimilarity instance declared outside this method.
 *
 * @param phrase1 first phrase to compare
 * @param phrase2 second phrase to compare
 */
void calculateTextSimilarity(String phrase1, String phrase2) {
    // character -> number of occurrences in phrase1
    Map<CharSequence, Integer> firstCounts = Arrays.stream(phrase1.split(""))
            .collect(Collectors.toMap(character -> character, character -> 1, Integer::sum));
    // character -> number of occurrences in phrase2
    Map<CharSequence, Integer> secondCounts = Arrays.stream(phrase2.split(""))
            .collect(Collectors.toMap(character -> character, character -> 1, Integer::sum));

    Double score = cosineSimilarity.cosineSimilarity(firstCounts, secondCounts);
    System.out.printf("%5.4f\n", score);
}

In [14]:
// Sample sentences for the pairwise comparison
String first = "I kid you not this is amazing, right?";
String second = "I can understand this is awesome!";
String third = "Is this outright not nice?";

// Score every pair, in the order first/second, first/third, second/third
for (String[] pair : new String[][] {{first, second}, {first, third}, {second, third}}) {
    calculateTextSimilarity(pair[0], pair[1]);
}

0.7938
0.8898
0.7817
