In [None]:
//#r "nuget: Catalyst.Spacy, 1.0.31087"
#r "nuget: Catalyst, 1.0.31087"
#r "nuget: Catalyst.Models.English, 1.0.30952"

In [None]:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Catalyst;
using Catalyst.Models;
using Version = Mosaik.Core.Version;
using Mosaik.Core;
using P = Catalyst.PatternUnitPrototype;
using System.Globalization;
using System.Text.RegularExpressions;

In [None]:
// Register the English models with Catalyst before building any pipeline.
// Each language has to be pre-registered, and its NuGet model package installed.
Catalyst.Models.English.Register();

In [None]:
/// <summary>
/// Converts <paramref name="text"/> to title case using en-US casing rules.
/// </summary>
/// <remarks>
/// The input is lower-cased first because <see cref="TextInfo.ToTitleCase(string)"/>
/// leaves words that are entirely upper case unchanged. Both steps use the same
/// en-US <see cref="TextInfo"/>, so the result does not depend on the current thread
/// culture (under tr-TR, a culture-sensitive lower-casing would turn "I" into a dotless "ı").
/// </remarks>
/// <param name="text">Text to convert; must not be null.</param>
/// <returns>The title-cased text, e.g. "SPIDER MAN" becomes "Spider Man".</returns>
public string ToTitleCase(string text)
{
    TextInfo enUsTextInfo = new CultureInfo("en-US", false).TextInfo;
    return enUsTextInfo.ToTitleCase(enUsTextInfo.ToLower(text));
}

/// <summary>
/// Rewrites every run of two or more capital letters A-Z that is delimited by word
/// boundaries (e.g. "EDWARD") into title case via <c>ToTitleCase</c>; all other text is kept as is.
/// </summary>
/// <param name="text">Text that may contain ALL-CAPS words.</param>
/// <returns>The text with each matched ALL-CAPS run title-cased.</returns>
public string AllCapsToTitleCase(string text) {
    // Single capital letters (such as "I" or "A") are not matched because of the {2,} quantifier.
    const string allCapsRun = @"\b[A-Z]{2,}\b";
    MatchEvaluator titleCaseHit = hit => ToTitleCase(hit.Value);

    return Regex.Replace(text, allCapsRun, titleCaseHit);
}

/// <summary>
/// Runs a Catalyst English pipeline over <paramref name="text"/> and returns the distinct
/// values of all entities whose type is "Person".
/// </summary>
/// <param name="text">Text to scan; ALL-CAPS words are title-cased before processing.</param>
/// <param name="knownCharacterNames">
/// Names registered up front (as given and upper-cased) with both the Neuralyzer and the Spotter.
/// </param>
/// <returns>Distinct "Person" entity values found in the processed document.</returns>
private async Task<List<string>> GetCharacterNames(string text, List<string> knownCharacterNames) {
    // Normalise ALL-CAPS words before the text reaches the pipeline.
    var normalizedText = AllCapsToTitleCase(text);

    var pipeline = await Pipeline.ForAsync(Language.English);
    var wikiNer = await AveragePerceptronEntityRecognizer.FromStoreAsync(language: Language.English, version: Version.Latest, tag: "WikiNER");
    pipeline.Add(wikiNer);

    var personPatterns = new Neuralyzer(Language.English, 0, "WikiNER-sample-fixes");
    var nameSpotter = new Spotter(Language.Any, 0, "character name", "Person");

    foreach (var knownName in knownCharacterNames) {
        // Teach the Neuralyzer to add the entity type Person when a single token equals the known name...
        personPatterns.TeachAddPattern("Person", knownName, mp => mp.Add(new PatternUnit(P.Single().WithToken(knownName))));

        // ...and likewise for its upper-cased spelling.
        var knownNameUpper = knownName.ToUpper();
        personPatterns.TeachAddPattern("Person", knownNameUpper, mp => mp.Add(new PatternUnit(P.Single().WithToken(knownNameUpper))));

        nameSpotter.AddEntry(knownName);
        nameSpotter.AddEntry(knownNameUpper);
    }

    pipeline.UseNeuralyzer(personPatterns);
    pipeline.Add(nameSpotter);

    var document = new Document(normalizedText, Language.English);
    pipeline.ProcessSingle(document);

    var personEntities = document
        .SelectMany(span => span.GetEntities())
        .Where(entity => entity.EntityType.Type == "Person");

    var distinctNames = personEntities
        .Select(entity => $"{entity.Value}")
        .Distinct();

    return distinctNames.ToList();
}

/// <summary>
/// Replaces character names in a text and its summary with stable placeholders of the
/// form <c>CHARACTER{index}</c>.
/// </summary>
/// <remarks>
/// Every known character gets one index shared by all of its aliases: the name as given,
/// plus the title-case and upper-case form of each space-separated part. For example
/// "Tony Stark", "Tony", "TONY", "Stark" and "STARK" all map to the same index.
/// Names detected by <c>GetCharacterNames</c> that are not already aliases get further indices.
/// Matching is case-sensitive and only hits whole names (not substrings of longer words).
/// </remarks>
/// <param name="originalFull">Full text to anonymize; null is treated as empty.</param>
/// <param name="originalSummary">Summary text to anonymize; null is treated as empty.</param>
/// <param name="knownCharacterNames">Character names known up front; blank entries are ignored.</param>
/// <returns>
/// The anonymized full text, the anonymized summary, and the alias-to-index dictionary
/// (which <c>DeAnonymize</c> accepts to restore names).
/// </returns>
private async Task<(string, string, Dictionary<string, int>)> AnonymizeCharacters(string originalFull, string originalSummary, List<string> knownCharacterNames) {
    var detectedNames = await GetCharacterNames(originalFull + "\n\n" + originalSummary, knownCharacterNames);

    var namesToIndex = new Dictionary<string, int>();

    // Registers one alias. The first index assigned to a spelling wins, and blank aliases
    // are skipped because an empty search string cannot be replaced.
    void AddAlias(string alias, int index) {
        if (!string.IsNullOrWhiteSpace(alias)) {
            namesToIndex.TryAdd(alias, index);
        }
    }

    var curNameNum = 0;

    // Known characters: the full name plus the title-case / upper-case form of each part.
    foreach (var name in knownCharacterNames) {
        AddAlias(name, curNameNum);

        // RemoveEmptyEntries guards against doubled or leading/trailing spaces in a name.
        foreach (var part in name.Split(new[] { ' ' }, StringSplitOptions.RemoveEmptyEntries)) {
            AddAlias(ToTitleCase(part), curNameNum);
            AddAlias(part.ToUpper(), curNameNum);
        }

        curNameNum += 1;
    }

    // Detected names that are not yet an alias of a known character become new characters.
    foreach (var name in detectedNames) {
        if (string.IsNullOrWhiteSpace(name) || namesToIndex.ContainsKey(name)) {
            continue;
        }

        AddAlias(name, curNameNum);
        AddAlias(ToTitleCase(name), curNameNum);
        AddAlias(name.ToUpper(), curNameNum);

        curNameNum += 1;
    }

    // Search terms: every alias plus its upper-case spelling. Aliases are visited longest
    // first (the sort is stable, so insertion order breaks ties) and the first index
    // registered for a spelling is kept.
    var replacements = new Dictionary<string, int>(StringComparer.Ordinal);
    foreach (var alias in namesToIndex.Keys.OrderByDescending(x => x.Length)) {
        replacements.TryAdd(alias, namesToIndex[alias]);
        replacements.TryAdd(alias.ToUpper(), namesToIndex[alias]);
    }

    if (replacements.Count == 0) {
        // Nothing to anonymize; an empty alternation would otherwise match everywhere.
        return (originalFull ?? string.Empty, originalSummary ?? string.Empty, namesToIndex);
    }

    // One pass with all names as alternatives, longest first so "Tony Stark" wins over "Tony".
    // The lookarounds keep "Will" from matching inside "Willing", and the single pass means
    // placeholders that were already inserted are never rescanned for names.
    var alternatives = replacements.Keys
        .OrderByDescending(x => x.Length)
        .Select(Regex.Escape);
    var namePattern = new Regex(@"(?<!\w)(?:" + string.Join("|", alternatives) + @")(?!\w)", RegexOptions.CultureInvariant);

    string Anonymize(string input) {
        return namePattern.Replace(input ?? string.Empty, match => $"CHARACTER{replacements[match.Value]}");
    }

    // The two texts are processed independently, so no separator has to survive the replacement.
    return (Anonymize(originalFull), Anonymize(originalSummary), namesToIndex);
}

/// <summary>
/// Replaces <c>CHARACTER{index}</c> placeholders in <paramref name="text"/> with a real name
/// taken from <paramref name="namesToIndex"/>.
/// </summary>
/// <remarks>
/// Given "CHARACTER0 talks with CHARACTER1", each placeholder is replaced with one alias of
/// that index. All placeholders are resolved in a single pass, so any number of characters is
/// supported and "CHARACTER10" is never mistaken for "CHARACTER1" followed by "0".
/// Placeholders whose index has no alias are left unchanged.
/// </remarks>
/// <param name="text">Text containing placeholders; null or empty is returned as is.</param>
/// <param name="namesToIndex">Alias-to-index dictionary.</param>
/// <param name="useShortestName">
/// True to restore the shortest alias of each character, false to restore the longest.
/// Ties go to the alias that enumerates first in <paramref name="namesToIndex"/>.
/// </param>
/// <returns>The text with every resolvable placeholder replaced.</returns>
private string DeAnonymize(string text, Dictionary<string, int> namesToIndex, bool useShortestName) {
    if (string.IsNullOrEmpty(text)) {
        return text;
    }

    // Choose the display name for every index once, up front.
    var displayNames = new Dictionary<int, string>();
    foreach (var aliasGroup in namesToIndex.GroupBy(kvp => kvp.Value)) {
        var aliases = aliasGroup.Select(kvp => kvp.Key);

        // OrderBy / OrderByDescending are stable, so equally long aliases keep their original order.
        var ranked = useShortestName
            ? aliases.OrderBy(x => x.Length)
            : aliases.OrderByDescending(x => x.Length);

        displayNames[aliasGroup.Key] = ranked.First();
    }

    // Greedy [0-9]+ consumes the whole number, so multi-digit indices are matched as a unit.
    return Regex.Replace(text, @"CHARACTER([0-9]+)", match => {
        var isKnownIndex =
            int.TryParse(match.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out var index)
            && displayNames.ContainsKey(index);

        return isKnownIndex ? displayNames[index] : match.Value;
    });
}

In [None]:
// Sample inputs mixing Title-Case and ALL-CAPS spellings of the same names.
// Only test2 is passed to AnonymizeCharacters in this cell.
var test1 = "EDWARD spoke to Will in his Paris apartment, after going to Pizza Hut for dinner with Mary and SPIDER MAN. Later, Mary and WILL go dancing.";
var test2 = "Tony and Stane fight as Spider Man and another man look on. STARK grabs OBADIAH and SPIDER MAN and the other man are like whoa.";
var test3 = "Tony Stark and Obadiah Stane fight.";
// Alternative known-name lists; exactly one declaration of knownCharacterNames should be active.
//List<string> knownCharacterNames = new(){ "Edward", "Will", "Mary", "Spider Man" }; // optional list we can get from the Character list. Names like "Will" and "Spider Man" don't match on their own
List<string> knownCharacterNames = new(){ "Tony Stark", "Obadiah Stane", "Spider Man" };
//List<string> knownCharacterNames = new(){ "Tony", "Obadiah", "Spider Man" };
//List<string> knownCharacterNames = new();

// Anonymize test2; the summary argument is just a placeholder string here.
var (anonymizedFullText, anonymizedSummaryText, namesToIndex) = await AnonymizeCharacters(test2, "summary goes here", knownCharacterNames);

// Bare expression: presumably rendered by the notebook as this cell's output.
anonymizedFullText

CHARACTER0 and CHARACTER1 fight as CHARACTER2 and another man look on. CHARACTER0 grabs CHARACTER1 and CHARACTER2 and the other man are like whoa.

In [None]:
// Placeholder text identical to the anonymized form of test2 shown in the previous cell's output.
var reverseTest1 = "CHARACTER0 and CHARACTER1 fight as CHARACTER2 and another man look on. CHARACTER0 grabs CHARACTER1 and CHARACTER2 and the other man are like whoa.";

// useShortestName is false, so each placeholder is restored with the longest alias in namesToIndex.
var deAnonTest = DeAnonymize(reverseTest1, namesToIndex, false);
deAnonTest

Tony Stark and Obadiah Stane fight as Spider Man and another man look on. Tony Stark grabs Obadiah Stane and Spider Man and the other man are like whoa.

In [None]:
// Screenplay-formatted sample scene: sluglines and speaker cues are in ALL CAPS,
// while the action lines use normal capitalisation.
var realFull1 = @"
EXT. FIELD - PRE-DAWN

Exhausted from playing fetch all night, Edward throws the stick into the woods. The still-spry dog goes after it. It's gone for a long time, long enough that Edward becomes concerned.

He follows it into the woods.

INT. WOODS - DAWN

Amos Calloway stands up behind a bush, buck naked and hairy. He still has the stick in his mouth, which he takes out as Edward approaches.

AMOS
Didn't kill anything, did I?

EDWARD
A few rabbits, but I think one of them was already dead.

AMOS
That would explain the indigestion.

Edward tosses him his jacket to cover his privates.

AMOS (CONT'D)
I was wrong about you kid. You may not have much, but what you got, you got a lot of. You could get any girl.

EDWARD
There's only one I want.

A beat.

AMOS
Her name is Sandra Templeton. She's going to Auburn. The semester's almost over, so you better hurry.

EDWARD
Thank you.

AMOS
Good luck, kid.

Edward walks away. Then starts running. He has to get there as soon as possible.

Amos sits down and scratches his ear with his foot.

EXT. BIG TOP - DAY 

Edward shakes Karl's giant hand. They hug.

EDWARD (V.O.)
After saying my goodbyes, I hopped three trains to get to Auburn that afternoon.

";

// Summary of the scene above; it is anonymized with the same name-to-index mapping as the full text.
var realSummary1 = "Amos turns back into a human, and thanks Edward, telling him the girl's name is Sandra Templeton, and she's going to Auburn. Edward bids farewell to Karl and heads to Auburn.";

// Known characters. Note that "Karl" appears in the text but is not in this list.
List<string> knownCharacterNames = new(){"Edward Hill", "Amos Calloway", "Sandra Templeton"};

var (anonymizedFullText, anonymizedSummaryText, namesToIndex) = await AnonymizeCharacters(realFull1, realSummary1, knownCharacterNames);

// Bare expression: presumably rendered by the notebook as this cell's output.
anonymizedSummaryText

CHARACTER1 turns back into a human, and thanks CHARACTER0, telling him the girl's name is CHARACTER2, and she's going to Auburn. CHARACTER0 bids farewell to CHARACTER4 and heads to Auburn.

In [None]:
// Restore names in the anonymized summary; useShortestName is false, so the longest alias of each character is used.
var deAnonTest = DeAnonymize(anonymizedSummaryText, namesToIndex, false);
deAnonTest

Amos Calloway turns back into a human, and thanks Edward Hill, telling him the girl's name is Sandra Templeton, and she's going to Auburn. Edward Hill bids farewell to Karl and heads to Auburn.