In [None]:
#r "nuget: Catalyst, 1.0.31087"
#r "nuget: Catalyst.Models.English, 1.0.30952"

In [None]:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Catalyst;
using Catalyst.Models;
using Version = Mosaik.Core.Version;
using Mosaik.Core;
using P = Catalyst.PatternUnitPrototype;
using System.Globalization;
using System.Text.RegularExpressions;

In [None]:
// Register the English models with Catalyst before any pipeline for Language.English is requested.
Catalyst.Models.English.Register(); //You need to pre-register each language (and install the respective NuGet Packages)

In [None]:
/// <summary>
/// Converts <paramref name="text"/> to title case using en-US casing rules,
/// e.g. "SPIDER MAN" and "spider man" both become "Spider Man".
/// </summary>
/// <param name="text">The text to convert. Must not be null.</param>
/// <returns>The text with each word's first letter upper-cased and the remaining letters lower-cased.</returns>
/// <exception cref="ArgumentNullException"><paramref name="text"/> is null.</exception>
public string ToTitleCase(string text)
{
    TextInfo myTI = new CultureInfo("en-US", false).TextInfo;

    // TextInfo.ToTitleCase leaves words that are entirely upper case untouched (it treats them
    // as acronyms), so the text has to be lower-cased first.
    // Lower-case with the same en-US TextInfo instead of string.ToLower(): the latter follows the
    // thread's current culture, which under e.g. tr-TR maps 'I' to a dotless 'ı' and garbles names.
    return myTI.ToTitleCase(myTI.ToLower(text));
}

/// <summary>
/// Rewrites every run of two or more ASCII capital letters that is delimited by word
/// boundaries (for example "EDWARD") in title case, and leaves all other text unchanged.
/// </summary>
/// <param name="text">The text to scan.</param>
/// <returns>A copy of <paramref name="text"/> with each matched run passed through ToTitleCase.</returns>
public string AllCapsToTitleCase(string text) {
    // Single capital letters (e.g. "I", or the "S" in "JACK'S") do not match because of {2,}.
    const string allCapsRun = @"\b[A-Z]{2,}\b";

    return Regex.Replace(text, allCapsRun, shouted => ToTitleCase(shouted.Value));
}

/// <summary>
/// Runs a Catalyst English pipeline over <paramref name="text"/> and collects the distinct
/// values of every entity whose type is "Person". Each result is also written to the console.
/// </summary>
/// <param name="text">Text to analyse; all-caps words are title-cased before processing.</param>
/// <param name="knownCharacterNames">
/// Names that should be recognised as "Person" even if the WikiNER model misses them.
/// Only names that occur in the (title-cased) text, as given or upper-cased, are taught.
/// </param>
/// <returns>The distinct entity values of type "Person" found in the document.</returns>
private async Task<List<string>> GetCharacterNames(string text, List<string> knownCharacterNames) {
    text = AllCapsToTitleCase(text);

    // NOTE(review): the pipeline cleaned up by the next two calls is dropped when `pipeline` is
    // reassigned right after; presumably ForAsync hands back shared state so the cleanup still
    // matters - confirm against Catalyst before simplifying.
    var pipeline = await Pipeline.ForAsync(Language.English);
    pipeline.RemoveAllNeuralizers();
    pipeline.RemoveAll(p => true); // remove previously added entities
    pipeline = await Pipeline.ForAsync(Language.English);
    pipeline.Add(await AveragePerceptronEntityRecognizer.FromStoreAsync(language: Language.English, version: Version.Latest, tag: "WikiNER"));

    var fixes = new Neuralyzer(Language.English, 0, "WikiNER-sample-fixes");
    var nameSpotter = new Spotter(Language.Any, 0, "character name", "Person");

    foreach (var candidate in knownCharacterNames) {
        // Skip names that do not occur in the text in either casing.
        if (!text.Contains(candidate) && !text.Contains(candidate.ToUpper())) {
            continue;
        }

        var shouted = candidate.ToUpper();

        // Teach the Neuralyzer to tag a single token equal to the name (as given and upper-cased) as Person.
        fixes.TeachAddPattern("Person", candidate, mp => mp.Add(new PatternUnit(P.Single().WithToken(candidate))));
        fixes.TeachAddPattern("Person", shouted, mp => mp.Add(new PatternUnit(P.Single().WithToken(shouted))));

        // Register both casings with the spotter as well.
        nameSpotter.AddEntry(candidate);
        nameSpotter.AddEntry(shouted);
    }

    pipeline.UseNeuralyzer(fixes);
    pipeline.Add(nameSpotter);

    var document = new Document(text, Language.English);
    pipeline.ProcessSingle(document);

    var people = document
        .SelectMany(span => span.GetEntities())
        .Where(entity => entity.EntityType.Type == "Person")
        .Select(entity => $"{entity.Value}")
        .Distinct()
        .ToList();

    foreach (var person in people) {
        Console.WriteLine(person);
    }

    return people;
}

/// <summary>
/// Replaces every mention of a known character in the scene text and in its summary with a
/// placeholder of the form <c>CHARACTER{n}</c>, where <c>n</c> identifies the character.
/// </summary>
/// <remarks>
/// All ways of referring to one character share one index. For "Tony Stark" the aliases are
/// "Tony Stark", "TONY STARK", "Tony", "TONY", "Stark" and "STARK" (all index 0); the next
/// character gets index 1, and so on. A known character is only indexed when its full name or
/// its first name (as given or upper-cased) occurs in one of the two texts.
/// Aliases are matched case-sensitively and only as whole words, so "Will" does not touch
/// "Willing". Longer aliases take precedence over shorter ones.
/// </remarks>
/// <param name="originalFull">The full scene text; null is treated as empty.</param>
/// <param name="originalSummary">The summary of the scene; null is treated as empty.</param>
/// <param name="knownCharacterNames">Character names to anonymize; null, empty and blank entries are ignored.</param>
/// <returns>
/// The anonymized full text, the anonymized summary, and the alias-to-index lookup that
/// DeAnonymize needs to restore the names.
/// </returns>
private async Task<(string, string, Dictionary<string, int>)> AnonymizeCharacters(string originalFull, string originalSummary, List<string> knownCharacterNames) {
    originalFull ??= "";
    originalSummary ??= "";
    knownCharacterNames ??= new List<string>();

    // NER-based detection is switched off for now, so this list is always empty (which is also why
    // this async method contains no await). The merge loop below is kept so that detection can be
    // re-enabled on this line alone.
    var detectedNames = new List<string>(); //knownCharacterNames;//await GetCharacterNames(originalFull + "\n\n" + originalSummary, knownCharacterNames);
    Console.WriteLine("DETECTED: " + string.Join(", ", detectedNames));

    var namesToIndex = new Dictionary<string, int>();
    var curNameNum = 0;

    // Registers an alias unless it is empty or already owned by a character (first owner wins).
    void AddAlias(string alias, int characterIndex) {
        if (alias.Length > 0) {
            namesToIndex.TryAdd(alias, characterIndex);
        }
    }

    foreach (var rawName in knownCharacterNames) {
        if (string.IsNullOrWhiteSpace(rawName)) {
            continue;
        }

        var name = rawName.Trim();

        // RemoveEmptyEntries: a doubled space must not produce an empty alias.
        var nameParts = name.Split(' ', StringSplitOptions.RemoveEmptyEntries);

        var mentionForms = new List<string> { name, name.ToUpper() };
        if (nameParts.Length > 1) {
            mentionForms.Add(nameParts[0]);
            mentionForms.Add(nameParts[0].ToUpper());
        }

        // ensure the full or summary text contains either full name or first name
        if (!mentionForms.Any(form => originalFull.Contains(form) || originalSummary.Contains(form))) {
            continue;
        }

        // Already covered, e.g. a duplicate entry or "Tony" listed after "Tony Stark".
        // (Dictionary.Add would throw here.)
        if (namesToIndex.ContainsKey(name)) {
            continue;
        }

        AddAlias(name, curNameNum);
        AddAlias(name.ToUpper(), curNameNum);

        foreach (var part in nameParts) {
            AddAlias(ToTitleCase(part), curNameNum);
            AddAlias(part.ToUpper(), curNameNum);
        }

        curNameNum += 1;
    }

    // Detected names that no known character claimed each become a character of their own.
    foreach (var detected in detectedNames) {
        if (string.IsNullOrWhiteSpace(detected) || namesToIndex.ContainsKey(detected)) {
            continue;
        }

        AddAlias(detected, curNameNum);
        AddAlias(ToTitleCase(detected), curNameNum);
        AddAlias(detected.ToUpper(), curNameNum);

        curNameNum += 1;
    }

    if (namesToIndex.Count == 0) {
        return (originalFull, originalSummary, namesToIndex);
    }

    // Map every alias (and its upper-cased form) to its placeholder. Walking the aliases longest
    // first keeps the original precedence when two aliases upper-case to the same text.
    var placeholders = new Dictionary<string, string>();
    foreach (var alias in namesToIndex.Keys.OrderByDescending(x => x.Length)) {
        var placeholder = $"CHARACTER{namesToIndex[alias]}";
        placeholders.TryAdd(alias, placeholder);
        placeholders.TryAdd(alias.ToUpper(), placeholder);
    }

    // One alternation, longest alias first so "Tony Stark" wins over "Tony". The lookarounds
    // demand a non-word character (or the text edge) on both sides, which keeps "Man" from
    // matching inside "Manager" while still matching "JACK" in "JACK'S".
    var alternatives = placeholders.Keys.OrderByDescending(x => x.Length).Select(Regex.Escape);
    var aliasMatcher = new Regex(@"(?<!\w)(?:" + string.Join("|", alternatives) + @")(?!\w)");

    // The two texts are processed separately, so no separator has to survive the replacement.
    string Anonymize(string input) => aliasMatcher.Replace(input, hit => placeholders[hit.Value]);

    return (Anonymize(originalFull), Anonymize(originalSummary), namesToIndex);
}

/// <summary>
/// Replaces <c>CHARACTER{n}</c> placeholders in <paramref name="text"/> with a real name taken
/// from <paramref name="namesToIndex"/>, e.g. "CHARACTER0 talks with CHARACTER1".
/// </summary>
/// <param name="text">Text containing placeholders; returned unchanged when null or empty.</param>
/// <param name="namesToIndex">Alias-to-index lookup as produced by AnonymizeCharacters.</param>
/// <param name="useShortestName">
/// True to insert the shortest alias of each character, false to insert the longest.
/// Among aliases of equal length the one that comes first in the dictionary is used.
/// </param>
/// <returns>The text with every placeholder whose index has at least one alias replaced; other placeholders are left as they are.</returns>
private string DeAnonymize(string text, Dictionary<string, int> namesToIndex, bool useShortestName) {
    if (string.IsNullOrEmpty(text) || namesToIndex == null || namesToIndex.Count == 0) {
        return text;
    }

    // Choose one display name per character index. The strict comparison keeps the first alias on ties.
    var displayNames = new Dictionary<int, string>();
    foreach (var entry in namesToIndex) {
        var isBetter = !displayNames.TryGetValue(entry.Value, out var current)
            || (useShortestName ? entry.Key.Length < current.Length : entry.Key.Length > current.Length);

        if (isBetter) {
            displayNames[entry.Value] = entry.Key;
        }
    }

    // Match the complete number in a single pass: there is no cap on the number of characters,
    // and "CHARACTER1" can no longer eat the front of "CHARACTER10".
    return Regex.Replace(text, @"CHARACTER([0-9]+)", token =>
        int.TryParse(token.Groups[1].Value, NumberStyles.None, CultureInfo.InvariantCulture, out var index)
            && displayNames.TryGetValue(index, out var realName)
            ? realName
            : token.Value);
}

In [None]:
// Hand-written sample sentences that mix title-case and all-caps character names.
string test1 = "EDWARD spoke to Will in his Paris apartment, after going to Pizza Hut for dinner with Mary and SPIDER MAN. Later, Mary and WILL go dancing.";
string test2 = "Tony and Stane fight as Spider Man and another man look on. STARK grabs OBADIAH and SPIDER MAN and the other man are like whoa.";
string test3 = "Tony Stark and Obadiah Stane fight.";

// Alternative name lists, kept for quick switching. Leave exactly one declaration active.
//List<string> knownCharacterNames = new(){ "Edward", "Will", "Mary", "Spider Man" }; // optional list we can get from the Character list. Names like "Will" and "Spider Man" don't match on their own
//List<string> knownCharacterNames = new(){ "Tony", "Obadiah", "Spider Man" };
//List<string> knownCharacterNames = new();
var knownCharacterNames = new List<string> { "Tony Stark", "Obadiah Stane", "Spider Man" };

// Anonymize the second sample together with a dummy summary.
var (anonymizedFullText, anonymizedSummaryText, namesToIndex) = await AnonymizeCharacters(
    originalFull: test2,
    originalSummary: "summary goes here",
    knownCharacterNames: knownCharacterNames);

anonymizedFullText

CHARACTER0 and CHARACTER1 fight as CHARACTER2 and another man look on. CHARACTER0 grabs CHARACTER1 and CHARACTER2 and the other man are like whoa.

In [None]:
// Round trip: feed placeholder text back through DeAnonymize, asking for the longest known name.
string reverseTest1 = "CHARACTER0 and CHARACTER1 fight as CHARACTER2 and another man look on. CHARACTER0 grabs CHARACTER1 and CHARACTER2 and the other man are like whoa.";

var deAnonTest = DeAnonymize(text: reverseTest1, namesToIndex: namesToIndex, useShortestName: false);
deAnonTest

Tony Stark and Obadiah Stane fight as Spider Man and another man look on. Tony Stark grabs Obadiah Stane and Spider Man and the other man are like whoa.

In [None]:
// Sample screenplay scene (characters Jack and Boss), passed as the full text to
// AnonymizeCharacters at the end of this cell.
// Verbatim string: line breaks are part of the value and "" stands for one double quote.
var realFull0 = @"
INT. JACK'S BEDROOM - NIGHT

Jack lies in bed, staring at the ceiling.

JACK (V.O.)
For six months. I could not sleep.

INT. COPY ROOM - DAY

Jack, sleepy, stands over a copy machine. His Starbucks cup sits on the lid, moving back and forth as the machine copies.

JACK (V.O.)
With insomnia, nothing is real. Everything is far away. Everything is a copy of a copy of a copy. 

Other people make copies, all with Starbucks cups, sipping.

Jack picks up his cup and his copies and leaves.

INT. JACK'S OFFICE - SAME

Jack, sipping, stares blankly at a Starbucks bag on the floor, full of newspapers and FAST FOOD GARBAGE.

JACK (V.O.)
When deep space exploration ramps up, it will be corporations that name everything. The IBM Stellar Sphere. The Philip Morris Galaxy. Planet Starbucks.

Jack looks up as a pudgy man, Jack's BOSS, enters, Starbucks cup in hand, and slides a stack of reports on Jack's desk.

BOSS
I'm going to need you out-of-town a little more this week. We've got some ""red-flags"" to cover.

JACK (V.O.)
It must've been Tuesday. he was wearing his ""cornflower-blue"" tie.

JACK
(listless management speak)
You want me to de-prioritize my current reports until you advise of a status upgrade?

BOSS
You need to make these your primary ""action items.""

JACK (V.O.)
He was full of pep. Must've had his grande latte enema.

BOSS
Here are your flight coupons. Call me from the road if there are any snags. Your itinerary...

Jack hides a yawn, pretends to listen.
";

// Second sample scene (Edward, Sandra, Will). Presumably meant to be used together with
// realSummary1 and knownCharacterNames1; the call at the end of this cell uses the *0 set.
// Verbatim string: line breaks are part of the value.
var realFull1 = @"
EXT. BLOOM HOUSE [MID/LATE '70'S] - DAY 

Sandra is watering the garden. Will (5) runs past her to greet Edward, just returned from another trip.

EDWARD (V.O.)
Ten thousand dollars is no fortune to most men. But it was enough to buy my wife a proper house with a white picket fence.

We reveal the Bloom house, the nicest one in the neighborhood. Edward kisses his wife.

EDWARD (V.O.)
And for that, it was all the riches a man could ever want.

Sandra drops the hose, letting it run on the lawn.

TRANSITION TO:
";

// Summaries that go with the two sample scenes; the first scene has none.
string realSummary0 = "";
string realSummary1 = "Edward returns home to his wife Sandra, and young Will. Edward spends the $10,000 on buying a nice house for his family.";

// Known character lists for the two sample scenes.
var knownCharacterNames0 = new List<string> { "Jack", "Boss" };
var knownCharacterNames1 = new List<string> { "Edward Bloom", "Don Price", "Sandra Templeton", "Will" };

// Anonymize the first scene; swap in the *1 variables to run the second one.
var (anonymizedFullText, anonymizedSummaryText, namesToIndex) = await AnonymizeCharacters(
    originalFull: realFull0,
    originalSummary: realSummary0,
    knownCharacterNames: knownCharacterNames0);

anonymizedFullText

DETECTED: 



INT. CHARACTER0'S BEDROOM - NIGHT

CHARACTER0 lies in bed, staring at the ceiling.

CHARACTER0 (V.O.)
For six months. I could not sleep.

INT. COPY ROOM - DAY

CHARACTER0, sleepy, stands over a copy machine. His Starbucks cup sits on the lid, moving back and forth as the machine copies.

CHARACTER0 (V.O.)
With insomnia, nothing is real. Everything is far away. Everything is a copy of a copy of a copy. 

Other people make copies, all with Starbucks cups, sipping.

CHARACTER0 picks up his cup and his copies and leaves.

INT. CHARACTER0'S OFFICE - SAME

CHARACTER0, sipping, stares blankly at a Starbucks bag on the floor, full of newspapers and FAST FOOD GARBAGE.

CHARACTER0 (V.O.)
When deep space exploration ramps up, it will be corporations that name everything. The IBM Stellar Sphere. The Philip Morris Galaxy. Planet Starbucks.

CHARACTER0 looks up as a pudgy man, CHARACTER0's CHARACTER1, enters, Starbucks cup in hand, and slides a stack of reports on CHARACTE

In [None]:
// Display the current alias -> character index lookup as the cell result.
namesToIndex

key,value
Edward Bloom,0
EDWARD BLOOM,0
Edward,0
EDWARD,0
Bloom,0
BLOOM,0
Sandra Templeton,1
SANDRA TEMPLETON,1
Sandra,1
SANDRA,1


In [None]:
// Admin workflow: a full screenplay scene is anonymized, an anonymized summary is generated
// from it, and the real character names then have to be put back into that summary.
// Losing the screenplay's all-uppercase name formatting in the process is acceptable here.
var deAnonTest = DeAnonymize(text: anonymizedSummaryText, namesToIndex: namesToIndex, useShortestName: false);
deAnonTest

Amos Calloway turns back into a human, and thanks Edward Hill, telling him the girl's name is Sandra Templeton, and she's going to Auburn. Edward Hill bids farewell to Karl and heads to Auburn.