In [None]:
#r "nuget: Catalyst, 1.0.31087"
#r "nuget: Catalyst.Models.English, 1.0.30952"

In [None]:

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Catalyst;
using Catalyst.Models;
using Version = Mosaik.Core.Version;
using Mosaik.Core;
using P = Catalyst.PatternUnitPrototype;
using System.Globalization;
using System.Text.RegularExpressions;

In [None]:
// Register the English models with Catalyst before any pipeline is built.
// Each language must be pre-registered this way (and its NuGet package installed, see the #r directives above).
Catalyst.Models.English.Register(); //You need to pre-register each language (and install the respective NuGet Packages)

In [None]:
/// <summary>
/// Converts <paramref name="text"/> to title case using en-US casing rules,
/// e.g. "SPIDER MAN" or "spider man" both become "Spider Man".
/// </summary>
/// <param name="text">Text to convert; must not be null.</param>
/// <returns>The text with each word's first letter upper-cased and the remaining letters lower-cased.</returns>
public string ToTitleCase(string text)
{
    // GetCultureInfo hands back a cached, read-only culture, so nothing is allocated per call.
    TextInfo enUsText = CultureInfo.GetCultureInfo("en-US").TextInfo;

    // TextInfo.ToTitleCase leaves words that are entirely upper-case untouched (it treats them as
    // acronyms), so the text is lower-cased first. The lower-casing deliberately uses the same en-US
    // rules as the title-casing instead of the current thread culture; otherwise a culture such as
    // tr-TR would turn "I" into a dotless "ı" and produce "Spıder".
    return enUsText.ToTitleCase(enUsText.ToLower(text));
}

/// <summary>
/// Rewrites every stand-alone run of two or more ASCII capital letters (e.g. "EDWARD") in title case,
/// leaving the rest of <paramref name="text"/> untouched.
/// </summary>
/// <param name="text">Text that may contain ALL-CAPS words.</param>
/// <returns>The text with each matched ALL-CAPS word passed through <c>ToTitleCase</c>.</returns>
public string AllCapsToTitleCase(string text) {
    // Word boundaries on both sides, at least two capitals in between: single capitals such as "I" are not matched.
    var allCapsWord = new Regex(@"\b[A-Z]{2,}\b");

    return allCapsWord.Replace(text, capsMatch => ToTitleCase(capsMatch.Value));
}

/// <summary>
/// Finds person names in <paramref name="text"/> by running a Catalyst English pipeline that combines the
/// "WikiNER" average-perceptron entity recognizer with a spotter seeded from <paramref name="knownCharacterNames"/>.
/// </summary>
/// <param name="text">Text to scan. ALL-CAPS words are title-cased before processing.</param>
/// <param name="knownCharacterNames">Names added to the spotter (as given and upper-cased) so they are tagged as "Person".</param>
/// <returns>The distinct values of all entities whose type is "Person". Each value is also written to the console.</returns>
private async Task<List<string>> GetCharacterNames(string text, List<string> knownCharacterNames) {
    var normalizedText = AllCapsToTitleCase(text);

    // NOTE(review): this first pipeline is emptied and then immediately replaced by a fresh one, so it looks
    // redundant. It is kept to preserve the original call sequence — confirm whether Catalyst needs it.
    var pipeline = await Pipeline.ForAsync(Language.English);
    pipeline.RemoveAll(p => true);
    pipeline = await Pipeline.ForAsync(Language.English);

    var wikiNerRecognizer = await AveragePerceptronEntityRecognizer.FromStoreAsync(language: Language.English, version: Version.Latest, tag: "WikiNER");
    pipeline.Add(wikiNerRecognizer);

    // The spotter tags exact matches of the known names as "Person" (an earlier Neuralyzer-based approach was dropped).
    var nameSpotter = new Spotter(Language.Any, 0, "character name", "Person");
    foreach (var known in knownCharacterNames) {
        nameSpotter.AddEntry(known);
        nameSpotter.AddEntry(known.ToUpper());
    }
    pipeline.Add(nameSpotter);

    var document = new Document(normalizedText, Language.English);
    pipeline.ProcessSingle(document);

    var personNames = document
        .SelectMany(span => span.GetEntities())
        .Where(entity => entity.EntityType.Type == "Person")
        .Select(entity => $"{entity.Value}")
        .Distinct()
        .ToList();

    foreach (var personName in personNames) {
        Console.WriteLine(personName);
    }

    return personNames;
}

/// <summary>
/// Replaces character names in a scene and its summary with CHARACTER{n} placeholders, where n is a
/// per-character index shared by every spelling of that character's name.
/// </summary>
/// <param name="originalFull">Full scene text. Null is treated as empty when replacing.</param>
/// <param name="originalSummary">Summary text. Null is treated as empty when replacing.</param>
/// <param name="knownCharacterNames">Known character names; these receive the lowest indices, in list order.</param>
/// <returns>
/// The anonymized full text, the anonymized summary, and the name-to-index lookup that
/// <c>DeAnonymize</c> needs to restore the names.
/// </returns>
private async Task<(string, string, Dictionary<string, int>)> AnonymizeCharacters(string originalFull, string originalSummary, List<string> knownCharacterNames) {
    var detectedNames = await GetCharacterNames(originalFull + "\n\n" + originalSummary, knownCharacterNames);
    Console.WriteLine("DETECTED: " + string.Join(", ", detectedNames));

    // Every way the same character might be referenced maps to one character index. For example:
    // Tony = 0, Stark = 0, Tony Stark = 0
    // Obadiah = 1, Stane = 1, Obadiah Stane = 1
    var namesToIndex = new Dictionary<string, int>();

    // Registers the title-case and upper-case spellings of a name under a character index.
    // TryAdd keeps whichever character claimed a spelling first instead of throwing on a clash.
    void AddCaseVariants(string name, int index) {
        namesToIndex.TryAdd(ToTitleCase(name), index);
        namesToIndex.TryAdd(name.ToUpper(), index);
    }

    // Replaces `name` only where it is not glued to other word characters, so "Don" does not
    // rewrite "Donut" and short names cannot match inside an already-inserted CHARACTER{n} placeholder.
    string ReplaceWholeWord(string text, string name, string placeholder) {
        return Regex.Replace(text, @"(?<!\w)" + Regex.Escape(name) + @"(?!\w)", placeholder);
    }

    // Applies every known spelling to one text, longest spelling first so that "Don Price"
    // is replaced before "Don" or "Price" can break it apart.
    string ReplaceNames(string text) {
        var result = text ?? string.Empty;

        foreach (var name in namesToIndex.Keys.OrderByDescending(x => x.Length)) {
            var placeholder = $"CHARACTER{namesToIndex[name]}";
            result = ReplaceWholeWord(result, name, placeholder);
            result = ReplaceWholeWord(result, name.ToUpper(), placeholder);
        }

        return result;
    }

    var curNameNum = 0;

    foreach (var name in knownCharacterNames) {
        // Skip blank entries and names that are already registered (duplicates, or a name that was
        // added earlier as part of a longer one). Previously these threw from Dictionary.Add.
        if (string.IsNullOrWhiteSpace(name) || !namesToIndex.TryAdd(name, curNameNum)) {
            continue;
        }

        // RemoveEmptyEntries: a double space must not register "" as a name, which cannot be replaced.
        foreach (var part in name.Split(' ', StringSplitOptions.RemoveEmptyEntries)) {
            AddCaseVariants(part, curNameNum);
        }

        curNameNum += 1;
    }

    // Detected names that are not already covered by a known character each become a new character.
    foreach (var name in detectedNames) {
        if (string.IsNullOrWhiteSpace(name) || !namesToIndex.TryAdd(name, curNameNum)) {
            continue;
        }

        AddCaseVariants(name, curNameNum);
        curNameNum += 1;
    }

    // The two texts are anonymized independently, so no join/split separator is needed and a text
    // that happens to contain a separator string can no longer be truncated.
    return (ReplaceNames(originalFull), ReplaceNames(originalSummary), namesToIndex);
}

/// <summary>
/// Restores real names in text containing CHARACTER{n} placeholders, e.g.
/// "CHARACTER0 talks with CHARACTER1" becomes "Tony talks with Stane".
/// </summary>
/// <param name="text">Text containing placeholders. Null or empty text is returned unchanged.</param>
/// <param name="namesToIndex">Lookup from each known spelling of a name to its character index.</param>
/// <param name="useShortestName">
/// True to substitute the shortest spelling registered for a character, false to substitute the longest.
/// </param>
/// <returns>The text with every placeholder that has a registered name replaced; other placeholders are left as-is.</returns>
private string DeAnonymize(string text, Dictionary<string, int> namesToIndex, bool useShortestName) {
    if (string.IsNullOrEmpty(text)) {
        return text;
    }

    // Choose one display name per character index up front. OrderBy/OrderByDescending are stable,
    // so among spellings of equal length the one enumerated first from the lookup wins.
    var nameByPlaceholder = new Dictionary<string, string>();
    foreach (var spellings in namesToIndex.GroupBy(kvp => kvp.Value, kvp => kvp.Key)) {
        var ranked = useShortestName
            ? spellings.OrderBy(x => x.Length)
            : spellings.OrderByDescending(x => x.Length);

        nameByPlaceholder[$"CHARACTER{spellings.Key}"] = ranked.First();
    }

    // Match each placeholder with all of its digits in a single pass. This removes the old
    // ten-character limit and stops CHARACTER1 from being substituted inside CHARACTER10.
    return Regex.Replace(
        text,
        "CHARACTER[0-9]+",
        placeholder => nameByPlaceholder.TryGetValue(placeholder.Value, out var realName) ? realName : placeholder.Value);
}

In [None]:
// Sample sentences that mix title-case and ALL-CAPS spellings of the same character names.
var test1 = "EDWARD spoke to Will in his Paris apartment, after going to Pizza Hut for dinner with Mary and SPIDER MAN. Later, Mary and WILL go dancing.";
var test2 = "Tony and Stane fight as Spider Man and another man look on. STARK grabs OBADIAH and SPIDER MAN and the other man are like whoa.";
var test3 = "Tony Stark and Obadiah Stane fight.";
// Alternative known-character lists for experimenting; only one declaration is active at a time.
//List<string> knownCharacterNames = new(){ "Edward", "Will", "Mary", "Spider Man" }; // optional list we can get from the Character list. Names like "Will" and "Spider Man" don't match on their own
List<string> knownCharacterNames = new(){ "Tony Stark", "Obadiah Stane", "Spider Man" };
//List<string> knownCharacterNames = new(){ "Tony", "Obadiah", "Spider Man" };
//List<string> knownCharacterNames = new();

// Anonymize test2 together with a placeholder summary; namesToIndex is reused by the de-anonymize cell below.
var (anonymizedFullText, anonymizedSummaryText, namesToIndex) = await AnonymizeCharacters(test2, "summary goes here", knownCharacterNames);

// Trailing expression: its value is what the notebook shows as this cell's output.
anonymizedFullText

CHARACTER0 and CHARACTER1 fight as CHARACTER2 and another man look on. CHARACTER0 grabs CHARACTER1 and CHARACTER2 and the other man are like whoa.

In [None]:
// Round-trip check: start from placeholder text and restore names using the namesToIndex built by the previous cell.
var reverseTest1 = "CHARACTER0 and CHARACTER1 fight as CHARACTER2 and another man look on. CHARACTER0 grabs CHARACTER1 and CHARACTER2 and the other man are like whoa.";

// useShortestName: false -> the longest registered spelling of each character is substituted.
var deAnonTest = DeAnonymize(reverseTest1, namesToIndex, false);
// Trailing expression: its value is what the notebook shows as this cell's output.
deAnonTest

Tony Stark and Obadiah Stane fight as Spider Man and another man look on. Tony Stark grabs Obadiah Stane and Spider Man and the other man are like whoa.

In [None]:
// Screenplay-formatted sample scene (Ruby and Gertie). Speaker cues are ALL-CAPS lines, which is the
// casing AllCapsToTitleCase normalises. It is declared here but not passed to AnonymizeCharacters in the visible cells.
var realFull0 = @"
INT. ROSSI HOME - RUBY'S ROOM - EVENING

Gertie sprawls across Ruby's bed. Ruby takes out a record and puts it on an old Fisher Price record player.

GERTIE
Does he work out, or are those like fishing muscles?

RUBY
OK, stop. You cannot date my brother.

GERTIE
Why?

RUBY
Because for you, ""dating"" just means sex, which I don't even want to think about!

GERTIE
Okay, I'm sure he doesn't need his little sister protecting him.

RUBY
Will you go back to trying to get with teachers? It's more entertaining.

Ruby drops the needle onto the record. ""My Pal Foot Foot"" by the Shaggs plays. Ruby dances across the room to Gertie.

GERTIE
Oh, okay.
(re: the music)
What the hell are we listening to?

Ruby grins and flops onto the bed next to Gertie, holding up the record cover with delight.

RUBY
The Shaggs!
(laughs)
This song is called ""My Pal Foot Foot.""

GERTIE
Ruby.

RUBY
Wait, wait...

The chorus of the song kicks in. It's weird.

RUBY
That's my favorite part, right there!

GERTIE
You find the weirdest shit. Did you get this in the dollar bin?
(beat)
Wait, I have a serious question. What's the sign for, um, for ""You're really smoking hot""?

RUBY
No.

GERTIE
What about like ""We should totally get it on""?

RUBY
No!

GERTIE
Is it just... this?

Gertie thrusts her hips up and down.

RUBY
Oh my God. No. Stop!

GERTIE
Then show me!

Ruby gives her a hard look, but then smiles, caving.
";

// Screenplay-formatted sample scene (Edward, Sandra, Don Price). This is the scene that is passed to
// AnonymizeCharacters further down in this cell, together with realSummary1 and knownCharacterNames1.
var realFull1 = @"
EXT. SORORITY HOUSE - DAY

Sandra walks out to him. She's smiling, confused, joyful and scared. All down Greek Street, STUDENTS are coming out to see the display.

SANDRA
Daffodils?

EDWARD
They're your favorite flower.

SANDRA
How did you get so many?

EDWARD
I called everywhere in five states and explained this was the only way I could get my wife to marry me.

Out of nowhere, a tear drops down Sandra's cheek. She wipes it off.

SANDRA
You don't even know me.

EDWARD
I have the rest of my life to find out.

From down the street...

A MAN'S VOICE
Sandra!

SANDRA
It's Don. Promise me you won't hurt him.

EDWARD
If that's what you want, I swear to it.

The adult DON PRICE arrives. He's 230 pounds of football-playing, Skynard-loving, fraternity-proud muscle. And he's pissed.

A gang of his BROTHERS walk behind him.

DON PRICE
Bloom!

EDWARD
Don.

DON PRICE
What the hell are you doing? This is my girl. Mine!

EDWARD
I didn't know she belonged to anybody.

Don Price decks him, knocking him down. Edward gets right back up, but makes no move to defend himself.

Unfazed, Don slugs him again.

SANDRA
Stop it!

DON PRICE
(ignoring)
What the matter, Bloom? Too scared to fight back?

EDWARD
I promised I wouldn't.

A beat. Don shrugs, fine. Then proceeds to kick Edward's ass nine ways to Sunday.

EDWARD (V.O.) (CONT'D)
While I took the beating of a lifetime, it was Don Price who was ultimately defeated.

As the ass-whupping continues, we

INTERCUT WITH:
";

// Summaries paired with the two sample scenes above (index 0 = Ruby/Gertie scene, index 1 = Edward/Sandra scene).
var realSummary0 = "In Ruby's room, Gertie talks more about how hot Leo is. Ruby plays a record by The Shaggs, which Gertie finds weird. Gertie presses Ruby mischievously shows what Gertie thinks is the sign for a flirtatious message for Leo.";
var realSummary1 = "Sandra is timid but amused as she walks out to greet Edward. Don Price and his fraternity brothers appear, and Sandra asks Edward not to hurt him. Don beats up Edward, who takes every punch to honor is promise to Sandra.";

// Known character names for each scene; list order determines the CHARACTER{n} index each one receives.
List<string> knownCharacterNames0 = new(){"Ruby", "Gertie", "Leo"};
List<string> knownCharacterNames1 = new(){"Edward", "Don Price", "Sandra"};

// Anonymize scene 1. This re-declares the variables from the earlier demo cell with this scene's results.
var (anonymizedFullText, anonymizedSummaryText, namesToIndex) = await AnonymizeCharacters(realFull1, realSummary1, knownCharacterNames1);

//anonymizedFullText

Sandra
Sandra Daffodils
Edward
Edward I
Don Price
Don Price Bloom
Edward Don
DETECTED: Sandra, Sandra Daffodils, Edward, Edward I, Don Price, Don Price Bloom, Edward Don


In [None]:
// Trailing expression: shows the name-to-index lookup produced by the previous cell as this cell's output.
namesToIndex

key,value
Edward,0
EDWARD,0
Don Price,1
Don,1
DON,1
Price,1
PRICE,1
Sandra,2
SANDRA,2
Sandra Daffodils,3


In [None]:
// Admin workflow: take a full screenplay scene, anonymize it, generate a summary from the anonymized text,
// then put the real character names back into that summary. Losing the screenplay's ALL-CAPS formatting
// in the restored names is acceptable here.
// NOTE(review): the saved output under this cell mentions names (e.g. Amos Calloway, Karl) that do not occur
// in realSummary1, so it looks stale from an earlier run — re-run the cell to confirm.
var deAnonTest = DeAnonymize(anonymizedSummaryText, namesToIndex, false);
deAnonTest

Amos Calloway turns back into a human, and thanks Edward Hill, telling him the girl's name is Sandra Templeton, and she's going to Auburn. Edward Hill bids farewell to Karl and heads to Auburn.