In [1]:
using System;
using System.IO;
using System.Collections.Generic;
using System.Linq;

# The plan

We scan the list in two passes.

* In the first pass, we collect the known types and occupations for each node.
* In the second pass, for each row, we look up the type/occupation of the row's subject and record a count of the row's property on that type, counting each property at most once per subject.

Then we can use this to write out a table of type-property-counts:

    type    property    count

We can then use this table to rate nodes.

# Configuration

In [2]:
// Data source: path of the edge CSV. Both passes below stream this file
// through ParseEdge, which reads the first three comma-separated fields
// of each line as numeric subject, property and object IDs.
var edges = "/scratch/trh6u/db_csv/WD_entity_prop_entity.csv";

In [3]:
// Property of interest: edges carrying this property ID supply the "type"
// of their subject (the edge's object becomes the subject's type).
const int p_InstanceOf = 31;
// Declared as an alternative choice; not selected in the visible cells.
const int p_SubclassOf = 279;
// The selected ID is also embedded in the names of the output files.
var propertyOfInterest =  p_InstanceOf;
    // currently selected: instance of (31)

In [4]:
// CSV listing the held-out test entities (header row + one entity per row).
var test_path = "/scratch/jag2j/final_data/test1000_10plus.csv";

In [5]:
// IDs of the test entities: the second comma-separated column of every
// row after the header, parsed as a long.
var testIds = new HashSet<long>();
foreach (var row in File.ReadLines(test_path).Skip(1)) // Skip(1): header
{
    var idColumn = row.Split(",")[1];
    testIds.Add(long.Parse(idColumn));
}

In [6]:
testIds.Take(5)

index,value
0,2723552
1,34107551
2,37144419
3,4221945
4,53638750


# Data Model

# Reader functions

In [7]:
// Shared list that is meant to stay empty (nothing is ever added to it).
// NOTE(review): no cell visible in this notebook references it.
var empty = new List<long>();

In [8]:
/// <summary>
/// One row of the edge file: a (subject, property, object) triple of numeric IDs.
/// </summary>
class Edge
{
    /// <summary>Subject ID.</summary>
    public long S;

    /// <summary>Property ID.</summary>
    public long P;

    /// <summary>Object ID.</summary>
    public long O;

    /// <summary>Creates an edge from its subject, property and object IDs.</summary>
    public Edge(long s, long p, long o)
    {
        (S, P, O) = (s, p, o);
    }
}

In [9]:
// Lazily read lines from a reader, run each one through a parse function
// and yield every result that is not null.
//
// You can optionally set a maximum line count. The limit applies to the
// number of lines read, not to the number of results yielded; the default
// of -1 means "no limit".
IEnumerable<T> Read<T>(TextReader tr, Func<string, T> parse, int maxCount = -1) 
{
   var linesRead = 0;
   string line;

   // The limit is tested before the read, so no line is consumed once it is reached.
   while ((maxCount == -1 || linesRead < maxCount) && (line = tr.ReadLine()) != null)
   {
      linesRead++;

      var item = parse(line);
      if (item != null)
      {
         yield return item;
      }
   }
}

In [10]:
// Parse an edge from one line of the edge file.
// The first three comma-separated fields are read as the subject,
// property and object IDs; any further fields are ignored.
// Returns null if the line is null, has fewer than three fields, or any
// of the three IDs is not an integer.
Edge ParseEdge(string line) {
   if(line == null) return null;

   var fields = line.Split(',');
   if(fields.Length < 3) return null;

   // The file is machine-written: parse with the invariant culture so the
   // result does not depend on the locale of the machine running the notebook.
   var style = System.Globalization.NumberStyles.Integer;
   var culture = System.Globalization.CultureInfo.InvariantCulture;

   if(!long.TryParse(fields[0], style, culture, out var subject)) return null;
   if(!long.TryParse(fields[1], style, culture, out var property)) return null;
   if(!long.TryParse(fields[2], style, culture, out var obj)) return null;
   return new Edge(subject, property, obj);
}

In [11]:
// Open a text reader over an existing file.
// The file is opened read-only with FileShare.ReadWrite. Disposing the
// returned reader also closes the underlying file stream.
TextReader GetReader(string path) =>
   new StreamReader(
      new FileStream(path, FileMode.Open, FileAccess.Read, FileShare.ReadWrite));

# Build type map

In [12]:
// testTypes :: test entity ID -> every type ID seen for it (duplicates kept)
// entities  :: training entity ID -> ID of the entity representing its type
//              (i.e., the superentity); 0 when no type edge has been seen.
//              If a subject has several type edges, the last one read wins.
var testTypes = new Dictionary<long,List<long>>();
// Pre-sized for the expected number of subjects so the map does not need to grow.
var entities = new Dictionary<long, long>(500_000_000);

using(var r = GetReader(edges)) {
    foreach(var res in Read(r, ParseEdge))
    {
        var isTypeEdge = res.P == propertyOfInterest;

        if(testIds.Contains(res.S))
        {
            // capture test type
            if(isTypeEdge)
            {
                if(!testTypes.TryGetValue(res.S, out var knownTypes)) {
                    knownTypes = new List<long>();
                    testTypes[res.S] = knownTypes;
                }
                knownTypes.Add(res.O);
            }

            // test subjects never enter the training data
            continue;
        }

        // NOTE(review): this compares a property ID with the test *entity*
        // IDs; it may have been meant to test res.O. It is left as-is
        // because the type/prop counting pass applies the same filter and
        // looks up entities[res.S] for every edge that passes it.
        if(testIds.Contains(res.P))
        {
            continue;
        }

        if(isTypeEdge) {
            entities[res.S] = res.O;
        }
        else {
            // first sighting of a subject without a type yet: default to 0
            entities.TryAdd(res.S, 0);
        }
    }
}

In [13]:
entities.Take(10)

index,Key,Value
0,31,3624078
1,8,9415
2,23,5
3,24,15773317
4,42,5
5,1868,5
6,2013,15633582
7,45,20181813
8,51,82794
9,58,0


In [14]:
testTypes.Take(5)

index,Key,Value
0,7795,"[ 484652, 431603, 245065 ]"
1,43789,[ 5 ]
2,89050,[ 5 ]
3,290808,[ 5 ]
4,340435,[ 5 ]


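Let's make sure a test ID is skipped: it should not appear anywhere in the training entities (the first query below should return nothing), but it should appear in the test types.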

In [15]:
var sampleTestId = testIds.FirstOrDefault();    
sampleTestId

In [16]:
entities.Where(e => e.Key == sampleTestId || e.Value == sampleTestId)

In [17]:
testTypes.Where(e=>e.Key == sampleTestId || e.Value.Contains(sampleTestId))

index,Key,Value
0,2723552,[ 5 ]


In [18]:
// Write the test types as CSV: a header, then one "entity,type" row for
// every distinct type recorded for each test entity.
using(var stream = new FileStream($"/scratch/jag2j/final_data/{propertyOfInterest}-test-types_10plus.csv", FileMode.Create)) 
using(var writer = new StreamWriter(stream) { AutoFlush = true })
{
    writer.WriteLine("entity,type");

    var rows = testTypes.SelectMany(
        pair => pair.Value.Distinct(),
        (pair, type) => $"{pair.Key},{type}");

    foreach(var row in rows)
    {
        writer.WriteLine(row);
    }
}

In [19]:
// Write the training type map as CSV: a header, then one "entity,type"
// row per entry of `entities`.
using(var stream = new FileStream($"/scratch/jag2j/final_data/{propertyOfInterest}-train-types_10plus.csv", FileMode.Create)) 
using(var writer = new StreamWriter(stream) { AutoFlush = true })
{
    writer.WriteLine("entity,type");
    foreach(var pair in entities)
    {
        writer.WriteLine($"{pair.Key},{pair.Value}");
    }
}

# Build type/prop count

In [20]:
// set { (subject, prop) } of the pairs that have already been counted
var seen = new HashSet<Tuple<long, long>>();

// (type, prop) -> number of distinct subjects of that type having the prop (= sp_mat)
var typeprops = new Dictionary<Tuple<long, long>, long>();

using(var r = GetReader(edges)) {
    foreach(var res in Read(r, ParseEdge)) {
        if(testIds.Contains(res.S) || testIds.Contains(res.P)) continue; // skip test data

        // Count each property at most once per subject. Checking this
        // first avoids the type lookup and key allocation for repeats.
        if(!seen.Add(Tuple.Create(res.S, res.P))) {
            continue;
        }

        // The filter above matches the one used to build `entities`, so the
        // subject is expected to be present; a missing key throws.
        var key = Tuple.Create(entities[res.S], res.P);

        // cnt is left at 0 when the key is new
        typeprops.TryGetValue(key, out var cnt);
        typeprops[key] = cnt + 1;
    }
}

// permit seen memory to be freed
seen = null;

In [21]:
typeprops.Take(10)

index,Key,Value
0,"( 3624078, 1344 )",8
1,"( 3624078, 1151 )",39
2,"( 3624078, 1546 )",8
3,"( 3624078, 5125 )",39
4,"( 3624078, 38 )",39
5,"( 3624078, 1792 )",39
6,"( 3624078, 2852 )",39
7,"( 3624078, 2853 )",38
8,"( 3624078, 2633 )",39
9,"( 3624078, 1313 )",39


In [22]:
// Write the (type, property) counts as CSV: a header, then one
// "type,property,count" row per entry of `typeprops`.
using(var stream = new FileStream($"/scratch/jag2j/final_data/aux/{propertyOfInterest}_type-property-count_10plus.csv", FileMode.Create)) 
using(var writer = new StreamWriter(stream) { AutoFlush = true })
{
    writer.WriteLine("type,property,count");
    foreach(var entry in typeprops)
    {
        var typeAndProp = entry.Key;
        writer.WriteLine($"{typeAndProp.Item1},{typeAndProp.Item2},{entry.Value}");
    }
}

# Compute totals and frequency matrix

In [23]:
// type -> # of entities with the type (= se_sum)
// Counted in a single pass: grouping the whole entity map first would
// buffer every entry just to count it.
var totals = new Dictionary<long, int>();
foreach(var kv in entities) {
    // n is left at 0 the first time a type is seen
    totals.TryGetValue(kv.Value, out var n);
    totals[kv.Value] = n + 1;
}

// Write the totals as CSV: a header, then one "type,count" row per type.
using(var f = new FileStream($"/scratch/jag2j/final_data/aux/{propertyOfInterest}_se_sum_10plus.csv", FileMode.Create)) 
using(var tw = new StreamWriter(f))
{
    tw.AutoFlush = true;
    tw.WriteLine("type,count");
    foreach(var kv in totals) {
        tw.WriteLine($"{kv.Key},{kv.Value}");
    }
}

In [24]:
// (type, prop, freq) rows, where freq = (type, prop) count / # of entities with the type
var freqList = new List<(long type, long prop, float freq)>(typeprops.Count);

// Write the same rows as CSV: a header, then one "type,property,frequency" row each.
using(var f = new FileStream($"/scratch/jag2j/final_data/{propertyOfInterest}-train_freq_matrix_10plus.csv", FileMode.Create)) 
using(var tw = new StreamWriter(f))
{
    tw.AutoFlush = true;
    tw.WriteLine("type,property,frequency");
    foreach(var kv in typeprops) {
        var type = kv.Key.Item1;
        var prop = kv.Key.Item2;

        // Divide in double and round to float once: converting a large
        // count to float before dividing would lose precision.
        double ratio = kv.Value;
        if(totals.TryGetValue(type, out var typeSubjectCount) && typeSubjectCount > 0) {
            ratio /= typeSubjectCount;
        }
        // If the type has no (positive) total, the raw count is kept unchanged.
        var freq = (float)ratio;

        freqList.Add((type, prop, freq));

        // Format with the invariant culture so the decimal separator is
        // always '.', whatever the locale; a ',' would break the CSV.
        tw.WriteLine(FormattableString.Invariant($"{type},{prop},{freq}"));
    }
}

In [30]:
freqList

index,Item1,Item2,Item3
0,3624078,1344,0.17391305
1,3624078,1151,0.84782606
2,3624078,1546,0.17391305
3,3624078,5125,0.84782606
4,3624078,38,0.84782606
5,3624078,1792,0.84782606
6,3624078,2852,0.84782606
7,3624078,2853,0.82608694
8,3624078,2633,0.84782606
9,3624078,1313,0.84782606


In [27]:
freqList.Count()

In [29]:
typeprops.Count()