In [None]:
using Pkg
Pkg.activate(".")

In [2]:
using SimilaritySearch, SimSearchManifoldLearning, TextSearch, CSV, DataFrames, LinearAlgebra, CategoricalArrays, JLD2, HypertextLiteral

In [3]:
"""
    text_model(corpus, labels=nothing; config=nothing, mindocs=10)

Tokenize `corpus` and fit a `VectorModel` on its bag-of-words representation.

When `labels` is given, the model is built with `EntropyWeighting` as global weighting
and `labels` are passed to the constructor; otherwise `BinaryGlobalWeighting` is used.
Local weighting is `BinaryLocalWeighting` in both cases. If `config` is `nothing`, the
default `TextConfig` spelled out in the body is used.

Returns the named tuple `(tok, model, vectors)`, where `vectors` is the result of
`vectorize_corpus` applied to `corpus` with the fitted model.
"""
function text_model(corpus, labels=nothing; config=nothing, mindocs=10)
    cfg = if isnothing(config)
        TextConfig(;
            group_usr=true,
            group_url=true,
            del_diac=true,
            lc=true,
            group_num=true,
            nlist=[1, 2],
            qlist=[4],
        )
    else
        config
    end

    tok = Tokenizer(cfg)
    # Both model variants are fitted on the same bag-of-words corpus.
    bow = compute_bow_corpus(tok, corpus)
    model = if isnothing(labels)
        VectorModel(BinaryGlobalWeighting(), BinaryLocalWeighting(), bow; mindocs)
    else
        VectorModel(EntropyWeighting(), BinaryLocalWeighting(), bow, labels; mindocs)
    end
    vectors = vectorize_corpus(tok, model, corpus)

    return (; tok, model, vectors)
end

text_model (generic function with 2 methods)

In [4]:
"""
    infotxt(D, i, k=10)

Build an HTML fragment describing row `i` of `D`: a metadata header followed by up to
`k` messages taken from `D.tweet[i]`.

At most `length(D.tweet[i])` messages are rendered, so requesting more messages than
the row holds does not throw. The metadata columns `gender`, `profession`,
`ideology_binary` and `ideology_multiclass` are optional: a column that `D` does not
have is rendered as `n/a`, which allows tables that only carry `tweet` (plus ids) to be
inspected as well.
"""
function infotxt(D, i, k=10)
    tweets = D.tweet[i]
    # `first(itr, n)` returns fewer than n elements when the collection is shorter.
    items = map(first(tweets, max(k, 0))) do t
        @htl """<div style="margin: 0.5em;">- $(t)</div>"""
    end

    # Value of column `name` at row `i`, or a placeholder when the column is absent.
    meta(name) = hasproperty(D, name) ? getproperty(D, name)[i] : "n/a"

    return @htl """
    <div>
    <h3>metadata</h3>
    
    gender: <b>$(meta(:gender))</b>,
    profession: <b>$(meta(:profession))</b>, 
    ideology bin: <b>$(meta(:ideology_binary))</b>, 
    ideology multi: <b>$(meta(:ideology_multiclass))</b>
    </div>
    <h3>messages</h3>
    $items
    """
end

infotxt (generic function with 2 methods)

In [5]:
"""
    get_data_collection(filename)

Read the CSV file `filename` and collapse it to one row per distinct `label`.

Each output row keeps the first `Column1`, `label`, `gender`, `profession`,
`ideology_binary` and `ideology_multiclass` value of its group and stores all of the
group's `tweet` values as a vector in the `tweet` column.
"""
function get_data_collection(filename)
    messages = CSV.read(filename, DataFrame)

    # Users as collections of messages: one row per label.
    users = DataFrame(;
        Column1=Int[],
        label=String[],
        gender=String[],
        profession=String[],
        ideology_binary=String[],
        ideology_multiclass=String[],
        tweet=Vector{String}[],
    )
    for grp in groupby(messages, :label)
        head = first(grp)  # first row of the group supplies the per-user fields
        push!(
            users,
            (
                head.Column1,
                head.label,
                head.gender,
                head.profession,
                head.ideology_binary,
                head.ideology_multiclass,
                grp.tweet,
            ),
        )
    end

    return users
end

get_data_collection (generic function with 1 method)

In [6]:
"""
    get_testdata_collection(filename)

Read the CSV file `filename` and collapse it to one row per distinct `label`.

Each output row keeps the first `Column1` and `label` value of its group and stores all
of the group's `tweet` values as a vector in the `tweet` column.
"""
function get_testdata_collection(filename)
    messages = CSV.read(filename, DataFrame)

    # Users as collections of messages: one row per label.
    users = DataFrame(; Column1=Int[], label=String[], tweet=Vector{String}[])
    for grp in groupby(messages, :label)
        head = first(grp)  # first row of the group supplies the per-user fields
        push!(users, (head.Column1, head.label, grp.tweet))
    end

    return users
end

get_testdata_collection (generic function with 1 method)

In [7]:
"""
    build_umodel(df, ds, class=nothing)

Fit a text model on the corpus `ds` and a UMAP model on the resulting vectors.

If `class` is given, column `Symbol(class)` of `df` is converted to a categorical vector
and passed to `text_model` as labels; otherwise `text_model` is called without labels.
The vectors are wrapped in an `ExhaustiveSearch` index using
`NormalizedCosineDistance`, and `UMAP` is fitted on that index with the parameters
listed in the body.

Returns the named tuple `(umodel, text_model)`.
"""
function build_umodel(df, ds, class=nothing)
    tmodel = if isnothing(class)
        text_model(ds)
    else
        labels = categorical(df[:, Symbol(class)])
        text_model(ds, labels)
    end

    db = VectorDatabase(tmodel.vectors)
    index = ExhaustiveSearch(; db, dist=NormalizedCosineDistance())
    umodel = fit(
        UMAP,
        index;
        k=30,
        n_epochs=100,
        layout=SpectralLayout(),
        neg_sample_rate=3,
        maxoutdim=3,
    )

    return (; umodel, text_model=tmodel)
end

build_umodel (generic function with 2 methods)

In [8]:
"""
    build_model_projected(tmodel, umodel, ds)

Vectorize the corpus `ds` with the tokenizer and model stored in `tmodel`, then project
the vectors with `predict(umodel, ...)`.

Returns the named tuple `(vectors, emb)`.
"""
function build_model_projected(tmodel, umodel, ds)
    vectors = vectorize_corpus(tmodel.tok, tmodel.model, ds)
    return (; vectors, emb=predict(umodel, vectors))
end

build_model_projected (generic function with 1 method)

In [None]:
# Training data: read the training CSV and group it into one row per label
# (see get_data_collection).
# NOTE(review): hard-coded absolute Windows path; adjust for the local machine.
finame_train = "C:\\training.csv"
df_train = get_data_collection(finame_train)

In [None]:
# Test data: read the test CSV and group it into one row per label
# (see get_testdata_collection; the result only has Column1, label and tweet).
# NOTE(review): hard-coded absolute Windows path; adjust for the local machine.
finame_test =  "C:\\df_test.csv"
df_test = get_testdata_collection(finame_test)

In [None]:
# Build the text model and the UMAP model from the training data, using one label
# column of df_train as supervision for the text model.
# Candidate label columns: "gender", "profession", "ideology_binary", "ideology_multiclass"
# The binding is cleared first; presumably so that a model left over from an earlier
# run of this cell can be garbage-collected before the new one is built — TODO confirm.
omodel_train = nothing
class = :ideology_multiclass
omodel_train = build_umodel(df_train, df_train.tweet, class )

In [None]:
# Project the training messages with the text/UMAP models fitted on the training data.
# proj_model is a named tuple (vectors, emb); see build_model_projected.
proj_model = nothing
proj_model = build_model_projected(omodel_train.text_model, omodel_train.umodel, df_train.tweet)


In [None]:
# Project the test messages with the text/UMAP models fitted on the training data.
# NOTE(review): this rebinds the same global `proj_model` as the training projection
# cell, so whichever of the two cells ran last determines what later cells see.
proj_model = nothing
proj_model = build_model_projected(omodel_train.text_model, omodel_train.umodel,df_test.tweet)

In [None]:
# Subset of the training table containing only the four label columns (copied).
df_train_c = df_train[:, [:gender, :profession, :ideology_binary, :ideology_multiclass]]

In [None]:
# Subset of the test table containing only the row id and the label (copied).
df_test_c = df_test[:,[:Column1, :label]]

In [None]:
# Horizontally concatenate the transposed per-label embeddings into one matrix for the
# training set and one for the test set.
# NOTE(review): emb_train_gen, emb_train_prof, emb_train_ideob, emb_train_ideom and their
# emb_test_* counterparts are not assigned in any cell visible here; presumably each one
# holds `proj_model.emb` from a run with the corresponding `class` — confirm before
# re-running the notebook top to bottom.
emb_train = hcat(emb_train_gen', emb_train_prof', emb_train_ideob', emb_train_ideom')
emb_test = hcat(emb_test_gen', emb_test_prof', emb_test_ideob', emb_test_ideom')

In [21]:
using NPZ
# Save the concatenated embeddings as .npy files.
# NOTE(review): hard-coded absolute Windows paths; the target directory must exist.
npzwrite("C:\\data\\emb_train.npy", emb_train)
npzwrite("C:\\data\\emb_test.npy", emb_test)

In [None]:
# Save the training label columns and the grouped test table as CSV.
# NOTE(review): df_test has a `tweet` column whose cells are vectors of strings (see
# get_testdata_collection); presumably each cell is written as its printed
# representation — confirm that this is the intended on-disk format.
CSV.write("C:\\data\\df_class_train.csv",df_train_c)
CSV.write("C:\\data\\df_test.csv",df_test)

### Charts

In [28]:
using WebIO
using PlotlyJS

In [None]:
# Table shown in the chart. This binds D to the same object as df_test (no copy), so
# columns assigned to D in the chart cell are assigned to df_test as well.
D=df_test

In [None]:
# Two dimensions: interactive scatter plot of the first two embedding coordinates,
# with a click handler that prints details about the clicked point.

# Rows 1 and 2 of the embedding become the X/Y columns of D.
# (build_umodel fits UMAP with maxoutdim = 3; only two coordinates are plotted here.)
D[:, :X] = proj_model.emb[1, :]
D[:, :Y] = proj_model.emb[2, :]


# Column used to color the points: the label column chosen when the model was built.
# NOTE(review): D must contain this column. The table built by get_testdata_collection
# only has Column1, label and tweet, so confirm which table D is bound to.
label_colors = class
# NOTE(review): L is not used anywhere else in this cell.
L = categorical(D[:, label_colors])

P = plot(
    D, x=:X, y=:Y, mode="markers", color=label_colors
    #,Layout(title="$label_colors")
    ,marker=attr(size=6, line=attr(width=0.5, color="DarkSlateGrey"))
    ,dpi=100
)

# Click handler: redraw the plot, then for every clicked point show its highest-weighted
# tokens followed by the metadata/messages of the corresponding row of D.
on(P["click"]) do data
    IJulia.clear_output()
    display(P)
    points = get(data, "points", ())
    
    for p in points
        # pointIndex is presumably zero-based, hence the +1.
        # NOTE(review): because the plot is split by `color`, pointIndex may be relative
        # to the clicked trace rather than to the rows of D — confirm.
        i = p["pointIndex"] + 1
        k = 100
        res = KnnResult(k)
        # Keys and values of the i-th vector; the keys are decoded as tokens below.
        tokens = collect(keys(proj_model.vectors[i]))
        weights = collect(values(proj_model.vectors[i]))
        
        # Weights are pushed negated and negated back when read; presumably KnnResult
        # keeps the k smallest values, so this selects the k largest weights.
        for (j, w) in enumerate(weights)
            push!(res, j, -w)
        end
        
        S = []
        for (id, w) in res
            w = -w
            token = decode(omodel_train.text_model.tok, tokens[id])
            # Make whitespace visible: a leading tab plus one character becomes
            # "@<char>~", any other whitespace character becomes "~".
            token = replace(token, r"^\t(.)" => s"@\1~", r"\s" => "~")
            push!(S, @htl """<span style="margin: 0.5em;"><b>'$(token)'</b>:&nbsp;$(round(w, digits=3))</span> """)
        end
        
        display(@htl """
        <h3>most characteristic $k tokens from $(length(tokens)) available (vector $i)</h3>
        $S
        """)
        display(infotxt(D, i, 30))
    end
end

# NOTE(review): savefig is the last expression of the cell, so the bare `P` below is
# presumably not what the notebook displays — confirm.
P
# Export the figure as PDF (hard-coded absolute Windows path).
savefig(P,"C:\\Test_ideom_2D.pdf")