In [1]:
# initialize the Julia file naive-bow
using Pkg
Pkg.add("Knet")
Pkg.add("Random")
using Random

[32m[1m  Updating[22m[39m registry at `~/.julia/registries/General`
[32m[1m  Updating[22m[39m git-repo `https://github.com/JuliaRegistries/General.git`
[?25l[2K[?25h[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m
[32m[1m Resolving[22m[39m package versions...
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Project.toml`
[90m [no changes][39m
[32m[1m  Updating[22m[39m `~/.julia/environments/v1.2/Manifest.toml`
[90m [no changes][39m


In [2]:
# Every review is truncated or padded (with UNKNOWN) to exactly this many tokens
# in `derive_sents`. Author's note: increasing it raises accuracy in most cases.
SENTENCE_SIZE = 250
# Number of samples read from each split by `get_freq_dicts` and `predall`;
# `get_freq_dicts` treats the first half as positive and the second half as negative.
TR_TS_SIZE = 25000
# Padding token appended to short reviews; it is never counted in the per-class tables.
UNKNOWN = "<unk>"
# A word present in both classes is dropped from the model once its total
# occurrence count over the training samples reaches WORD_THRESHOLD * TR_TS_SIZE.
WORD_THRESHOLD = 0.425

# A word present in both classes is dropped when the gap between its two class
# scores is below this fraction (0.05 = 5%) of the smaller score.
EFFECTIVE_RATIO = 0.05
# Per-word prior factor used by `score_word`. The 1.0 here is only a placeholder:
# `get_freq_dicts` overwrites it with (total_neg + total) / (total_pos + total).
POS_NEG_RATIO = 1.0

# Replace every character that is not an ASCII letter with a space.
cleanStr = (s) -> replace(s, r"[^A-Za-z]" => " ")
# change this directory wrt test environment
#dir = "/home/minuteman/academics/'19 Fall/NLP/Project-Repo/NLP-Projects/aclImdb_v1/aclImdb/"

# Root of the extracted IMDB dataset; the later cells read `train/` and `test/` under it.
datadir = "aclImdb_v1/aclImdb"

if !isdir(datadir)
    download(
        "https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz",
        "aclImdb_v1.tar.gz",
    )
    # The archive unpacks to a top-level `aclImdb/` folder. Extracting it in the
    # working directory would therefore never create `datadir`, so unpack it
    # inside the parent folder of `datadir` instead.
    mkpath(dirname(datadir))
    run(`tar xzf aclImdb_v1.tar.gz -C $(dirname(datadir))`)
end


# Print the greeting, a header line, and the run configuration as a named tuple,
# one item per line.
foreach(
    println,
    (
        "---Welcome---",
        "|- The constants of the program as follows -|\n",
        (Train_Test_Size=TR_TS_SIZE,Unknown_Tag=UNKNOWN,Sentence_Size=SENTENCE_SIZE,Word_Commonality_Threshold=WORD_THRESHOLD),
    ),
)

---Welcome---
|- The constants of the program as follows -|

(Train_Test_Size = 25000, Unknown_Tag = "<unk>", Sentence_Size = 250, Word_Commonality_Threshold = 0.425)


In [3]:
"""
    readandprep(dir)

Load one dataset split rooted at `dir`.

The samples found under `dir/pos` come first, followed by those under `dir/neg`;
each sample is whatever `derive_sents` produces for a single line of a review file.
"""
function readandprep(dir)
    positives = derive_sents(dir * "/pos")
    negatives = derive_sents(dir * "/neg")
    # `append!` extends `positives` in place and returns it.
    return append!(positives, negatives)
end

readandprep (generic function with 1 method)

In [4]:
"""
    _review_tag(file_name)

Return the text between the first `_` and the first `.` of `file_name`
(for `"123_8.txt"` this is `"8"`).

Throws an `ArgumentError` when either delimiter is missing.
"""
function _review_tag(file_name)
    underscore = findfirst(isequal('_'), file_name)
    dot = findfirst(isequal('.'), file_name)
    if underscore === nothing || dot === nothing
        throw(ArgumentError(
            "expected a file name of the form <id>_<rating>.<ext>, got \"$file_name\"",
        ))
    end
    # nextind/prevind keep the slice valid even if the name holds multi-byte characters.
    return file_name[nextind(file_name, underscore):prevind(file_name, dot)]
end

"""
    derive_sents(dir; sentence_size=SENTENCE_SIZE, unknown=UNKNOWN, clean=cleanStr)

Read every file in `dir` and turn each of its lines into a fixed-length token list.

Each line is lower-cased and split on whitespace. Every piece is passed through
`clean`, and only the first whitespace-separated fragment of the cleaned piece is
kept (pieces that clean to nothing are skipped). The token list is then cut down to
`sentence_size` entries, or padded with `unknown` until it has that many.

# Keywords
- `sentence_size`: exact number of tokens per returned sample.
- `unknown`: padding token.
- `clean`: function mapping a raw piece of text to its cleaned form.

# Returns
A `Vector` of `(tokens, tag)` tuples, where `tag` is the part of the file name
between the first `_` and the first `.`.

# Throws
`ArgumentError` if a file name in `dir` lacks `_` or `.`.
"""
function derive_sents(dir; sentence_size=SENTENCE_SIZE, unknown=UNKNOWN, clean=cleanStr)
    sentences = Tuple{Vector{String},String}[]

    for file_name in readdir(dir)
        # The tag depends only on the file name, so compute it once per file.
        tag_id = String(_review_tag(file_name))

        for line in eachline(joinpath(dir, file_name))
            tokens = String[]
            for word in split(lowercase(line))
                fragments = split(clean(word))
                isempty(fragments) && continue
                # NOTE(review): only the first fragment is kept, so "don't" yields
                # just "don". Kept as-is because changing it alters the model.
                push!(tokens, first(fragments))
            end

            if length(tokens) > sentence_size
                resize!(tokens, sentence_size)
            else
                append!(tokens, fill(unknown, sentence_size - length(tokens)))
            end
            push!(sentences, (tokens, tag_id))
        end
    end

    return sentences
end

derive_sents (generic function with 1 method)

In [5]:
# Load both dataset splits, timing each pass with `@time`.
print("\nPreparation for training data ->")
# `readandprep` lists the samples from `pos` first, then those from `neg`.
@time train_word_tag = readandprep(datadir * "/train") # sentences and tags stored here
print("Preparation for test data ->")
# The trailing `;` keeps the notebook from echoing the result.
@time test_word_tag = readandprep(datadir * "/test");


Preparation for training data ->  9.322294 seconds (85.52 M allocations: 3.708 GiB, 47.02% gc time)
Preparation for test data -> 11.544867 seconds (83.26 M allocations: 3.612 GiB, 59.27% gc time)


In [6]:
"""
    get_freq_dicts(sentences; total_size=TR_TS_SIZE, unknown=UNKNOWN,
                   effective_ratio=EFFECTIVE_RATIO, word_threshold=WORD_THRESHOLD)

Build the rating-weighted word tables for the positive and negative class.

The first `total_size` entries of `sentences` are used; each entry is a
`(words, tag)` pair whose `tag` parses as an integer rating. The first
`div(total_size, 2)` entries are counted as positive, the rest as negative.
Every occurrence of a word adds `ceil(abs(5.5 - rating))` to its class table,
so more extreme ratings weigh more. `unknown` tokens are not added to the class
tables.

Words present in both tables are then pruned when either
- their two class scores differ by less than `effective_ratio` times the smaller
  score (collected in `del_words`), or
- their total occurrence count is at least `word_threshold * total_size`
  (collected in `freq_words`).

Side effect: overwrites the global `POS_NEG_RATIO` with
`(total_neg + total) / (total_pos + total)`, computed from the pruned tables.

# Returns
`((pos_freq, neg_freq), del_words, freq_words)`.

# Throws
`ArgumentError` if `sentences` holds fewer than `total_size` entries.
"""
function get_freq_dicts(
    sentences;
    total_size=TR_TS_SIZE,
    unknown=UNKNOWN,
    effective_ratio=EFFECTIVE_RATIO,
    word_threshold=WORD_THRESHOLD
)
    if length(sentences) < total_size
        throw(ArgumentError(
            "expected at least $total_size samples, got $(length(sentences))",
        ))
    end

    pos_freq = Dict{String,Float64}()
    neg_freq = Dict{String,Float64}()
    vocab_freq = Dict{String,Float64}() # raw occurrence counts, used to spot too-frequent words

    # Positive samples are at the front, so the class follows from the position.
    half = div(total_size, 2)

    for i in 1:total_size
        words, tag = sentences[i]
        class_freq = i <= half ? pos_freq : neg_freq
        tag_weight = ceil(abs(5.5 - parse(Int64, tag))) # severity of the comment

        for word in words
            vocab_freq[word] = get(vocab_freq, word, 0.0) + 1
            word == unknown && continue
            class_freq[word] = get(class_freq, word, 0.0) + tag_weight
        end
    end

    del_words = String[]
    freq_words = String[]

    # Snapshot the shared words first: the tables are modified inside the loop,
    # and a Dict should not be changed while it is being iterated.
    shared_words = [word for word in keys(pos_freq) if haskey(neg_freq, word)]

    for word in shared_words
        pos_val = pos_freq[word]
        neg_val = neg_freq[word]

        # Nearly equal scores in both classes carry no signal.
        uninformative = abs(pos_val - neg_val) < min(pos_val, neg_val) * effective_ratio
        too_frequent = vocab_freq[word] >= word_threshold * total_size

        uninformative && push!(del_words, word)
        too_frequent && push!(freq_words, word)

        if uninformative || too_frequent
            delete!(pos_freq, word)
            delete!(neg_freq, word)
        end
    end

    # The total weight of each class shifts the per-word prior used when scoring.
    total_pos = sum(values(pos_freq))
    total_neg = sum(values(neg_freq))
    total = total_pos + total_neg + 1 # laplace denominator

    global POS_NEG_RATIO = (total_neg + total) / (total_pos + total)

    return (pos_freq, neg_freq), del_words, freq_words
end

get_freq_dicts (generic function with 1 method)

In [7]:
"""
    delete_from_both(pos_freq, neg_freq, word)

Remove `word` from both frequency tables (a missing key is ignored by `delete!`)
and return `neg_freq`.
"""
function delete_from_both(pos_freq, neg_freq, word)
    for table in (pos_freq, neg_freq)
        delete!(table, word)
    end
    return neg_freq
end

delete_from_both (generic function with 1 method)

In [8]:
# Build the per-class word tables from the training split, timing the pass.
# `freq_dicts` is the (positive, negative) pair of tables; `del_words` and
# `freq_words` list the words pruned for near-equal scores and for being too
# frequent. The call also overwrites the global POS_NEG_RATIO.
println("\nPreparation for vocabulary ->")
@time freq_dicts,del_words,freq_words = get_freq_dicts(train_word_tag);


Preparation for vocabulary ->
 10.450767 seconds (105.82 M allocations: 3.285 GiB, 24.73% gc time)


In [9]:
"""
    score_a_sent(sentence, freq; exact=false)

Combine the `score_word` ratios of every word in `sentence`.

The ratios are accumulated as a sum of logarithms rather than a running
product: a long product of ratios can overflow to `Inf` or underflow to `0.0`,
after which the remaining words can no longer change the outcome.

# Arguments
- `sentence`: iterable of words.
- `freq`: the frequency tables, passed through to `score_word`.

# Keywords
- `exact`: when `true`, return the combined positive/negative ratio itself
  instead of the decision.

# Returns
`true` when the combined ratio is at least 1 (positive), `false` otherwise
(negative); or the ratio as a `Float64` when `exact` is `true`.
"""
function score_a_sent(sentence,freq;exact=false)
    log_ratio = 0.0

    for word in sentence
        log_ratio += log(score_word(word,freq))
    end

    exact && return exp(log_ratio)
    return log_ratio >= 0 # ratio >= 1: true for positive, false for negative
end

score_a_sent (generic function with 1 method)

In [10]:
"""
    score_word(word, freq)

Return the positive-to-negative score ratio of `word`.

`freq` unpacks into the positive and the negative table. Each lookup falls back
to 0 for an unseen word and is incremented by 1 (Laplace smoothing); the
quotient is scaled by the global `POS_NEG_RATIO`.
"""
function score_word(word,freq)
    positives, negatives = freq

    # Laplace smoothing: unseen words count as 1 instead of 0.
    smoothed(counts) = get(counts, word, 0) + 1

    return POS_NEG_RATIO * smoothed(positives) / smoothed(negatives)
end

score_word (generic function with 1 method)

In [11]:
"""
    predall(comment_tag_set, freq)

Classify every `(sentence, tag)` sample in `comment_tag_set` and compare the
prediction of `score_a_sent` with the label given by `tagclassifier`.

The whole collection is evaluated, whatever its length, so the accuracy is
always relative to the samples actually supplied.

# Returns
`(accuracy, store)`: the fraction of correctly classified samples and the
misclassified samples in their original order. For an empty collection the
accuracy is `NaN`.
"""
function predall(comment_tag_set,freq)
    store = eltype(comment_tag_set)[]
    correct = 0

    for sample in comment_tag_set
        sentence, tag = sample
        if score_a_sent(sentence,freq) == tagclassifier(tag)
            correct += 1
        else
            push!(store,sample)
        end
    end

    return correct / length(comment_tag_set),store
end

"""
    tagclassifier(tag)

Parse the rating string `tag` as an integer and report whether it denotes a
positive comment, i.e. a rating above 5.
"""
function tagclassifier(tag)
    rating = parse(Int64, tag)
    return 5 < rating
end

tagclassifier (generic function with 1 method)

In [12]:
# Score both splits with the tables built from the training data.
# Each call yields the accuracy and the list of misclassified samples.
println("\nPrediction for train->")
@time acc_trn,wrongs_trn = predall(train_word_tag,freq_dicts)

println("Prediction for test->")
@time acc_tst,wrongs_tst = predall(test_word_tag,freq_dicts)


# Last expression of the cell: shown as (train accuracy, test accuracy).
acc_trn,acc_tst


Prediction for train->
  2.309965 seconds (36.03 M allocations: 938.896 MiB, 4.82% gc time)
Prediction for test->
  2.091269 seconds (35.54 M allocations: 919.936 MiB, 4.72% gc time)


(0.92528, 0.82516)