## Loading Libraries

In [None]:
require(tm)
require(SnowballC)
require(RWeka)

## Read datasets

In [None]:
# Parse a raw document file into a data frame with one row per document.
#
# Lines are classified as follows:
#   * a line starting with the keyword "TEXT" (followed by a space or the end
#     of the line) is a document body;
#   * any other line containing "ID tr_doc" or "ID te_doc" is an ID line;
#   * everything else (blank lines, separators, ...) is ignored.
#
# Args:
#   file_name.txt: path or connection handed to readLines().
#
# Returns:
#   A data.frame with columns `doc_id` (the second space-separated token of
#   each ID line) and `text` (character; the TEXT line with the leading
#   keyword and following whitespace removed). An input without any
#   documents yields a zero-row frame with both columns.
#
# Raises:
#   An error if the number of ID lines and TEXT lines differ, since ids and
#   texts are paired purely by position.
read_data <- function(file_name.txt) {
    lines <- readLines(file_name.txt)

    # TEXT lines take priority: a document body that happens to mention
    # "ID tr_doc..." must not be mistaken for an ID line.
    is_text <- grepl("^TEXT( |$)", lines)
    is_id <- !is_text & grepl("ID tr_doc|ID te_doc", lines)

    ids <- vapply(
        strsplit(lines[is_id], " ", fixed = TRUE),
        function(parts) parts[2],
        character(1)
    )
    # A bare "TEXT" line becomes "" rather than being dropped, so that ids and
    # texts stay aligned.
    texts <- sub("^TEXT\\s*", "", lines[is_text])

    if (length(ids) != length(texts)) {
        stop(
            "Found ", length(ids), " ID lines but ", length(texts),
            " TEXT lines; cannot pair ids with texts.",
            call. = FALSE
        )
    }

    docs <- data.frame("doc_id" = ids, "text" = texts)
    docs$text <- as.character(docs$text)
    docs
}

In [None]:
# Turn a data frame of documents into a TF-IDF document-term table.
#
# Args:
#   docs: data.frame handed to DataframeSource(). Its `doc_id` column is read
#     here to label the output rows; the document text is taken from it by
#     tm (recent tm versions expect the columns `doc_id` and `text`).
#   ngram_min, ngram_max: minimum and maximum n-gram length passed to
#     NGramTokenizer() through Weka_control().
#   min_doc_frac, max_doc_frac: a term is kept only if the number of documents
#     containing it lies between these fractions of the corpus size. The
#     defaults reproduce the previously hard-coded 5% / 95% bounds.
#
# Returns:
#   A data.frame with one row per document, one column per retained term
#   (TF-IDF weight) and a trailing `Doc_id` column holding the document id.
pre_processing <- function(docs, ngram_min = 1, ngram_max = 1,
                           min_doc_frac = 0.05, max_doc_frac = 0.95) {
    ids <- as.character(docs$doc_id)

    # Build a VCorpus explicitly. Corpus() may return a SimpleCorpus, for
    # which tm documents that custom functions such as the n-gram tokenizer
    # below are ignored.
    corp <- VCorpus(DataframeSource(docs))
    ndocs <- length(corp)
    minDocFreq <- ndocs * min_doc_frac
    maxDocFreq <- ndocs * max_doc_frac

    # Preprocessing. tolower() is a plain character function, so it is wrapped
    # in content_transformer() to keep every element a text document; this
    # removes the need for a trailing tm_map(corp, PlainTextDocument) repair.
    corp <- tm_map(corp, content_transformer(tolower)) # lower-case all words
    corp <- tm_map(corp, removeWords, stopwords("english")) # drop stop words
    corp <- tm_map(corp, removePunctuation) # drop punctuation
    corp <- tm_map(corp, stemDocument) # reduce words to their stem
    corp <- tm_map(corp, removeNumbers) # drop digits
    corp <- tm_map(corp, stripWhitespace) # collapse redundant spaces

    ngram_tokenizer <- function(x) {
        NGramTokenizer(x, Weka_control(min = ngram_min, max = ngram_max))
    }

    # Rows are documents, columns are terms; terms outside the document
    # frequency bounds are discarded.
    dtm <- DocumentTermMatrix(
        corp,
        control = list(
            weighting = weightTfIdf,
            tokenize = ngram_tokenizer,
            bounds = list(global = c(minDocFreq, maxDocFreq))
        )
    )

    # Dense conversion; label the rows with the original document ids.
    word.doc.mat <- as.matrix(dtm)
    rownames(word.doc.mat) <- ids
    word.doc.mat <- as.data.frame(word.doc.mat)
    word.doc.mat["Doc_id"] <- rownames(word.doc.mat)

    word.doc.mat
}

In [None]:
# Read the training labels as raw lines (they are split into columns in a
# later cell) and parse both document files into doc_id/text data frames.
docs_train_labels <- readLines("training_labels_final.txt")
docs_test <- read_data("testing_docs.txt")
docs_train <- read_data("training_docs.txt")

In [None]:
# Show the (rows, columns) of the parsed training and test document tables.
dim(docs_train)
dim(docs_test)

In [None]:
# Stack training and test documents (training rows first) so that both go
# through the same preprocessing, then show the combined size.
docs_combine <- rbind(docs_train, docs_test)
dim(docs_combine)

In [None]:
# Preview the first rows of the combined document table.
head(docs_combine)

In [64]:
# Build the unigram TF-IDF features. Warnings are silenced for this call only,
# rather than through options(warn = -1), which would keep hiding warnings
# for the rest of the session.
docs_final <- suppressWarnings(
    pre_processing(docs_combine, ngram_min = 1, ngram_max = 1)
)

In [65]:
# Build the bigram TF-IDF features. Warnings are silenced for this call only,
# rather than through options(warn = -1), which would keep hiding warnings
# for the rest of the session.
docs_final_bigrams <- suppressWarnings(
    pre_processing(docs_combine, ngram_min = 2, ngram_max = 2)
)

In [80]:
# Preview the first rows of the unigram feature table.
head(docs_final)

Unnamed: 0,abc,abl,across,act,action,actual,affect,ago,ahead,allow,...,within,without,won,work,world,year,yearold,yesterday,yet,Doc_id
tr_doc_1,0,0,0,0.0,0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0,0.0,0.0,0,tr_doc_1
tr_doc_2,0,0,0,0.3339993,0,0.1708492,0,0,0,0,...,0,0.0,0,0.0,0,0,0.12296,0.0,0,tr_doc_2
tr_doc_3,0,0,0,0.0,0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0,0.1553178,0.0,0,tr_doc_3
tr_doc_4,0,0,0,0.0,0,0.0,0,0,0,0,...,0,0.133985,0,0.0,0,0,0.0,0.1209514,0,tr_doc_4
tr_doc_5,0,0,0,0.0,0,0.0,0,0,0,0,...,0,0.0,0,0.0,0,0,0.0,0.0,0,tr_doc_5
tr_doc_6,0,0,0,0.04356513,0,0.0,0,0,0,0,...,0,0.0,0,0.0268918,0,0,0.0,0.0,0,tr_doc_6


In [72]:
# Split every label line into its first two space-separated tokens: the
# document id and the class label. Blank lines are skipped so that they do not
# become all-NA rows, which the later join would turn into an extra unlabeled
# document.
label_lines <- docs_train_labels[nzchar(trimws(docs_train_labels))]
label_parts <- strsplit(label_lines, " ", fixed = TRUE)

# A line with a single token yields NA for the class label.
doc_ids <- vapply(label_parts, function(parts) parts[1], character(1))
class_lab <- vapply(label_parts, function(parts) parts[2], character(1))

docs_train_labels <- data.frame("Doc_id" = doc_ids, "Class_Label" = class_lab)

Doc_id,Class_Label
tr_doc_1,C1
tr_doc_2,C1
tr_doc_3,C1
tr_doc_4,C1
tr_doc_5,C1
tr_doc_6,C1
tr_doc_7,C1
tr_doc_8,C1
tr_doc_9,C1
tr_doc_10,C1


In [81]:
library(dplyr)

# Attach the class label to every feature row by document id. A full join
# keeps the rows of both sides; documents without a label get
# Class_Label = NA.
docs_final_labels <- docs_final %>%
    full_join(docs_train_labels, by = "Doc_id")
docs_final_bigrams_labels <- docs_final_bigrams %>%
    full_join(docs_train_labels, by = "Doc_id")

In [95]:
# Show the (rows, columns) of the labelled unigram table.
dim(docs_final_labels)

In [98]:
# Rows that received a class label form the training set; rows whose label is
# NA after the join form the test set.
unigram_has_label <- !is.na(docs_final_labels$Class_Label)
docs_train_unigrams <- docs_final_labels[unigram_has_label, ]
docs_test_unigrams <- docs_final_labels[!unigram_has_label, ]

bigram_has_label <- !is.na(docs_final_bigrams_labels$Class_Label)
docs_train_bigrams <- docs_final_bigrams_labels[bigram_has_label, ]
docs_test_bigrams <- docs_final_bigrams_labels[!bigram_has_label, ]

In [104]:
# Show the (rows, columns) of each train/test split, unigrams then bigrams.
dim(docs_train_unigrams)
dim(docs_test_unigrams)
dim(docs_train_bigrams)
dim(docs_test_bigrams)

In [106]:
# Preview the first rows of the unigram test set (Class_Label is NA there).
head(docs_test_unigrams)


Unnamed: 0,abc,abl,across,act,action,actual,affect,ago,ahead,allow,...,without,won,work,world,year,yearold,yesterday,yet,Doc_id,Class_Label
106446,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,...,0,0,0,0,0.12825911,0,0,0,te_doc_1,
106447,0,0.08952438,0,0,0.0,0,0.0,0.08938291,0,0.0,...,0,0,0,0,0.06860371,0,0,0,te_doc_2,
106448,0,0.0,0,0,0.144536,0,0.0,0.13253328,0,0.0,...,0,0,0,0,0.05086137,0,0,0,te_doc_3,
106449,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,...,0,0,0,0,0.0,0,0,0,te_doc_4,
106450,0,0.0,0,0,0.0,0,0.1302037,0.0,0,0.1206373,...,0,0,0,0,0.0,0,0,0,te_doc_5,
106451,0,0.0,0,0,0.0,0,0.0,0.0,0,0.0,...,0,0,0,0,0.0,0,0,0,te_doc_6,


# Writing to CSV

In [103]:
# Export every split to its own CSV file, without the row-name column.
csv_outputs <- list(
    "docs_train_unigrams.csv" = docs_train_unigrams,
    "docs_test_unigrams.csv" = docs_test_unigrams,
    "docs_train_bigrams.csv" = docs_train_bigrams,
    "docs_test_bigrams.csv" = docs_test_bigrams
)
for (csv_path in names(csv_outputs)) {
    write.csv(csv_outputs[[csv_path]], file = csv_path, row.names = FALSE)
}