In [450]:
library(e1071)
library(dplyr)
library(caret)

In [451]:
# Load the per-document unigram and bigram tables for the training set.
docs_train_unigrams <- read.csv(file = "docs_train_unigrams.csv")
docs_train_bigrams <- read.csv(file = "docs_train_bigrams.csv")

In [452]:
# Combine the unigram and bigram columns of each document into a single row,
# matching rows on document id and class label.
docs_train <- merge(
  docs_train_unigrams,
  docs_train_bigrams,
  by = c("Doc_id", "Class_Label")
)
# The document id is only a join key; drop it so it is not used as a feature.
docs_train <- subset(docs_train, select = -c(Doc_id))
# Fix the RNG state so the per-class sample (and every later random draw in
# the same session) can be reproduced.
set.seed(42)
# Draw 500 documents per class. sample_n() raises an error if a class has
# fewer than 500 rows. ungroup() afterwards so that later steps receive a
# plain table instead of one that is still grouped by Class_Label.
docs_train <- docs_train %>%
  group_by(Class_Label) %>%
  sample_n(size = 500) %>%
  ungroup()
head(docs_train)

Class_Label,abc,abl,across,act,action,actual,affect,ago,ahead,...,world,year,yearold,yesterday,yet,last.week,last.year,new.south,per.cent,south.wale
C1,0,0,0.0,0,0,0,0,0.0,0,...,0.0,0.04338176,0.1301929,0,0,0,0,0.0,3.410177,0.0
C1,0,0,0.0,0,0,0,0,0.0,0,...,0.0,0.04609312,0.0,0,0,0,0,1.759354,0.0,1.76478
C1,0,0,0.7546878,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0
C1,0,0,0.0,0,0,0,0,0.04745019,0,...,0.03445453,0.03641925,0.0,0,0,0,0,0.0,0.0,0.0
C1,0,0,0.0,0,0,0,0,0.0,0,...,0.0,0.0,0.1967359,0,0,0,0,0.0,0.0,0.0
C1,0,0,0.0,0,0,0,0,0.0,0,...,0.0,0.0,0.0,0,0,0,0,0.0,0.0,0.0


In [453]:
# Standardise every feature column (centre and divide by the standard
# deviation); the Class_Label column is left untouched.
feature_cols <- !names(docs_train) %in% c("Class_Label")
scaled_features <- scale(docs_train[, feature_cols])
# A column that is constant in this sample has a standard deviation of 0, so
# scale() yields NaN for it. Map those entries to 0 (the centred value) so
# the rows are not discarded as missing when the model is fitted.
scaled_features[is.nan(scaled_features)] <- 0
docs_train[, feature_cols] <- scaled_features
# NOTE(review): the scaling statistics are computed before the train/test
# split, so held-out rows influence them -- confirm this is intended.

In [454]:
# Hold-out split: 80% of the row indices are drawn without replacement for
# training; the remaining rows form the evaluation set.
sample_ind <- sample(
  x = nrow(docs_train),
  size = 0.8 * nrow(docs_train),
  replace = FALSE
)
sample_train <- docs_train[sample_ind, ]
sample_test <- docs_train[-sample_ind, ]

In [473]:
# Sanity check: show rows x columns of the training split, the held-out
# split, and the full sampled data set they were drawn from.
dim(sample_train)
dim(sample_test)
dim(docs_train)

In [456]:
# Store the training labels as a factor before fitting the classifier.
sample_train$Class_Label <- factor(sample_train$Class_Label)

In [457]:
# Fit a radial-kernel SVM that predicts Class_Label from every other column
# of the training split, with cost parameter 2.
# NOTE(review): the formula uses `.`, so re-running this cell after a
# prediction column has been added to sample_train would include that
# column as a predictor -- confirm cells are always run top to bottom.
svm.fit <- svm(
  Class_Label ~ .,
  data = sample_train,
  kernel = "radial",
  cost = 2
)

In [458]:
# Store the fitted model's predictions for the training rows next to the
# true labels.
sample_train[["predicted_label_radial"]] <- predict(
  svm.fit,
  newdata = sample_train
)

In [459]:
# Store the fitted model's predictions for the held-out rows next to the
# true labels.
sample_test[["predicted_label_radial"]] <- predict(
  svm.fit,
  newdata = sample_test
)

In [460]:
# Train accuracy: share of training rows whose predicted label equals the
# true label.
with(sample_train, mean(Class_Label == predicted_label_radial))

In [461]:
# Test accuracy: share of held-out rows whose predicted label equals the
# true label.
with(sample_test, mean(Class_Label == predicted_label_radial))

In [462]:
# Confusion matrix for the training split
# (rows: true label, columns: predicted label).
conf_train <- table(
  sample_train[["Class_Label"]],
  sample_train[["predicted_label_radial"]]
)

In [463]:
# Confusion matrix for the held-out split
# (rows: true label, columns: predicted label).
# Both label vectors are coerced to the SAME level set before tabulating.
# Otherwise a class that happens to be missing from the held-out labels
# produces a non-square table, and the diagonal no longer pairs each class
# with itself.
conf_test <- local({
  actual <- as.character(sample_test$Class_Label)
  predicted <- sample_test$predicted_label_radial
  # Predicted levels come first so the usual column order is preserved; any
  # label seen only among the true labels is appended.
  class_levels <- union(levels(predicted), actual)
  table(
    factor(actual, levels = class_levels),
    factor(as.character(predicted), levels = class_levels)
  )
})

In [464]:
# Overall accuracy on the held-out split: the diagonal of the confusion
# matrix (correctly classified rows) divided by the number of held-out rows.
N <- nrow(sample_test)
# The variable deliberately keeps the name `diag`; R still resolves the call
# diag(...) to the base function.
diag <- diag(conf_test)
Accuracy <- sum(diag) / N
round(100 * Accuracy, digits = 2) # accuracy

In [465]:
# Per-class precision and recall from the held-out confusion matrix
# (rows: true label, columns: predicted label).
rowsums <- rowSums(conf_test) # number of rows that truly belong to a class
colsums <- colSums(conf_test) # number of rows predicted as a class

# Keep full precision: rounding here would carry a rounding error into the
# macro averages and into any score derived from these vectors.
Precision <- diag / colsums
Recall <- diag / rowsums

# A class that is never predicted (or never present) gives 0 / 0 = NaN, which
# would turn the averages below into NaN. Count such a class as 0.
Precision[is.nan(Precision)] <- 0
Recall[is.nan(Recall)] <- 0

# Macro-averaged precision and recall.
mean(Precision)
mean(Recall)

In [466]:
# Per-class F-score: harmonic mean of precision and recall.
f.score <- (2 * Precision * Recall) / (Precision + Recall)
# When precision and recall are both 0 (or either is undefined) the ratio is
# 0 / 0 = NaN; score such a class as 0 so the macro average stays defined.
f.score[is.na(f.score)] <- 0
# Macro-averaged F-score.
mean(f.score)

In [467]:
# Load the per-document unigram and bigram tables for the test set.
docs_test_unigrams <- read.csv(file = "docs_test_unigrams.csv")
docs_test_bigrams <- read.csv(file = "docs_test_bigrams.csv")

In [468]:
# Combine the unigram and bigram columns of each test document into a single
# row, matching rows on document id and class label.
docs_test <- merge(
  x = docs_test_unigrams,
  y = docs_test_bigrams,
  by = c("Doc_id", "Class_Label")
)
# Keep only the feature columns: the join keys are removed.
# NOTE(review): once Doc_id is dropped, predictions can only be matched back
# to documents by row position -- confirm that is sufficient.
docs_test <- subset(docs_test, select = -c(Doc_id, Class_Label))

In [469]:
# Standardise every test feature column (centre and divide by the standard
# deviation).
docs_test <- scale(docs_test)
# A column that is constant across the test documents has a standard
# deviation of 0, so scale() yields NaN for it. Map those entries to 0 (the
# centred value) so the rows are not treated as missing at prediction time.
docs_test[is.nan(docs_test)] <- 0
docs_test <- data.frame(docs_test)
# NOTE(review): this uses the test set's own means and standard deviations,
# not those of the data the model was trained on -- confirm this is intended.

In [470]:
# Label every test document with the fitted model and keep the result as a
# new column of the feature table.
docs_test[["predicted_label_radial"]] <- predict(
  svm.fit,
  newdata = docs_test
)

In [471]:
# Preview the first rows of the scaled test features together with the
# predicted label column.
head(docs_test)

abc,abl,across,act,action,actual,affect,ago,ahead,allow,...,year,yearold,yesterday,yet,last.week,last.year,new.south,per.cent,south.wale,predicted_label_radial
-0.1348845,-0.2111811,-0.1920004,-0.1657907,-0.1519612,-0.2055182,-0.1683103,-0.2008102,-0.1510989,-0.1849399,...,2.847705,-0.2752462,-0.2392608,-0.1787248,-0.2244964,3.6981929,-0.299216,-0.3092593,-0.2990043,C1
-0.1348845,-0.2111811,-0.1920004,-0.1657907,-0.1519612,-0.2055182,-0.1683103,-0.2008102,-0.1510989,4.2932118,...,-0.4801061,-0.2752462,-0.2392608,-0.1787248,-0.2244964,-0.2949162,-0.299216,-0.3092593,-0.2990043,C13
-0.1348845,-0.2111811,-0.1920004,-0.1657907,-0.1519612,-0.2055182,-0.1683103,-0.2008102,-0.1510989,-0.1849399,...,0.6795857,1.6911909,-0.2392608,-0.1787248,-0.2244964,-0.2949162,-0.299216,-0.3092593,-0.2990043,C1
-0.1348845,-0.2111811,-0.1920004,-0.1657907,-0.1519612,-0.2055182,-0.1683103,-0.2008102,-0.1510989,-0.1849399,...,-0.4801061,-0.2752462,-0.2392608,-0.1787248,-0.2244964,-0.2949162,-0.299216,-0.3092593,-0.2990043,C4
-0.1348845,2.3633394,-0.1920004,-0.1657907,-0.1519612,-0.2055182,-0.1683103,-0.2008102,-0.1510989,-0.1849399,...,0.8171762,-0.2752462,-0.2392608,-0.1787248,-0.2244964,-0.2949162,3.420736,-0.3092593,3.4620819,C2
-0.1348845,-0.2111811,0.7892066,-0.1657907,-0.1519612,-0.2055182,-0.1683103,-0.2008102,-0.1510989,-0.1849399,...,1.24376,-0.2752462,-0.2392608,-0.1787248,-0.2244964,3.6981929,-0.299216,-0.3092593,-0.2990043,C2


In [472]:
# List the distinct class labels the model assigned to the test documents.
unique(docs_test[["predicted_label_radial"]])