In [1]:
# Import libraries
library(tm) # for NLP
library(plyr) #  for pre-processing 
library(tidyverse) # for pre-processing and visualisation
library(reshape2) # for melt function
library(glmnet) # for Logistic Regression classifier
#library(tuple)

Loading required package: NLP

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.2.1     [32m✔[39m [34mpurrr  [39m 0.3.3
[32m✔[39m [34mtibble [39m 2.1.3     [32m✔[39m [34mdplyr  [39m 0.8.4
[32m✔[39m [34mtidyr  [39m 1.0.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.3.1     [32m✔[39m [34mforcats[39m 0.4.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mggplot2[39m::[32mannotate()[39m masks [34mNLP[39m::annotate()
[31m✖[39m [34mdplyr[39m::[32marrange()[39m    masks [34mplyr[39m::arrange()
[31m✖[39m [34mpurrr[39m::[32mcompact()[39m    masks [34mplyr[39m::compact()
[31m✖[39m [34mdplyr[39m::[32mcount()[39m      masks [34mplyr[39m::count()
[31m✖[39m [34mdplyr[39m::[32mfailwith()[39m   masks [34mplyr[39m::failwith()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m     masks [34ms

In [2]:
# Tokenizer for DocumentTermMatrix(control = list(tokenize = ...)):
# splits a document into words and returns every pair of adjacent words
# joined by a single space, as an unnamed character vector.
BigramTokenizer <- function(x) {
  tokens <- words(x)
  word_pairs <- ngrams(tokens, 2)
  joined_pairs <- lapply(word_pairs, paste, collapse = " ")
  unlist(joined_pairs, use.names = FALSE)
}

In [3]:
# Grid search over glmnet's alpha and lambda for a binomial classifier.
#
# For every (alpha, lambda) combination a model is fitted on the training
# data and scored by classification accuracy on the validation data.
#
# Arguments:
#   alpha_gr   vector of alpha values to try (outer loop).
#   lambda_gr  vector of lambda values to try (inner loop).
#   train_x    training predictor matrix passed to glmnet().
#   train_y    training response passed to glmnet().
#   val_x      validation predictor matrix passed to predict().
#   val_y      validation labels compared against the predicted classes.
#
# Returns a data frame with columns "alpha", "lambda" and "accuracy", one row
# per combination, sorted by decreasing accuracy. Ties keep grid order
# (alpha outer, lambda inner), so row 1 is the first best combination.
hyper_pamaters_lg_bi <- function(alpha_gr, lambda_gr, train_x, train_y, val_x, val_y) {
  n_alpha <- length(alpha_gr)
  n_lambda <- length(lambda_gr)

  # Preallocate the whole grid instead of growing a data frame with rbind()
  # from a dummy seed row that then has to be stripped again.
  results <- data.frame(
    alpha = rep(alpha_gr, each = n_lambda),
    lambda = rep(lambda_gr, times = n_alpha),
    accuracy = rep(NA_real_, n_alpha * n_lambda)
  )

  for (k in seq_len(nrow(results))) {
    fit <- glmnet(train_x, train_y, family = "binomial",
                  alpha = results$alpha[k], lambda = results$lambda[k])
    predicted <- predict(fit, val_x, type = "class")
    results$accuracy[k] <- sum(val_y == predicted) / length(predicted)
  }

  # order() is stable, so equally accurate combinations stay in grid order.
  results[order(-results$accuracy), ]
}

In [4]:
# Replace each typographic character listed in the pattern (ellipsis, star,
# en dash, single angle quote, curly quotes) with a single space.
# Vectorised over x, as gsub() is.
clean_text <- function(x) {
  special_chars <- "…|⋆|–|‹|”|“|‘|’"
  gsub(pattern = special_chars, replacement = " ", x = x)
}

# Apply the text-cleaning pipeline to a tm corpus and return the cleaned corpus.
#
# Steps, in order: lower-casing, number removal, punctuation removal,
# replacement of special typographic characters (clean_text), English
# stopword removal, removal of the newspaper names "eagle rising" and
# "freedom daily", stemming, and whitespace collapsing.
#
# Arguments:
#   corpus  a tm corpus.
preprocess_corpus <- function(corpus) {
  # Convert the text to lower case
  corpus <- tm_map(corpus, content_transformer(tolower))
  # Remove numbers
  corpus <- tm_map(corpus, removeNumbers)
  # Remove punctuations
  corpus <- tm_map(corpus, removePunctuation)
  # Remove special characters from text. clean_text is a plain character
  # function, so it is wrapped in content_transformer() like tolower above:
  # unwrapped, tm_map would hand it whole document objects rather than their
  # text when the corpus is a VCorpus.
  corpus <- tm_map(corpus, content_transformer(clean_text))
  # Remove english common stopwords
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  # Remove name of newspapers from the corpus
  corpus <- tm_map(corpus, removeWords, c("eagle rising", "freedom daily"))
  # 'stem' words to root words
  corpus <- tm_map(corpus, stemDocument)
  # Eliminate extra white spaces
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}

In [6]:
# Build "<Type>_<prefix>" identifiers, where prefix is the part of each id
# that precedes the first "-".
.lr_make_ids <- function(ids, Type) {
  prefixes <- vapply(strsplit(ids, "-"), function(parts) parts[1], character(1))
  paste(Type, prefixes, sep = "_")
}

# Write x to dir/file_name as a tab-separated file without header, row names
# or quoting.
.lr_write_tsv <- function(x, dir, file_name) {
  write.table(x, file.path(dir, file_name), sep = "\t",
              row.names = FALSE, col.names = FALSE, quote = FALSE)
}

# Tune, fit and score one logistic-regression model on a document-term matrix.
#
# features is a dense matrix with one row per article; labels holds the
# matching class labels; idx is a list of row indices named train, val, test
# and train_val. Hyper-parameters are chosen on train vs. val. Validation
# predictions come from a model fitted on train; test predictions come from a
# model refitted on train_val with the same hyper-parameters.
.lr_fit_and_score <- function(features, labels, idx, alpha_grid, lambda_grid) {
  x_train <- features[idx$train, , drop = FALSE]
  y_train <- as.matrix(labels[idx$train])
  x_val <- features[idx$val, , drop = FALSE]
  y_val <- as.matrix(labels[idx$val])
  x_test <- features[idx$test, , drop = FALSE]
  y_test <- as.matrix(labels[idx$test])
  x_train_val <- features[idx$train_val, , drop = FALSE]
  y_train_val <- as.matrix(labels[idx$train_val])

  # The search result is sorted by decreasing accuracy, so row 1 is the best.
  grid <- hyper_pamaters_lg_bi(alpha_grid, lambda_grid, x_train, y_train, x_val, y_val)
  best_alpha <- grid[["alpha"]][1]
  best_lambda <- grid[["lambda"]][1]

  fit_train <- glmnet(x_train, y_train, family = "binomial",
                      alpha = best_alpha, lambda = best_lambda)
  # The written score is 1 - response, which lines up with the truth files
  # coding 'Fake' as 1.
  # NOTE(review): assumes glmnet's "response" is the probability of the second
  # factor level ('Real' when labels are 'Fake'/'Real') - confirm.
  val_prob <- 1 - predict(fit_train, x_val, type = "response")
  val_class <- predict(fit_train, x_val, type = "class")
  val_acc <- sum(y_val == val_class) / length(val_class)

  fit_train_val <- glmnet(x_train_val, y_train_val, family = "binomial",
                          alpha = best_alpha, lambda = best_lambda)
  test_prob <- 1 - predict(fit_train_val, x_test, type = "response")
  test_class <- predict(fit_train_val, x_test, type = "class")
  test_acc <- sum(y_test == test_class) / length(test_class)

  list(alpha = best_alpha, lambda = best_lambda,
       val_prob = val_prob, test_prob = test_prob,
       val_acc = val_acc, test_acc = test_acc)
}

# Create one seeded train/validation/test split of a news data set, write the
# split's label files, and write bigram and unigram logistic-regression
# predictions for it.
#
# Arguments:
#   s          seed passed to set.seed(); also names the output folder
#              "<s>_fold".
#   real_data  path of the CSV with the real articles (read with read_csv).
#   fake_data  path of the CSV with the fake articles (read with read_csv).
#   Type       data-set name; used as sub-directory, file-name prefix and
#              identifier prefix.
#   main_dir   root output directory. Defaults to the location that was
#              previously hard-coded, so existing calls behave as before.
#
# The class of an article is the part of its id before the first "_"; the
# value 'Fake' is coded as 1 in the truth files, everything else as 0.
#
# Files written under <main_dir>/<Type>/<s>_fold/:
#   Learn/   <Type>Obs.txt (train labels), <Type>Target.txt (validation ids),
#            <Type>Truth.txt (validation labels), <Type>LRUniPred.txt,
#            <Type>LRBiPred.txt (validation scores)
#   Eval/    <Type>Obs.txt (train + validation labels), <Type>Target.txt
#            (test ids), <Type>Truth.txt (test labels), <Type>LRUniPred.txt,
#            <Type>LRBiPred.txt (test scores)
#   acc_ind/ train_ind.txt, val_ind.txt, test_ind.txt, accuracy_hyp.txt
#
# Returns (invisibly) the one-row data frame that is written to
# accuracy_hyp.txt.
return_files_lr <- function(s, real_data, fake_data, Type,
                            main_dir = '/Users/RAJ/Desktop/linqs/kaggle_fakenews/fakenewsnet') {
  real_df <- read_csv(real_data)
  fake_df <- read_csv(fake_data)
  articles <- rbind(real_df, fake_df)
  articles$type <- vapply(strsplit(articles$id, "_"),
                          function(parts) parts[1], character(1))

  # ---- 60/20/20 split -----------------------------------------------------
  # The order of the sample() calls below must not change: it determines
  # which rows a given seed assigns to each subset.
  set.seed(s)
  n_docs <- nrow(articles)
  n_train <- ceiling(0.60 * n_docs)
  n_val <- floor(0.20 * n_docs)
  n_test <- floor(0.20 * n_docs)

  train_sorted <- sort(sample(seq_len(n_docs), size = n_train))
  not_train <- setdiff(seq_len(n_docs), train_sorted)
  val_sorted <- sort(sample(not_train, size = n_val))
  test_sorted <- setdiff(not_train, val_sorted)
  train_val_sorted <- sort(append(train_sorted, val_sorted))

  # Shuffle each subset; the test subset is additionally capped at n_test rows.
  indicesTraining <- sample(train_sorted, size = n_train)
  indicesTest <- sample(test_sorted, size = n_test)
  indicesValidation <- sample(val_sorted, size = n_val)
  indices_train_val <- sample(train_val_sorted)

  # ---- identifiers and 0/1 truth values ------------------------------------
  ids_train <- .lr_make_ids(articles$id[indicesTraining], Type)
  ids_val <- .lr_make_ids(articles$id[indicesValidation], Type)
  ids_test <- .lr_make_ids(articles$id[indicesTest], Type)

  obs_train <- cbind(ids_train, (articles$type[indicesTraining] == 'Fake') * 1)
  truth_val <- cbind(ids_val, (articles$type[indicesValidation] == 'Fake') * 1)
  truth_test <- cbind(ids_test, (articles$type[indicesTest] == 'Fake') * 1)
  obs_train_val <- rbind(obs_train, truth_val)

  # ---- output directories --------------------------------------------------
  fold_dir <- file.path(main_dir, Type, paste(s, 'fold', sep = '_'))
  Eval_dir <- file.path(fold_dir, 'Eval')
  Learn_dir <- file.path(fold_dir, 'Learn')
  ind_dir <- file.path(fold_dir, 'acc_ind')
  for (out_dir in c(Eval_dir, Learn_dir, ind_dir)) {
    # recursive: also creates missing parents; silent when the directory
    # already exists (e.g. when a seed is rerun).
    dir.create(out_dir, recursive = TRUE, showWarnings = FALSE)
  }

  # ---- split files (written before any model is fitted) --------------------
  .lr_write_tsv(obs_train_val, Eval_dir, paste(Type, 'Obs.txt', sep = ''))
  .lr_write_tsv(ids_test, Eval_dir, paste(Type, 'Target.txt', sep = ''))
  .lr_write_tsv(truth_test, Eval_dir, paste(Type, 'Truth.txt', sep = ''))

  .lr_write_tsv(obs_train, Learn_dir, paste(Type, 'Obs.txt', sep = ''))
  .lr_write_tsv(ids_val, Learn_dir, paste(Type, 'Target.txt', sep = ''))
  .lr_write_tsv(truth_val, Learn_dir, paste(Type, 'Truth.txt', sep = ''))

  .lr_write_tsv(indicesTraining, ind_dir, 'train_ind.txt')
  .lr_write_tsv(indicesValidation, ind_dir, 'val_ind.txt')
  .lr_write_tsv(indicesTest, ind_dir, 'test_ind.txt')

  # ---- models --------------------------------------------------------------
  idx <- list(train = indicesTraining, val = indicesValidation,
              test = indicesTest, train_val = indices_train_val)
  alpha_grid <- seq(0, 1, 0.05)
  lambda_grid <- seq(0, 1, 0.05)

  # Bigram features from the raw text; removeSparseTerms(, 0.97) is applied
  # BEFORE densifying so the full bigram matrix is never held as a dense
  # matrix.
  bigram_corpus <- VCorpus(VectorSource(articles$text))
  bigram_dtm <- DocumentTermMatrix(bigram_corpus,
                                   control = list(tokenize = BigramTokenizer))
  bigram_features <- as.matrix(removeSparseTerms(bigram_dtm, .97))
  bi <- .lr_fit_and_score(bigram_features, articles$type, idx,
                          alpha_grid, lambda_grid)

  # Unigram features from the preprocessed text.
  unigram_corpus <- Corpus(VectorSource(articles$text))
  unigram_dtm <- DocumentTermMatrix(preprocess_corpus(unigram_corpus))
  unigram_features <- as.matrix(removeSparseTerms(unigram_dtm, 0.97))
  uni <- .lr_fit_and_score(unigram_features, articles$type, idx,
                           alpha_grid, lambda_grid)

  # ---- prediction and accuracy files ---------------------------------------
  .lr_write_tsv(cbind(ids_val, uni$val_prob), Learn_dir, paste(Type, 'LRUniPred.txt', sep = ''))
  .lr_write_tsv(cbind(ids_val, bi$val_prob), Learn_dir, paste(Type, 'LRBiPred.txt', sep = ''))
  .lr_write_tsv(cbind(ids_test, bi$test_prob), Eval_dir, paste(Type, 'LRBiPred.txt', sep = ''))
  .lr_write_tsv(cbind(ids_test, uni$test_prob), Eval_dir, paste(Type, 'LRUniPred.txt', sep = ''))

  accuracy_list_bi <- data.frame("alpha_bi" = bi$alpha, "lambda_bi" = bi$lambda,
                                 "accuracy_val" = bi$val_acc,
                                 "accuracy_test" = bi$test_acc)
  accuracy_list_uni <- data.frame("alpha_uni" = uni$alpha, "lambda_uni" = uni$lambda,
                                  "accuracy_val_uni" = uni$val_acc,
                                  "accuracy_test_uni" = uni$test_acc)
  accuracy_list <- cbind(accuracy_list_bi, accuracy_list_uni)

  # Unlike the other outputs, this file keeps its header row.
  write.table(accuracy_list, file.path(ind_dir, 'accuracy_hyp.txt'),
              sep = "\t", row.names = FALSE)

  invisible(accuracy_list)
}

In [7]:
 #write.table(df_obs,paste(.,paste('ble','Obs.txt',sep=''),sep='/'),sep="\t",row.names=FALSE,col.names = FALSE,quote=FALSE)

ERROR: Error in is.data.frame(x): object 'df_obs' not found


In [7]:
# Generate the BuzzFeed split, label and prediction files once per seed,
# for seeds 31 through 60.
for (i in seq(from = 31, to = 60)) {
  return_files_lr(
    s = i,
    real_data = 'BuzzFeed_real_news_content.csv',
    fake_data = 'BuzzFeed_fake_news_content.csv',
    Type = 'BuzzFeed'
  )
}

Parsed with column specification:
cols(
  id = [31mcol_character()[39m,
  title = [31mcol_character()[39m,
  text = [31mcol_character()[39m,
  url = [31mcol_character()[39m,
  top_img = [31mcol_character()[39m,
  authors = [31mcol_character()[39m,
  source = [31mcol_character()[39m,
  publish_date = [31mcol_character()[39m,
  movies = [31mcol_character()[39m,
  images = [31mcol_character()[39m,
  canonical_link = [31mcol_character()[39m,
  meta_data = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  id = [31mcol_character()[39m,
  title = [31mcol_character()[39m,
  text = [31mcol_character()[39m,
  url = [31mcol_character()[39m,
  top_img = [31mcol_character()[39m,
  authors = [31mcol_character()[39m,
  source = [31mcol_character()[39m,
  publish_date = [31mcol_character()[39m,
  movies = [31mcol_character()[39m,
  images = [31mcol_character()[39m,
  canonical_link = [31mcol_character()[39m,
  meta_data = [31mcol_charac

In [8]:
# Generate the PolitiFact split, label and prediction files once per seed,
# for seeds 31 through 60.
for (i in seq(from = 31, to = 60)) {
  return_files_lr(
    s = i,
    real_data = 'PolitiFact_real_news_content.csv',
    fake_data = 'PolitiFact_fake_news_content.csv',
    Type = 'PolitiFact'
  )
}

Parsed with column specification:
cols(
  id = [31mcol_character()[39m,
  title = [31mcol_character()[39m,
  text = [31mcol_character()[39m,
  url = [31mcol_character()[39m,
  top_img = [31mcol_character()[39m,
  authors = [31mcol_character()[39m,
  source = [31mcol_character()[39m,
  publish_date = [31mcol_character()[39m,
  movies = [31mcol_character()[39m,
  images = [31mcol_character()[39m,
  canonical_link = [31mcol_character()[39m,
  meta_data = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  id = [31mcol_character()[39m,
  title = [31mcol_character()[39m,
  text = [31mcol_character()[39m,
  url = [31mcol_character()[39m,
  top_img = [31mcol_character()[39m,
  authors = [31mcol_character()[39m,
  source = [31mcol_character()[39m,
  publish_date = [31mcol_character()[39m,
  movies = [31mcol_character()[39m,
  images = [31mcol_character()[39m,
  canonical_link = [31mcol_character()[39m,
  meta_data = [31mcol_charac

In [13]:
# Load the PolitiFact real and fake article tables for ad-hoc inspection.
real_df_test <- read_csv(file = 'PolitiFact_real_news_content.csv')
fake_df_test <- read_csv(file = 'PolitiFact_fake_news_content.csv')

Parsed with column specification:
cols(
  id = [31mcol_character()[39m,
  title = [31mcol_character()[39m,
  text = [31mcol_character()[39m,
  url = [31mcol_character()[39m,
  top_img = [31mcol_character()[39m,
  authors = [31mcol_character()[39m,
  source = [31mcol_character()[39m,
  publish_date = [31mcol_character()[39m,
  movies = [31mcol_character()[39m,
  images = [31mcol_character()[39m,
  canonical_link = [31mcol_character()[39m,
  meta_data = [31mcol_character()[39m
)

Parsed with column specification:
cols(
  id = [31mcol_character()[39m,
  title = [31mcol_character()[39m,
  text = [31mcol_character()[39m,
  url = [31mcol_character()[39m,
  top_img = [31mcol_character()[39m,
  authors = [31mcol_character()[39m,
  source = [31mcol_character()[39m,
  publish_date = [31mcol_character()[39m,
  movies = [31mcol_character()[39m,
  images = [31mcol_character()[39m,
  canonical_link = [31mcol_character()[39m,
  meta_data = [31mcol_charac

In [14]:
# Number of ids (i.e. articles) in the real-news table.
length(real_df_test[["id"]])