In [2]:
library(dplyr)
library(useful)
library(Seurat)
library(patchwork)
library(cluster)
library(ggplot2)
library(randomNames)
library(tidyverse)
library(data.table)
library(lubridate)
library(ggplot2)
library(stringr)
library(monocle3)
library(reshape2)
library(pairwise)
library(scales)

In [33]:
#' Assign a hand-classification (HCCA) category to a single tweet.
#'
#' Rules are evaluated in priority order and the first rule that matches wins:
#'   1. "Junk"          - the tweet contains a junk keyword.
#'   2. "Advertisement" - the tweet contains a hiring hashtag, or the user
#'                        name contains a job-board keyword.
#'   3. "News/Politics" - the tweet contains a political keyword, or the user
#'                        name contains "news".
#'   4. "Other"         - the tweet contains a URL.
#'   5. "Personal"      - none of the above.
#'
#' Keyword matching is case-insensitive (both fields are lower-cased first)
#' and uses literal substring search, so a keyword such as "mac" also matches
#' inside longer words such as "machine".
#'
#' @param row One row of the document metadata; `row$tweet` and `row$user`
#'   must be available.
#' @return A length-one character vector: "Junk", "Advertisement",
#'   "News/Politics", "Other" or "Personal".
hc_classify <- function(row) {
  tweet_text <- tolower(row$tweet)
  user_name <- tolower(row$user)

  # TRUE when `text` contains at least one of `keywords` as a literal
  # (non-regex) substring.
  contains_any <- function(text, keywords) {
    any(vapply(
      keywords,
      function(keyword) grepl(keyword, text, fixed = TRUE),
      logical(1)
    ))
  }

  # 1. Junk
  junk_words <- c(
    "steve", "apple", "iphone", "itunes", "ipad", "mac", "wozniak", "gates",
    "dirty", "blow", "hand", "whack", "nut"
  )
  if (contains_any(tweet_text, junk_words)) {
    return("Junk")
  }

  # 2. Advertisement: hashtags in the tweet or job-board style user names.
  ad_word <- c("#hiring", "#jobs")
  user_ad_word <- c("job", "tmj", "career")
  if (contains_any(tweet_text, ad_word) ||
        contains_any(user_name, user_ad_word)) {
    return("Advertisement")
  }

  # 3. News/Politics. "republic" (previously misspelled "repbulic", which
  # could never match) also covers "republican(s)".
  news_word <- c(
    "obama", "clinton", "trump", "mcconnell", "ryan", "boehner", "potus",
    "cantor", "palin", "teaparty", "democrat", "republic", "mccain", "romney",
    "trade", "taxes", "senate", "president", "gop"
  )
  user_news_word <- c("news")
  if (contains_any(tweet_text, news_word) ||
        contains_any(user_name, user_news_word)) {
    return("News/Politics")
  }

  # 4. Other: any tweet that carries an http(s) link.
  url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
  if (grepl(url_pattern, tweet_text)) {
    return("Other")
  }

  # 5. Personal: fallback when no rule fired.
  "Personal"
}

In [34]:
# Read the hand-classified job tweets that serve as the evaluation set and
# discard the `X` column (presumably the row-index column left behind by an
# earlier write.csv -- verify against the CSV).
doc_metadata <- select(
  read.csv("/home/rep/scRNA-seq_clustering_to_Twitter/P3_Proposed_workflow/PW_files/hand_classified_jobs.csv"),
  -X
)

In [35]:
# Run the HCCA classifier on every row of the metadata.
# seq_len() yields an empty sequence for a zero-row table (1:nrow() would
# iterate over c(1, 0) and fail), and vapply() allocates the result once and
# guarantees exactly one character label per row.
type <- vapply(
  seq_len(nrow(doc_metadata)),
  function(i) hc_classify(doc_metadata[i, ]),
  character(1)
)

# Attach the predicted category as a new column.
doc_metadata <- doc_metadata %>% mutate(HCCA_category = type)

In [36]:
# Translate category names into numeric codes:
#   Junk = 1, Advertisement = 2, News/Politics = 3, Other = 4,
#   anything else = 5. A missing category stays NA.
encode_category <- function(category) {
  code <- ifelse(category == "Other", 4, 5)
  code <- ifelse(category == "News/Politics", 3, code)
  code <- ifelse(category == "Advertisement", 2, code)
  ifelse(category == "Junk", 1, code)
}

# true_label encodes the hand-assigned category (`cat`); HCCA_label encodes
# the category predicted by the rule-based classifier.
doc_metadata <- doc_metadata %>%
  dplyr::mutate(
    true_label = encode_category(cat),
    HCCA_label = encode_category(HCCA_category)
  )

In [37]:
# Display the metadata table, including the category and numeric label columns
# added in the previous cells, for visual inspection.
doc_metadata

user,tweet,cat,HCCA_category,true_label,HCCA_label
<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
Accounting jobs,"#jobs4u #jobs ACCOUNTING CLERK, [Baton Rouge, #LA] http://t.co/YbwCFQK1ma #accounting",Advertisement,Advertisement,2,2
TMJ-SAT Retail Jobs,This tweet has been removed in accordance with Twitter's policy. Twitter requires all its partners to remove tweets from their systems as soon as they are deleted on Twitter itself.,Advertisement,Advertisement,2,2
Dallas jobs,#jobs4u #jobs Construction Project Manager - Retail Division http://t.co/uxm5CfHf7m #DAL #dallas #TX,Advertisement,Advertisement,2,2
Massachusetts Jobs,"Recruiter needed in #Braintree, #MA https://t.co/0kBNf0C5lW #jobs #hiring",Advertisement,Advertisement,2,2
☆Assata☆,@CTheLily Lol oh girl I aint goin down there anytime soon ....... not until one of these jobs have contacted me ..... I'm lookin for cheap,Personal,Personal,5,5
Emerson,This tweet has been removed in accordance with Twitter's policy. Twitter requires all its partners to remove tweets from their systems as soon as they are deleted on Twitter itself.,Advertisement,Personal,2,5
Kansas City Jobs,"#KansasCity #Jobs Java.lang. Integer/3438268---: Kansas City Accountemps, a Robert Half company, is the world'... http://t.co/qy7ztfgRbD",Advertisement,Advertisement,2,2
WA Nursing Jobs,This tweet has been removed in accordance with Twitter's policy. Twitter requires all its partners to remove tweets from their systems as soon as they are deleted on Twitter itself.,Advertisement,Advertisement,2,2
Hospitality US,New #job opening at ABM Building Value in #Norwalk - #Cleaner #jobs http://t.co/AC7PhwQjJn,Advertisement,Advertisement,2,2
Caleb Denney,@kbirge Stumbled upon you because the Colts retweeted you. Read your bio. What do you mean you want to bring tech jobs to Indiana? Curious,Personal,Personal,5,5


In [38]:
# Contingency table of predicted (rows) versus hand-assigned (columns)
# categories.

# Rename `cat` to `true_category`; the new column is appended at the end
# before the old one is dropped.
doc_metadata <- doc_metadata %>%
  mutate(true_category = cat) %>%
  select(-cat)

# Copy the prediction under the name used for the table's row header.
df_compare <- mutate(doc_metadata, HCCA_results = HCCA_category)

# Count tweets for every (true, predicted) pair ...
organize <- dplyr::count(df_compare, true_category, HCCA_results)

# ... and spread the true categories into columns, filling absent pairs with 0.
contingency_table <- spread(
  organize,
  key = true_category,
  value = n,
  fill = 0,
  convert = FALSE
)
contingency_table

HCCA_results,Advertisement,Junk,News/Politics,Other,Personal
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
Advertisement,294,1,6,3,4
Junk,9,75,5,0,3
News/Politics,6,0,42,1,3
Other,64,4,39,31,6
Personal,20,11,39,16,114


In [41]:
# write.csv(doc_metadata, "/home/rep/scRNA-seq_clustering_to_Twitter/P3_Proposed_workflow/PW_files/doc_metadata_stemming_valijobs_hclabeled.csv")