In [2]:
library(dplyr)
library(useful)
library(Seurat)
library(patchwork)
library(cluster)
library(ggplot2)
library(randomNames)
library(tidyverse)
library(data.table)
library(lubridate)
library(ggplot2)
library(stringr)
library(monocle3)
library(reshape2)
library(pairwise)
library(scales)

In [4]:
# Define the hand-classification function 
# Input: (1) row: one row of the document metadata
# Output: (1) label of the corresponding tweet from the hand-classification criteria

hc_classify = function(row){
  
  tweetl = tolower(row$text)
  user_name = tolower(row$sn)
  
  #1.
  junk_words = c("steve", "apple", "iphone", "itunes", "ipad", "mac", "wozniak", "gates", "dirty", "blow", "hand",
                 "whack", "nut")
  for(i in 1:length(junk_words)){
    if(grepl(junk_words[i], tweetl, fixed=TRUE)){
      return("Junk")
    }
  }
  
  #2. 
  ad_word = c("#hiring", "#jobs")
  user_ad_word = c("job", "tmj", "career")
  for(i in 1:length(ad_word)){
    if(grepl(ad_word[i], tweetl, fixed=TRUE)){
      return("Advertisement")
    }
  }
  
  for(i in 1:length(user_ad_word)){
    if(grepl(user_ad_word[i], user_name, fixed=TRUE)){
      return("Advertisement")
    }
  }
  
  #3. 
  news_word = c("obama", "clinton", "trump", "mcconnell", "ryan", "boehner", "potus", "cantor",
                "palin", "teaparty", "democrat", "repbulic", "mccain", "romney", "trade", "taxes", "senate", "president", "gop")
  user_news_word = c("news")
  for(i in 1:length(news_word)){
    if(grepl(news_word[i], tweetl, fixed=TRUE)){
      return("News/Politics")
    }
  }
  
  for(i in 1:length(user_news_word)){
    if(grepl(user_news_word[i], user_name, fixed=TRUE)){
      return("News/Politics")
    }
  }
  
  #4.
  url_pattern <- "http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"
  if(grepl(url_pattern, tweetl)){
     return("Other")
  }
  
  #5.
  return("Personal")
}

In [5]:
# load Doc_metadata
doc_metadata = read.csv("/home/rep/scRNA-seq_clustering_to_Twitter/P1_preprocessing/doc_metadata_stemming_jobs.csv") %>% select(-X)
dim(doc_metadata)
head(doc_metadata)

Unnamed: 0_level_0,time,text,sn,date,tweets_processed
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,2009-08-01 10:25:36,Now Hiring: Storage Architect II http://bit.ly/4iACm Jobs,ChicagoJobAds,2009-08-01,hire architect ii
2,2009-08-01 22:57:06,"""The Steve Jobs method"" discussion on Hacker News: http://bit.ly/1b55N8 (via @ericries)",hnshah,2009-08-01,steve discuss news via
3,2009-08-01 23:27:08,"AZ Jobs | Taco Bell Restaurant General Manager at Taco Bell (Peoria, AZ) http://bit.ly/18jiLD #Job #Hiring #AZJobs",ZuluJobsAZ,2009-08-01,az taco bell restaur gener manag taco bell peoria az hire azjob
4,2009-08-01 09:55:12,"TN Jobs | SLP Travel Job in Knoxville Area, TN School System at Soliant Health (Kn.. http://u.mavrev.com/wuin #Job #Hiring #TNJobs",ZuluJobsTN,2009-08-01,tn slp travel knoxvil area tn school system soliant health hire tnjob
5,2009-08-01 05:58:39,NJ Jobs | New Jersey Travel or Perm job- OT at Sunbelt Staffing (NJ) http://bit.ly/yZzHR #Job #Hiring #NJJobs,ZuluJobsNJ,2009-08-01,nj new jersey travel perm ot sunbelt staf nj hire njjob
6,2009-08-01 13:27:47,NJ Jobs | New Jersey Travel or Perm job- OT at Sunbelt Staffing (NJ) http://bit.ly/3Mu1Wz #Job #Hiring #NJJobs,ZuluJobsNJ,2009-08-01,nj new jersey travel perm ot sunbelt staf nj hire njjob


In [8]:
# run HCCA function
type = vector()
for(i in 1:nrow(doc_metadata)){
  row = doc_metadata[i,]
  type[i] = hc_classify(row)
}

doc_metadata = doc_metadata %>% mutate(category = type)

In [9]:
# generate numerical labels 
doc_metadata = doc_metadata %>% dplyr::mutate(label = ifelse(category == "Junk", 1, 
                                                             ifelse(category == "Advertisement", 2,
                                                                    ifelse(category == "News/Politics", 3, 
                                                                           ifelse(category == "Other", 4, 5)))))
doc_metadata %>% dplyr::count(category)
doc_metadata %>% dplyr::count(label)

category,n
<chr>,<int>
Advertisement,12072
Junk,2127
News/Politics,1024
Other,7414
Personal,5107


label,n
<dbl>,<int>
1,2127
2,12072
3,1024
4,7414
5,5107


In [10]:
str(doc_metadata)

'data.frame':	27744 obs. of  7 variables:
 $ time            : chr  "2009-08-01 10:25:36" "2009-08-01 22:57:06" "2009-08-01 23:27:08" "2009-08-01 09:55:12" ...
 $ text            : chr  "Now Hiring:  Storage Architect II http://bit.ly/4iACm  Jobs" "\"The Steve Jobs method\" discussion on Hacker News: http://bit.ly/1b55N8 (via @ericries)" "AZ Jobs | Taco Bell Restaurant General Manager at Taco Bell (Peoria, AZ) http://bit.ly/18jiLD #Job #Hiring #AZJobs" "TN Jobs | SLP Travel Job in Knoxville Area, TN School System at Soliant Health (Kn.. http://u.mavrev.com/wuin #"| __truncated__ ...
 $ sn              : chr  "ChicagoJobAds" "hnshah" "ZuluJobsAZ" "ZuluJobsTN" ...
 $ date            : chr  "2009-08-01" "2009-08-01" "2009-08-01" "2009-08-01" ...
 $ tweets_processed: chr  "hire architect ii" "steve discuss news via" "az taco bell restaur gener manag taco bell peoria az hire azjob" "tn slp travel knoxvil area tn school system soliant health hire tnjob" ...
 $ category        : chr  "Adverti

In [12]:
# write.csv(doc_metadata, "/home/rep/scRNA-seq_clustering_to_Twitter/P3_Proposed_workflow/PW_files/doc_metadata_stemming_jobs_hclabeled.csv")