In [9]:
using TextAnalysis
using DataFrames
using Clustering
using Vega


In [11]:
"""
    cleaning(tweets)

Normalise a collection of raw tweet texts for unigram counting and clustering.

Each tweet is lower-cased, stripped of stop words through TextAnalysis'
`remove_stop_words!`, has its digits and `#` characters replaced by spaces, and
finally loses every whitespace-separated token listed in `to_remove`.

Returns an array with one cleaned string per input tweet, in input order. Tokens
are re-joined with single spaces, so the result has no leading, trailing or
repeated whitespace.
"""
function cleaning(tweets)
  # Topic keywords (chosen for the GMO data set) and chat shorthand that carry no
  # signal. Built once instead of once per tweet; a Set keeps each lookup cheap.
  to_remove = Set(["genetically", "modify", "genetic", "modified", "modifies", "gmos", "nongmo",
     "engineers", "engineering", "engineered",  "gms", "gmo", "gm", "engineer",
     "rt", "im", "dont", "cant", "ive", "via"])

  clean_tweets = []
  for tweet in tweets
    sd = StringDocument(lowercase(tweet))
    remove_stop_words!(sd)

    # Digits and the '#' marker become spaces. Only the marker is dropped: the
    # hashtag's word stays and is filtered like any other token.
    text = replace(sd.text, r"[\d#]", " ")

    # split() discards all surrounding and repeated whitespace, so no separate
    # strip / space-collapsing pass is needed before tokenising.
    kept = filter(word -> !(word in to_remove), split(text))
    push!(clean_tweets, join(kept, " "))
  end
  return clean_tweets
end

cleaning (generic function with 1 method)

In [12]:
"""
    unigram_freq(tweet_set)

Count how often each unigram occurs across all texts in `tweet_set`.

The texts are concatenated into a single `StringDocument` and counted with
`ngrams(sd, 1)`. Returns the (unigram, count) entries as an array sorted by
count, most frequent first.
"""
function unigram_freq(tweet_set)
  # join() builds the combined text in one pass instead of re-allocating a
  # growing string per tweet. The surrounding spaces reproduce the padding the
  # loop-based version produced for non-empty input.
  text = " " * join(tweet_set, " ") * " "
  sd = StringDocument(text)
  unigrams = ngrams(sd, 1)
  # Each collected entry is a key/count pair; `last` selects the count.
  return sort(collect(unigrams), by = last, rev = true)
end




unigram_freq (generic function with 1 method)

In [13]:
"""
    cluster(tweets; k = 5)

Group `tweets` (an iterable of cleaned tweet strings) into `k` clusters with
k-means on their TF-IDF representation.

Prints the size of the matrix handed to `kmeans` and the number of tweets in
each cluster, then returns the cluster assignment of every tweet.
`k` defaults to 5, the value previously hard-coded.
"""
function cluster(tweets; k = 5)
  # One StringDocument per tweet. The element type is deliberately left as Any so
  # that Corpus receives the same argument type as before.
  # NOTE(review): confirm which vector types Corpus accepts before narrowing this.
  tweet_doc_list = Any[StringDocument(tweet) for tweet in tweets]
  crps = Corpus(tweet_doc_list)

  # The lexicon has to be populated before the document-term matrix is built.
  update_lexicon!(crps)

  # Dense document-term matrix, re-weighted with TF-IDF.
  m = DocumentTermMatrix(crps)
  D = dtm(m, :dense)
  T = tf_idf(D)

  # NOTE(review): assumes dtm rows are documents and kmeans clusters the columns
  # of its input, hence the transpose - confirm against the library docs.
  T_transpose = transpose(T)
  println(size(T_transpose))

  results = kmeans(T_transpose, k)
  println(results.counts) # size of each cluster
  return results.assignments
end

cluster (generic function with 1 method)

In [18]:
  df = readtable("adderall_march6.csv")
  # Drop rows that lack the tweet text or the author's screen name.
  deleterows!(df, find(isna(df[:text])))
  deleterows!(df, find(isna(df[:user_screen_name])))
  clean_tweets = cleaning(df[:text])
  df[:clean_text] = clean_tweets

  # Frequencies are taken over the distinct cleaned texts, so a text that appears
  # many times (e.g. the same retweet) is counted once.
  unigrams = unigram_freq(Set(clean_tweets))

  # Unigrams that occur fewer than 10 times across those texts. Kept in a Set so
  # the per-word membership test below does not scan a long array every time.
  rare = Set([unigram[1] for unigram in unigrams if unigram[2] < 10])

  # Removes these rare unigrams, since they can be considered noise, and records
  # how many words each tweet has left.
  clean_tweets2 = String[]
  num_words = Int[]
  for tweet in clean_tweets
    kept = filter(word -> !(word in rare), split(tweet))
    push!(clean_tweets2, join(kept, " "))
    push!(num_words, length(kept))
  end
  df[:clean_text2] = clean_tweets2
  df[:num_words] = num_words

  # Only retain tweets that have more than 3 words (i.e. at least 4) after cleaning.
  df = df[df[:num_words] .> 3, :]

  

Unnamed: 0,text,created_at,id,user_screen_name,user_location,user_time_zone,coordinates,retweeted,in_reply_to_status_id,in_reply_to_user_id,clean_text,clean_text2,num_words
1,RT @samkriss so well just forget the time in 2009 when chelsea clinton killed two people and a dog while driving on adderall near branfo,4/26/17 18:55,8.57e17,xdolphin_life,,Central Time (US & Canada),,false,,,@samkriss forget time chelsea clinton killed people dog driving adderall near branfo,forget time people dog driving adderall,6
2,RT @jason769065061 ASSORTED PAIN and ANXIETY MEDS XANAX ADDERALL OXY NEMBUTAL VALIUM ROXY HYDRODIAZEPAM Hit me up @ 6142856223 ht,4/26/17 18:53,8.57e17,NYc37EAbRfbRE3q,,,,false,,,@jason assorted pain anxiety meds xanax adderall oxy nembutal valium roxy hydrodiazepam hit @ ht,anxiety meds xanax adderall hit @,6
3,Adderall I Aint Gon Lie I Pop Shit,4/26/17 18:52,8.57e17,87buiie,ZombieLife,,,false,,,adderall aint gon lie pop shit,adderall aint pop shit,4
4,RT @samkriss so well just forget the time in 2009 when chelsea clinton killed two people and a dog while driving on adderall near branfo,4/26/17 18:51,8.57e17,cherieinflorida,,Eastern Time (US & Canada),,false,,,@samkriss forget time chelsea clinton killed people dog driving adderall near branfo,forget time people dog driving adderall,6
5,RT @sicknanders95 if i gotta have a public adderall induced panic attack ima make it look sexy,4/26/17 18:51,8.57e17,ColleenHeberle,,,,false,,,@sicknanders gotta public adderall induced panic attack ima look sexy,gotta adderall induced attack ima look,6
6,Dead ass teacher just asked any methods on how to study for this test Some guy in the back adderall,4/26/17 18:47,8.57e17,dirtyb_,,,,false,,,dead ass teacher methods study test guy adderall,dead ass study test guy adderall,6
7,yall still trynna buy adderall off of me grow up snort a line,4/26/17 18:47,8.57e17,sakekat,hapa/native,Pacific Time (US & Canada),,false,,,yall trynna buy adderall grow snort line,yall buy adderall snort line,5
8,white girls white wine and adderall,4/26/17 18:44,8.57e17,pax_willingham,"Stephenville, TX",,,false,,,white girls white wine adderall,white girls white wine adderall,5
9,The cop Im talking to is in undercover drug enforcement and I almost just offered him some free adderall to stay awake during his shift,4/26/17 18:41,8.57e17,_toughbaby,PCB / PHILA,Pacific Time (US & Canada),,false,,,cop talking undercover drug enforcement offered free adderall stay awake shift,talking drug offered free adderall stay awake,7
10,RT @samkriss so well just forget the time in 2009 when chelsea clinton killed two people and a dog while driving on adderall near branfo,4/26/17 18:36,8.57e17,budswdrbobbillw,"Tampa, FL",,,false,,,@samkriss forget time chelsea clinton killed people dog driving adderall near branfo,forget time people dog driving adderall,6


In [29]:
# Copy the fully cleaned tweets out of the data frame into a plain untyped array.
# The trailing semicolon keeps the notebook cell silent, as the loop form was.
crps = Any[tweet for tweet in df[:clean_text2]];

In [33]:
# Build a word cloud from the cleaned tweets collected in `crps`.
# NOTE(review): minThreshold = 25 presumably hides words that occur fewer than
# 25 times - confirm against the Vega.jl wordcloud documentation.
wc = wordcloud(x = crps, minThreshold = 25)
# Recolour the plot with the ("Spectral", 11) palette. This is the cell's last
# expression, so its value is what the notebook shows.
colorscheme!(wc, palette = ("Spectral", 11))


In [35]:
df = readtable("gmo_march6.csv")
# Discard rows with a missing value in either required column, one column at a
# time and in this order.
for column in (:text, :user_screen_name)
  deleterows!(df, find(isna(df[column])))
end
# Clean the remaining tweet texts and keep them next to the raw text.
clean_tweets = cleaning(df[:text])
df[:clean_text] = clean_tweets


34330-element Array{Any,1}:
 "@trendplayer china appears resisting planting crops commercial scale"                              
 "@piersmorgan shes fake booty"                                                                      
 "pei salmon breeders aquabounty indulge impulsive operation gt peipoli pei cdnpoli mcga"            
 "monsanto isnt feeding worldits killing children fd"                                                
 "china appears resisting planting crops commercial scale"                                           
 "monsanto continues lose ground worldwide except usa"                                               
 "@latifahab plants"                                                                                 
 "@allycalli cmst m topic chosen advocacy speech pro"                                                
 "@fatimahbuhaimed"                                                                                  
 "@nongmoproject worlds fish factory canada"          

In [None]:

  # Frequencies are taken over the distinct cleaned texts, so a text that appears
  # many times (e.g. the same retweet) is counted once.
  unigrams = unigram_freq(Set(clean_tweets))

  # Unigrams that occur fewer than 10 times across those texts. Kept in a Set so
  # the per-word membership test below does not scan a long array every time.
  rare = Set([unigram[1] for unigram in unigrams if unigram[2] < 10])

  # Removes these rare unigrams, since they can be considered noise, and records
  # how many words each tweet has left.
  clean_tweets2 = String[]
  num_words = Int[]
  for tweet in clean_tweets
    kept = filter(word -> !(word in rare), split(tweet))
    push!(clean_tweets2, join(kept, " "))
    push!(num_words, length(kept))
  end
  df[:clean_text2] = clean_tweets2
  df[:num_words] = num_words

  # Only retain tweets that have more than 3 words (i.e. at least 4) after cleaning.
  df = df[df[:num_words] .> 3, :]

In [None]:
# Copy the fully cleaned tweets out of the data frame into a plain untyped array.
# The trailing semicolon keeps the notebook cell silent, as the loop form was.
crps2 = Any[tweet for tweet in df[:clean_text2]];

In [None]:
# Word cloud for the GMO tweets. `crps2` holds the cleaned text of this data set;
# `crps` still holds the adderall tweets from the earlier cell, so plotting it
# here would just repeat the previous figure.
wc = wordcloud(x = crps2, minThreshold = 25)
# Recolour the plot with the ("Spectral", 11) palette. This is the cell's last
# expression, so its value is what the notebook shows.
colorscheme!(wc, palette = ("Spectral", 11))
