# TwitterMining

Goal: Predict the animal classification of tweets (cat or dog)

## Boilerplate code

In [1]:
#import modules
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction import text as sk_fe_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, recall_score
from TwitterAPI import TwitterAPI
from IPython.display import Image

In [2]:
#Twitter api key
api = TwitterAPI('QNfuwJhdHesrXfpVmYGYx9UYi', '8ZTHg0osCzwOqxvmqwZUNI1KaWu2PI9C4tao29VeEp0UlzlTbH', auth_type = 'oAuth2')

In [3]:
def score(true,pred):
    return (precision_score(true,pred),
            recall_score(true,pred),
            f1_score(true,pred))

In [4]:
def print_score(s):
    print ("""
Precision: {:0.3}
Recall:    {:0.3}
F-SCore:   {:0.3}
""".format(*s))

In [5]:
def searchTwitter(query,feed="search/tweets",api=api,n=4000):
  r = []
  qs = 0
  if len(r)==0:
    r.extend([t for t in api.request("search/tweets",{'q':query,'count':n})])
    qs +=1
  while len(r) < n:
#     print("Querrying twitter for {}. {}/{} gathered.".format(query,len(r),n))
    last = r[-1]['id']
    r.extend([t for t in api.request("search/tweets",{'q':query,'count':n,
                                                        'max_id':last})])
    qs += 1
    if qs > 180:
      time.sleep(840)
      qs = 0
  return r[:n]

In [6]:
def clean_tweet(tweet):
    words = []
    for line in tweet.split():
        line = line.lower()
        # allow hashtags and account handles
        if line == '#dogs' or line == '#cats':
            pass
        elif line.startswith('#') or line.startswith('@'):
            # remove non-alphanumeric characters at the end of handle (if present)
            if line[-1].isalpha():
                #print('\t✅ '+ line)
                words.append(line)
            else:
                #print('\t✅ '+ line[0:-1])
                words.append(line[0:-1])
        # ignore non-alphanumeric values, links, and retweets
        elif not line.isalpha() or line.startswith('http') or line=='RT':
            #print('\t❌ ' + line)
            pass
        else:
            #print('\t✅ '+ line)
            words.append(line)

    return words

In [7]:
def update_dictionary_word_count(dictionary, words):
    for word in words:
        if word in dictionary:
            dictionary[word] = dictionary[word] + 1
        else:
            dictionary[word] = 1

In [8]:
def calc_dict_tweet_weight(tweet, dictionary):
    weight = 0
    for line in tweet.split():
        if line.startswith('#'):
            if line[-1].isalpha():
                if line in dictionary:
                    weight = weight + dictionary[line]
            else:
                if line[0:-1] in dictionary:
                    weight = weight + dictionary[line[0:-1]]
    
    return weight

## Get Tweets

> _**Note:** The code below may take a while (~45 seconds) as it is searching for Tweets_

In [9]:
#Tweets from Twitter
cats = searchTwitter('#cats')
dogs = searchTwitter('#dogs')

In [10]:
#converting json from Twitter into a dataframe
cats_d = pd.read_json(json.dumps(cats))
dogs_d = pd.read_json(json.dumps(dogs))

### #cats

In [11]:
display(cats_d)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
0,,,2018-04-16 22:30:18,"{'hashtags': [{'text': 'CATS', 'indices': [4, ...",,0,False,,986008913060487168,986008913060487168,...,,,,0,False,,"<a href=""http://bufferapp.com"" rel=""nofollow"">...",How #CATS prism can impact clinical decision m...,False,"{'id': 705031848, 'id_str': '705031848', 'name..."
1,,,2018-04-16 22:30:11,"{'hashtags': [{'text': 'instagram', 'indices':...",,0,False,,986008880869249025,986008880869249024,...,,,,73,False,{'created_at': 'Fri Apr 13 04:23:36 +0000 2018...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @patdefranchis: @fryld @Choco_Sandy1 @sarav...,False,"{'id': 842498792, 'id_str': '842498792', 'name..."
2,,,2018-04-16 22:30:08,"{'hashtags': [{'text': 'NationalTaxDay', 'indi...","{'media': [{'id': 985658637409402880, 'id_str'...",0,False,,986008869028749313,986008869028749312,...,,,,36,False,{'created_at': 'Sun Apr 15 23:19:05 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @TheOreoCat: Intaxication 💵 #NationalTaxDay...,False,"{'id': 216507284, 'id_str': '216507284', 'name..."
3,,,2018-04-16 22:30:06,"{'hashtags': [{'text': 'cat', 'indices': [47, ...",,0,False,,986008861436821504,986008861436821504,...,,,,24,False,{'created_at': 'Mon Apr 16 21:11:05 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @MeowMiya88: Pledge=promise to donate for a...,False,"{'id': 17821530, 'id_str': '17821530', 'name':..."
4,,,2018-04-16 22:30:05,"{'hashtags': [{'text': 'cats', 'indices': [21,...",,0,False,,986008857150414850,986008857150414848,...,,,,9,False,{'created_at': 'Mon Apr 16 19:35:29 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @HelpingLostPets: #cats LOST #Providence RI...,False,"{'id': 971314556, 'id_str': '971314556', 'name..."
5,,,2018-04-16 22:29:56,"{'hashtags': [{'text': 'blackcat', 'indices': ...",,0,False,,986008819955392512,986008819955392512,...,,,,0,False,,"<a href=""https://ifttt.com"" rel=""nofollow"">IFT...","Hello human, welcome home! Let's have dinner,...",True,"{'id': 504690841, 'id_str': '504690841', 'name..."
6,,,2018-04-16 22:29:46,"{'hashtags': [{'text': 'cats', 'indices': [21,...","{'media': [{'id': 986008747624554496, 'id_str'...",0,False,,986008776351379456,986008776351379456,...,,,,0,False,,"<a href=""http://twitter.com/download/android"" ...",Adorable vs. Monster #cats #adorables https://...,False,"{'id': 631663116, 'id_str': '631663116', 'name..."
7,,,2018-04-16 22:29:37,"{'hashtags': [{'text': 'goodmorningworld', 'in...",,0,False,,986008740821336065,986008740821336064,...,,,,0,False,,"<a href=""http://instagram.com"" rel=""nofollow"">...",#goodmorningworld🌎 #atarithecat #catsofinstagr...,False,"{'id': 2731304815, 'id_str': '2731304815', 'na..."
8,,,2018-04-16 22:29:36,"{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,,986008736534810624,986008736534810624,...,,,,7,False,{'created_at': 'Mon Apr 16 22:15:48 +0000 2018...,"<a href=""http://twitter.com/download/android"" ...",RT @Catherine_Riche: SHELTON might BE KILLED T...,False,"{'id': 3081602615, 'id_str': '3081602615', 'na..."
9,,,2018-04-16 22:29:35,"{'hashtags': [{'text': 'cats', 'indices': [38,...","{'media': [{'id': 986008325983629312, 'id_str'...",0,False,,986008731593850881,986008731593850880,...,,,,0,False,,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",Fascinating Feline Fact No. 144: When #cats ju...,False,"{'id': 54925436, 'id_str': '54925436', 'name':..."


### #dogs

In [12]:
display(dogs_d)

Unnamed: 0,contributors,coordinates,created_at,entities,extended_entities,favorite_count,favorited,geo,id,id_str,...,quoted_status,quoted_status_id,quoted_status_id_str,retweet_count,retweeted,retweeted_status,source,text,truncated,user
0,,,2018-04-16 22:30:38,"{'hashtags': [{'text': 'Mondays', 'indices': [...","{'media': [{'id': 986008993641381888, 'id_str'...",0,False,,986008995776282624,986008995776282624,...,,,,0,False,,"<a href=""http://twitter.com/download/iphone"" r...",#Mondays are not Eddie’s favorite day. #Monday...,False,"{'id': 944760457, 'id_str': '944760457', 'name..."
1,,,2018-04-16 22:30:32,"{'hashtags': [], 'symbols': [], 'user_mentions...",,0,False,,986008970539216899,986008970539216896,...,,,,2,False,{'created_at': 'Mon Apr 16 01:10:18 +0000 2018...,"<a href=""https://mobile.twitter.com"" rel=""nofo...",RT @PugzillaRex: I got to hang out with this c...,False,"{'id': 228229786, 'id_str': '228229786', 'name..."
2,,,2018-04-16 22:30:21,"{'hashtags': [{'text': 'dogtraining', 'indices...","{'media': [{'id': 986008921188974595, 'id_str'...",0,False,,986008923810414593,986008923810414592,...,,,,0,False,,"<a href=""http://trainmydog.club"" rel=""nofollow...",dog care @trainmydog17 #dogtraining #dogsoftwi...,False,"{'id': 856814715531517952, 'id_str': '85681471..."
3,,"{'type': 'Point', 'coordinates': [-96.55502265...",2018-04-16 22:30:04,"{'hashtags': [{'text': 'dogs', 'indices': [89,...",,0,False,"{'type': 'Point', 'coordinates': [33.02159262,...",986008854751338497,986008854751338496,...,,,,0,False,,"<a href=""http://instagram.com"" rel=""nofollow"">...",Sometimes after a long Monday a girl just has ...,False,"{'id': 103078085, 'id_str': '103078085', 'name..."
4,,,2018-04-16 22:30:04,"{'hashtags': [{'text': 'dogs', 'indices': [31,...",,0,False,,986008852805042182,986008852805042176,...,,,,0,False,,"<a href=""http://inupple.com"" rel=""nofollow"">in...",お昼寝チュ～\nhttps://t.co/YkYvTl4CA0 #dogs,False,"{'id': 475189575, 'id_str': '475189575', 'name..."
5,,,2018-04-16 22:29:58,"{'hashtags': [{'text': 'dogs', 'indices': [0, ...",,0,False,,986008828587130886,986008828587130880,...,,,,0,False,,"<a href=""http://www.helpinglostpets.com"" rel=""...","#dogs FOUND #Parkman,Garrettsville OH,USA, Box...",False,"{'id': 208553458, 'id_str': '208553458', 'name..."
6,,,2018-04-16 22:29:52,"{'hashtags': [{'text': 'Urgente', 'indices': [...",,0,False,,986008802758717440,986008802758717440,...,,,,60,False,{'created_at': 'Mon Apr 16 15:55:12 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @PattyTwitteando: #Urgente #Rescate #Valenc...,False,"{'id': 2327712896, 'id_str': '2327712896', 'na..."
7,,,2018-04-16 22:29:29,"{'hashtags': [{'text': 'NYC', 'indices': [31, ...",,0,False,,986008707296350213,986008707296350208,...,,,,95,False,{'created_at': 'Mon Apr 16 00:06:48 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...",RT @ruthmen: 2BKILLED 04/16/18 #NYC #DOGS 2B #...,False,"{'id': 954387145585778694, 'id_str': '95438714..."
8,,,2018-04-16 22:29:22,"{'hashtags': [{'text': 'HappyBirthday', 'indic...",,0,False,,986008678401748992,986008678401748992,...,,,,0,False,,"<a href=""http://twitter.com/download/android"" ...",https://t.co/WinlLobs1a \n#HappyBirthday #dog ...,True,"{'id': 753726890, 'id_str': '753726890', 'name..."
9,,,2018-04-16 22:29:13,"{'hashtags': [{'text': 'MondayMotivation', 'in...",,0,False,,986008640946520064,986008640946520064,...,,,,18,False,{'created_at': 'Mon Apr 16 16:19:19 +0000 2018...,"<a href=""http://twitter.com"" rel=""nofollow"">Tw...","RT @sylviaselfman: It's Cats, Cupcakes &amp; K...",False,"{'id': 1307988422, 'id_str': '1307988422', 'na..."


## Attempt #1: Word Count

The first attempt will be to count the word frequency in both classes of Tweets. The hope is if there is a distinct pattern of words used for the two Tweet classes.

Properties of the cleaned Tweet include:
* Track the accounts mentioned.
* Track the hashtags (ignoring the one classification tweet).
* Omission of both '#cats' and '#dogs' in the same tweet.
* Omission of non-alphanumeric characters

Note to self: dismissing data is already putting a bias on the results. Who is to say that cat lovers don't like adding special characters?

### #cats word frequency

In [13]:
# count word frequency for '#cats'
index = 0
cats_dict = {}
sliced_tweet = []
while index < 4000:
    sliced_tweet = clean_tweet(cats_d.iloc[index]['text'])
    update_dictionary_word_count(cats_dict, sliced_tweet)
    index = index + 1

In [14]:
# print top words found in '#cats'
popular_cat_words = sorted(cats_dict.values())[::-1][:20]
for key in cats_dict.keys():
    if cats_dict[key] in popular_cat_words:
        print(key + ':' + str(cats_dict[key]))

rt:2831
#catsoftwitter:415
to:926
for:608
a:1187
#cat:543
be:368
are:342
the:979
of:503
and:490
in:421
#kitty:272
on:385
this:358
is:449
cat:289
you:463
i:301
#pets:335


### #dogs word frequency

In [15]:
# count word frequency for '#dogs'
index = 0
dogs_dict = {}
sliced_tweet = []
while index < 4000:    
    # dogs
    sliced_tweet = clean_tweet(dogs_d.iloc[index]['text'])
    update_dictionary_word_count(dogs_dict, sliced_tweet)
    index = index + 1

In [18]:
# print top words found in '#cats'
popular_dog_words = sorted(dogs_dict.values())[::-1][:20]
for key in dogs_dict.keys():
    if dogs_dict[key] in popular_dog_words:
        print(key + ':' + str(dogs_dict[key]))

rt:2689
to:812
with:497
this:447
of:580
and:858
dog:475
#puppy:279
#adopt:253
a:1054
is:744
the:1002
on:348
for:615
in:401
#pets:412
can:304
you:356
@mipooh:302
old:260


In [17]:
index = 0

cats_hit_count = 0
cats_miss_count = 0
dogs_hit_count = 0
dogs_miss_count = 0

while index < 4000:
    # cats
    cats_cats = calc_dict_tweet_weight(cats_d.iloc[index]['text'], cats_dict)
    cats_dogs = calc_dict_tweet_weight(cats_d.iloc[index]['text'], dogs_dict)
    if cats_cats > cats_dogs:
        cats_hit_count = cats_hit_count + 1
    else:
        cats_miss_count = cats_miss_count + 1

    # dogs
    dogs_cats = calc_dict_tweet_weight(dogs_d.iloc[index]['text'], cats_dict)
    dogs_dogs = calc_dict_tweet_weight(dogs_d.iloc[index]['text'], dogs_dict)
    if dogs_dogs > dogs_cats:
        dogs_hit_count = dogs_hit_count + 1
    else:
        dogs_miss_count = dogs_miss_count + 1

    index = index + 1
    
print('cats correct: ' + str(cats_hit_count))
print('cats incorrect: ' + str(cats_miss_count))
print('dogs correct: ' + str(dogs_hit_count))
print('dogs incorrect: ' + str(dogs_miss_count))

cats correct: 2360
cats incorrect: 1640
dogs correct: 2719
dogs incorrect: 1281


As the data shows, it's roughly a 50% prediction rate, which is as good as guessing. 

A better method should be to use the bag-of-words model. This way, machine learning is used to classify the 

## Attempt #2: bag-of-words with Machine Learning

> Will be using [this](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a) article for reference.