# TwitterMining

Goal: Predict the animal classification of tweets (cat or dog)

## Boilerplate code

In [1]:
#import modules
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn.feature_extraction import text as sk_fe_text
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, f1_score, recall_score
from TwitterAPI import TwitterAPI
from IPython.display import Image

In [2]:
#Twitter api key
api = TwitterAPI('QNfuwJhdHesrXfpVmYGYx9UYi', '8ZTHg0osCzwOqxvmqwZUNI1KaWu2PI9C4tao29VeEp0UlzlTbH', auth_type = 'oAuth2')

In [3]:
def score(true,pred):
    return (precision_score(true,pred),
            recall_score(true,pred),
            f1_score(true,pred))

In [4]:
def print_score(s):
    print ("""
Precision: {:0.3}
Recall:    {:0.3}
F-SCore:   {:0.3}
""".format(*s))

In [5]:
def searchTwitter(query,feed="search/tweets",api=api,n=4000):
  r = []
  qs = 0
  if len(r)==0:
    r.extend([t for t in api.request("search/tweets",{'q':query,'count':n})])
    qs +=1
  while len(r) < n:
#     print("Querrying twitter for {}. {}/{} gathered.".format(query,len(r),n))
    last = r[-1]['id']
    r.extend([t for t in api.request("search/tweets",{'q':query,'count':n,
                                                        'max_id':last})])
    qs += 1
    if qs > 180:
      time.sleep(840)
      qs = 0
  return r[:n]

In [6]:
def clean_tweet(tweet):
    words = []
    for line in tweet.split():
        line = line.lower()
        # allow hashtags and account handles
        if line == '#dogs' or line == '#cats':
            pass
        elif line.startswith('#') or line.startswith('@'):
            # remove non-alphanumeric characters at the end of handle (if present)
            if line[-1].isalpha():
                #print('\t✅ '+ line)
                words.append(line)
            else:
                #print('\t✅ '+ line[0:-1])
                words.append(line[0:-1])
        # ignore non-alphanumeric values, links, and retweets
        elif not line.isalpha() or line.startswith('http') or line=='RT':
            #print('\t❌ ' + line)
            pass
        else:
            #print('\t✅ '+ line)
            words.append(line)

    return words

In [7]:
def update_dictionary_word_count(dictionary, words):
    for word in words:
        if word in dictionary:
            dictionary[word] = dictionary[word] + 1
        else:
            dictionary[word] = 1

In [8]:
def calc_dict_tweet_weight(tweet, dictionary):
    weight = 0
    for line in tweet.split():
        if line.startswith('#'):
            if line[-1].isalpha():
                if line in dictionary:
                    weight = weight + dictionary[line]
            else:
                if line[0:-1] in dictionary:
                    weight = weight + dictionary[line[0:-1]]
    
    return weight

## Get Tweets

> _**Note:** The code below may take a while (~45 seconds) as it is searching for Tweets_

In [10]:
#Tweets from Twitter
cats = searchTwitter('#cats')
dogs = searchTwitter('#dogs')

In [11]:
#converting json from Twitter into a dataframe
cats_d = pd.read_json(json.dumps(cats))
dogs_d = pd.read_json(json.dumps(dogs))

### #cats

In [None]:
#display(cats_d)

### #dogs

In [None]:
#display(dogs_d)

## Attempt #1: Word Count

The first attempt will be to count the word frequency in both classes of Tweets. The hope is if there is a distinct pattern of words used for the two Tweet classes.

Properties of the cleaned Tweet include:
* Track the accounts mentioned.
* Track the hashtags (ignoring the one classification tweet).
* Omission of both '#cats' and '#dogs' in the same tweet.
* Omission of non-alphanumeric characters

Note to self: dismissing data is already putting a bias on the results. Who is to say that cat lovers don't like adding special characters?

### #cats word frequency

In [None]:
# count word frequency for '#cats'
index = 0
cats_dict = {}
sliced_tweet = []
while index < 4000:
    sliced_tweet = clean_tweet(cats_d.iloc[index]['text'])
    update_dictionary_word_count(cats_dict, sliced_tweet)
    index = index + 1

In [None]:
# print top words found in '#cats'
popular_cat_words = sorted(cats_dict.values())[::-1][:20]
for key in cats_dict.keys():
    if cats_dict[key] in popular_cat_words:
        print(key + ':' + str(cats_dict[key]))

### #dogs word frequency

In [None]:
# count word frequency for '#dogs'
index = 0
dogs_dict = {}
sliced_tweet = []
while index < 4000:    
    # dogs
    sliced_tweet = clean_tweet(dogs_d.iloc[index]['text'])
    update_dictionary_word_count(dogs_dict, sliced_tweet)
    index = index + 1

In [None]:
# print top words found in '#cats'
popular_dog_words = sorted(dogs_dict.values())[::-1][:20]
for key in dogs_dict.keys():
    if dogs_dict[key] in popular_dog_words:
        print(key + ':' + str(dogs_dict[key]))

In [None]:
index = 0

cats_hit_count = 0
cats_miss_count = 0
dogs_hit_count = 0
dogs_miss_count = 0

while index < 4000:
    # cats
    cats_cats = calc_dict_tweet_weight(cats_d.iloc[index]['text'], cats_dict)
    cats_dogs = calc_dict_tweet_weight(cats_d.iloc[index]['text'], dogs_dict)
    if cats_cats > cats_dogs:
        cats_hit_count = cats_hit_count + 1
    else:
        cats_miss_count = cats_miss_count + 1

    # dogs
    dogs_cats = calc_dict_tweet_weight(dogs_d.iloc[index]['text'], cats_dict)
    dogs_dogs = calc_dict_tweet_weight(dogs_d.iloc[index]['text'], dogs_dict)
    if dogs_dogs > dogs_cats:
        dogs_hit_count = dogs_hit_count + 1
    else:
        dogs_miss_count = dogs_miss_count + 1

    index = index + 1
    
print('cats correct: ' + str(cats_hit_count))
print('cats incorrect: ' + str(cats_miss_count))
print('dogs correct: ' + str(dogs_hit_count))
print('dogs incorrect: ' + str(dogs_miss_count))

As the data shows, it's roughly a 50% prediction rate, which is as good as guessing. 

A better method should be to use the bag-of-words model. This way, machine learning is used to classify the 

## Attempt #2: bag-of-words with Machine Learning

> Will be using [this](https://towardsdatascience.com/machine-learning-nlp-text-classification-using-scikit-learn-python-and-nltk-c52b92a7c73a) article for reference.

In [12]:
#Get text only and replace  hashtags with blanks
cats_text = [x.replace("#cats", "") for x in cats_d['text']]
dogs_text = [x.replace("#dogs", "") for x in dogs_d['text']]

In [13]:
#Find the number of times #dogs appear in the cats data
blob = [x.find("#dogs") for x in cats_text]
type(blob)
df1 = pd.DataFrame(blob)
#df1.stack().value_counts()

In [14]:
#Create features and return sparse metrics 
vectorizer = sk_fe_text.CountVectorizer(cats_text+dogs_text)
vectorizer.fit(cats_text+dogs_text)
cats_tdm = vectorizer.transform(cats_text).toarray()
dogs_tdm = vectorizer.transform(dogs_text).toarray()

In [15]:
#Create visible matrices, combine and add the number of times #dogs appear in cats_text to the length of dog_text and subtract
#the same number from the length of cats_text
zeros = np.zeros((len(cats_text) - 185, 1))
ones = np.ones((len(dogs_text) + 185, 1))
catsdogs = np.concatenate((cats_tdm,dogs_tdm),axis=0)
y = np.ravel(np.concatenate((zeros,ones),axis=0))

In [16]:
#Create train/test split for modeling
trainX,testX,trainY,testY = train_test_split(catsdogs,y,test_size=.20)

In [17]:
#Naive Bayes
from sklearn.naive_bayes import GaussianNB
nb = GaussianNB()
nb.fit(trainX,trainY)

GaussianNB(priors=None)

In [18]:
#Naive Bayes cont'd
print("\n\nNaive Bayes Performance")
s = score(testY,nb.predict(testX))
print_score(s)



Naive Bayes Performance

Precision: 0.939
Recall:    0.755
F-SCore:   0.837



In [19]:
#Neural Network
from sklearn.neural_network import MLPClassifier
nn = MLPClassifier()
nn.fit(trainX,trainY)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [20]:
#Neural Network cont'd
print("\n\nNeural Network Performance")
s = score(testY,nn.predict(testX))
print_score(s)



Neural Network Performance

Precision: 0.908
Recall:    0.928
F-SCore:   0.918

