In [1]:
import json
import time
import math
import os
from collections import defaultdict
from statistics import mean, stdev
from csv import DictWriter
from operator import itemgetter, attrgetter

from searchtweets import ResultStream, gen_rule_payload, load_credentials, collect_results
import pandas as pd
import pycld2 as cld2
from textblob import TextBlob
from transformers import pipeline
import warnings
warnings.filterwarnings('ignore')

TWITTER_RESULTS_FIELD_NAMES = [
    "user_tweets_count", 
    "user_friends_count", 
    "user_followers_count", 
    "retweet_count", 
    "reply_count", 
    "quote_count", 
    "favorite_count", 
    "family",
]
TWITTER_DUMP_FIELD_NAMES = [
    "family",
    "serialized_tweets",
]

FAMILY_NAMES = "data/rainford_family_key.txt"
CLEANED_FAMILIES = "data/families.txt"
TWITTER_RESULTS = "data/twitter_results.csv"
TWITTER_DUMP = "data/twitter_results_tweet_dump.csv"

In [5]:
# read in family names file and clean up incomplete lines
families = []
incomplete = []
not_idae = []
with open(FAMILY_NAMES, 'r') as file:
    for line in file:
        names = line.split()
        if len(names) < 3:
            incomplete.append(names)
        family = names[1]
        if not family.endswith('idae'):
            not_idae.append(family)
        families.append(family)
not_idae

['Archaeognatha',
 'Projapygoidea',
 'Grylloblat',
 'Aleyrodoidea',
 'Aphidoidea',
 'Coccoidea',
 'Phylloxeroidea',
 'Psylloidea',
 'Mantophasm',
 'Strepsipte',
 'Zoraptera']

In [138]:
# Clean out bad data using wrong family name and remove dupe
incorrect = [pair[1] for pair in incomplete]
incorrect.remove('Zoraptera')
dump = pd.read_csv(TWITTER_DUMP)
corrected_dump = dump[~dump.family.isin(incorrect)]
corrected_dump = corrected_dump.drop_duplicates()
len(corrected_dump)
corrected_dump.to_csv("corrected_twitter_results_tweet_dump.csv")


In [65]:
# Stream in a row of tweet data
def add_row(row, header):
    with open(TWITTER_RESULTS, 'a') as write_obj:
        writer = DictWriter(write_obj, fieldnames=TWITTER_RESULTS_FIELD_NAMES)
        if header:
            writer.writeheader()
        writer.writerow(row)

# Stream in a row of serialized raw tweet data
def add_dump_row(row, header):
    with open(TWITTER_DUMP, 'a') as write_obj:
        writer = DictWriter(write_obj, fieldnames=TWITTER_DUMP_FIELD_NAMES)
        if header:
            writer.writeheader()
        writer.writerow(row)

In [71]:
# Clean up tweets and the serialized raw tweets
os.remove(TWITTER_RESULTS)
os.remove(TWITTER_DUMP)

In [6]:
# Aggregate total tweet count using the raw tweet results
family_tweet_count = {}

dump = pd.read_csv(CORRECTED_TWITTER_DUMP)
for index, row in dump.iterrows():
    tweets = json.loads(row.serialized_tweets)
    family_tweet_count[row.family] = len(tweets)
family_tweet_count

In [136]:
# Copy over data from raw tweets to the actual tweets csv
corrected_results = pd.read_csv(CORRECTED_TWITTER_RESULTS)
corrected_results['tweet_count'] = corrected_results['family'].map(family_tweet_count)
corrected_results

In [2]:
# Calculate a tweet's sentiment using transformers pipeline
nlp_sentiment = pipeline("sentiment-analysis")
def get_tf_sentiment_polarity(text):
    tf = nlp_sentiment(text)[0]
    score = tf['score']
    if tf['label'] == 'NEGATIVE':
        score *= -1
    return score
        

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [5]:
# Get sentiment data for each tweet, only using tweets that have been identified as English using the cld2 library
# Also get counts for how many tweets in each country and language
family_tweet_sentiments = {}
tweet_tf_sentiments = []
tweet_sentiments = []
language_counts = defaultdict(int)
place_counts = defaultdict(int)

dump = pd.read_csv(TWITTER_DUMP)
tweets_with_score_and_family = []
for index, row in dump.iterrows():
    print('Starting index {} / {}'.format(index, len(dump)))
    tweets = json.loads(row.serialized_tweets)
    tweet_polarities = []
    tweet_tf_polarities = []
    for tweet in tweets:
        place = tweet['place']
#         if place is not None:
#             place_counts[place['country_code']] += 1
#         else:
#             place_counts['no_location'] += 1
        text = tweet['text']
        if tweet['truncated']:
            text = tweet['extended_tweet']['full_text']
        is_reliable, text_bytes_found, language_details = cld2.detect(text)
        
        if is_reliable:
            language = language_details[0][1]
            language_counts[language] += 1
            if language == 'en':
                # calculate both the textblob sentiment as well as the sentiment from transformers pipeline
                # for comparison
                text_blob = TextBlob(text)
                polarity = text_blob.sentiment.polarity
                tf_polarity = get_tf_sentiment_polarity(text)
                data = {'text': text, 'transformers_score': tf_polarity, 'textblob_score': polarity,'family': row.family}
                tweets_with_score_and_family.append(data)
#                 tweet_tf_polarities.append(tf_polarity)
#                 tweet_tf_sentiments.append({'text': text, 'score': tf_polarity})
#                 tweet_sentiments.append({'text': text, 'score': polarity})
#                 tweet_polarities.append(polarity)
        else:
            language_counts['could_not_detect_language'] += 1
#     if not tweet_polarities:
#         continue
    # get mean and standard deviation of tweet polarities for all tweets in a family
#     num_tweets_with_sentiment = len(tweet_polarities)
#     mean_polarity = mean(tweet_polarities)
#     tf_mean_polarity = mean(tweet_tf_polarities)
#     standard_deviation_polarity = 0 if len(tweet_polarities) < 2 else stdev(tweet_polarities)
#     tf_sd_polarity = 0 if len(tweet_tf_polarities) < 2 else stdev(tweet_tf_polarities)
#     family_tweet_sentiments[row.family] = (mean_polarity, standard_deviation_polarity, num_tweets_with_sentiment, tf_mean_polarity, tf_sd_polarity)

tweets_with_score_and_family_df = pd.DataFrame(tweets_with_score_and_family)
tweets_with_score_and_family_df.to_csv('tweets_with_score_and_family.csv')
# language_counts = [{'count': count, 'language': language} for language, count in language_counts.items()]
# language_counts = sorted(language_counts, key=itemgetter('count'), reverse=True)
# language_counts_df = pd.DataFrame(language_counts)
# language_counts_df

# place_counts = [{'count': count, 'location': location} for location, count in place_counts.items()]
# place_counts = sorted(place_counts, key=itemgetter('count'), reverse=True)
# place_counts_df = pd.DataFrame(place_counts)
# Save results to CSV
# place_counts_df.to_csv('tweet_location_counts.csv')
# language_counts_df.to_csv('tweet_language_counts.csv')


Starting index 0 / 875
Starting index 1 / 875
Starting index 2 / 875
Starting index 3 / 875
Starting index 4 / 875
Starting index 5 / 875
Starting index 6 / 875
Starting index 7 / 875
Starting index 8 / 875
Starting index 9 / 875
Starting index 10 / 875
Starting index 11 / 875
Starting index 12 / 875
Starting index 13 / 875
Starting index 14 / 875
Starting index 15 / 875
Starting index 16 / 875
Starting index 17 / 875
Starting index 18 / 875
Starting index 19 / 875
Starting index 20 / 875
Starting index 21 / 875
Starting index 22 / 875
Starting index 23 / 875
Starting index 24 / 875
Starting index 25 / 875
Starting index 26 / 875
Starting index 27 / 875
Starting index 28 / 875
Starting index 29 / 875
Starting index 30 / 875
Starting index 31 / 875
Starting index 32 / 875
Starting index 33 / 875
Starting index 34 / 875
Starting index 35 / 875
Starting index 36 / 875
Starting index 37 / 875
Starting index 38 / 875
Starting index 39 / 875
Starting index 40 / 875
Starting index 41 / 875
St

In [14]:
# Get top ten and bottom ten tweet text by sentiment
ordered_tweet_tf_sentiments = sorted(tweet_tf_sentiments, key=itemgetter('score'))
tf_sentiment_scores = [sentiment['score'] for sentiment in tweet_tf_sentiments]
tf_overall_mean = mean(tf_sentiment_scores)
tf_overall_sd = stdev(tf_sentiment_scores)
tf_bottom_ten = ordered_tweet_tf_sentiments[:20]
len(tf_sentiment_scores)
# tf_top_ten = ordered_tweet_tf_sentiments[-20:]
# tf_tweets = tf_bottom_ten + tf_top_ten
# tf_tweets = [{**original, 'method': 'transformers_pipeline'} for original in tf_tweets]
# tf_tweets

# ordered_tweet_sentiments = sorted(tweet_sentiments, key=itemgetter('score'))
# bottom_ten = ordered_tweet_sentiments[:20]
# top_ten = ordered_tweet_sentiments[-20:]
# textblob_tweets = bottom_ten + top_ten
# textblob_tweets = [{**original, 'method': 'textblob'} for original in textblob_tweets]
# textblob_tweets
# df = pd.DataFrame(tf_tweets + textblob_tweets)
# df
# df.to_csv("tweet_sentiments_by_method.csv")

StatisticsError: mean requires at least one data point

In [29]:
# Calculate the sentiment mean and standard deviation using textblob's results 
# as well as transformer pipeline's results
def get_family_sentiment_mean(family):
    if family not in family_tweet_sentiments:
        return math.nan
    polarity_mean, polarity_sd, num_tweets, tf_mean_polarity, tf_sd_polarity = family_tweet_sentiments[family]
    return polarity_mean

def get_family_sentiment_sd(family):
    if family not in family_tweet_sentiments:
        return math.nan
    polarity_mean, polarity_sd, num_tweets, tf_mean_polarity, tf_sd_polarity = family_tweet_sentiments[family]
    return polarity_sd

def get_family_sentiment_num_tweets(family):
    if family not in family_tweet_sentiments:
        return 0
    polarity_mean, polarity_sd, num_tweets, tf_mean_polarity, tf_sd_polarity = family_tweet_sentiments[family]
    return num_tweets

def get_tf_family_sentiment_mean(family):
    if family not in family_tweet_sentiments:
        return math.nan
    polarity_mean, polarity_sd, num_tweets, tf_mean_polarity, tf_sd_polarity = family_tweet_sentiments[family]
    return tf_mean_polarity

def get_tf_family_sentiment_sd(family):
    if family not in family_tweet_sentiments:
        return math.nan
    polarity_mean, polarity_sd, num_tweets, tf_mean_polarity, tf_sd_polarity = family_tweet_sentiments[family]
    return tf_sd_polarity

In [52]:
# Add sentiment results from the raw tweets csv to family results csv
results = pd.read_csv(TWITTER_RESULTS)
results['num_tweets_with_sentiment'] = results['family'].map(get_family_sentiment_num_tweets)
results['mean_sentiment_polarity'] = results['family'].map(get_family_sentiment_mean)
results['standard_deviation_sentiment_polarity'] = results['family'].map(get_family_sentiment_sd)
results['sentiment_analysis_transformer_mean'] = results['family'].map(get_tf_family_sentiment_mean)
results['sentiment_analysis_transformer_sd'] = results['family'].map(get_tf_family_sentiment_sd)
results
# results.to_csv("twitter_results_with_transformers_pipeline.csv")
# results.to_csv("twitter_results_with_sentiment.csv")

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,user_tweets_count,user_friends_count,user_followers_count,retweet_count,reply_count,quote_count,favorite_count,family,tweet_count,num_tweets_with_sentiment,mean_sentiment_polarity,standard_deviation_sentiment_polarity,sentiment_analysis_transformer_mean,sentiment_analysis_transformer_sd
0,0,0,0,7196,1048,21575,2,0,0,26,Archaeognatha,2,1,0.000000,0.000000,-0.930340,0.000000
1,1,1,1,151469,6487,29757,1,9,1,32,Blaberidae,15,3,0.053333,0.092376,-0.989438,0.007005
2,2,2,2,0,0,0,0,0,0,0,Ectobiidae,0,0,,,,
3,3,3,3,375846,7042,8119,1,4,0,10,Blattidae,12,5,0.125000,0.233184,-0.586738,0.827855
4,4,4,4,30854,1232,255,0,0,0,0,Cryptocercidae,1,0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
869,869,869,872,34405,598,46471,0,1,0,5,Lepismatidae,1,0,,,,
870,870,870,873,7198,2,136,0,0,0,0,Nicoletiidae,1,1,0.136364,0.000000,-0.972827,0.000000
871,871,871,874,1825,127,62,0,1,0,1,Grylloblattidae,2,0,,,,
872,872,872,875,259175,3180,9223,2,5,0,16,Mantophasmatodea,6,1,0.000000,0.000000,-0.973013,0.000000


In [3]:
# Get the list of families into memory
families = []
with open(CLEANED_FAMILIES, 'r') as file:
    for line in file:
        families.append(line.split()[0])
families

In [2]:
# Fetch tweets from Twitter using their API and save to CSV
premium_search_args = load_credentials("credentials.yaml",
                                       yaml_key="search_tweets_api",
                                       env_overwrite=False)

header = False
for family in families:
    rule = gen_rule_payload("{family} -is:retweet".format(family=family), results_per_call=500)
    rs = ResultStream(rule_payload=rule,
                      max_results=500,
                      max_pages=0,
                      **premium_search_args)
    user_tweets_count = 0
    user_friends_count = 0
    user_followers_count = 0
    retweet_count = 0
    reply_count = 0
    quote_count = 0
    favorite_count = 0
    serialized_tweets = []
    for tweet in rs.stream():
        serialized_tweets.append(tweet)
        user_tweets_count += tweet['user']['statuses_count']
        user_friends_count += tweet.following_count
        user_followers_count += tweet.follower_count
        retweet_count += tweet.retweet_count
        reply_count += tweet['reply_count']
        quote_count += tweet.quote_count
        favorite_count += tweet.favorite_count
    row = {
        "user_tweets_count": user_tweets_count,
        "user_friends_count": user_friends_count,
        "user_followers_count": user_followers_count,
        "retweet_count": retweet_count,
        "reply_count": reply_count,
        "quote_count": quote_count,
        "favorite_count": favorite_count,
        "family": family
    }
    dump_row = {
        "serialized_tweets": json.dumps(serialized_tweets),
        "family": family
    }
    add_row(row, header=header)
    add_dump_row(dump_row, header=header)
    header = False 
    time.sleep(2)