# Import Data

First, we are going to import a small subset of the total data for exploratory data analysis. We are only interested in the tweets written in English.

In [1]:
# Import Packages
from collections import Counter

import networkx as nx
import pandas as pd


In [2]:
# Import all data
IRA_csv = pd.read_csv("Data/IRAhandle_tweets_1.csv")

# Build the working frame in one chain so the cell is idempotent on re-run:
#   1. keep only the English tweets (filtering first avoids duplicating the
#      whole CSV in memory just to throw most of the copy away),
#   2. parse publish_date into datetimes (assign works on a new frame, so
#      IRA_csv itself is never modified),
#   3. use publish_date as the index and sort chronologically.
df = (
    IRA_csv.loc[IRA_csv['language'] == "English"]
    .assign(publish_date=lambda frame: pd.to_datetime(frame['publish_date']))
    .set_index('publish_date')
    .sort_index()
)

In [3]:
# Preview the first rows of the English-only frame, now indexed by publish_date
df.head()

Unnamed: 0_level_0,external_author_id,author,content,region,language,harvested_date,following,followers,updates,post_type,account_type,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1
publish_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
2014-11-27 09:59:00,2534421182,ABIGAILSSILK,WHAT`S WRONG WITH THIS WORLD??! It`s time to s...,United States,English,11/27/2014 9:59,158,79,261,RETWEET,Hashtager,1,HashtagGamer,0,2534421182,537908506720886785,http://twitter.com/abigailssilk/statuses/53790...,https://twitter.com/JennaTraveller/status/5379...,,
2014-11-27 17:13:00,2534421182,ABIGAILSSILK,Happy Thanksgiving! Thank you @JennaTraveller...,United States,English,11/27/2014 17:13,158,79,262,RETWEET,Hashtager,1,HashtagGamer,0,2534421182,538017841682083840,http://twitter.com/abigailssilk/statuses/53801...,https://twitter.com/andrewdrew_s/status/538016...,,
2014-11-27 17:21:00,2534421182,ABIGAILSSILK,#WhyImThankful dinner is my favourite part of ...,United States,English,11/27/2014 17:21,158,79,263,RETWEET,Hashtager,1,HashtagGamer,0,2534421182,538019747355701250,http://twitter.com/abigailssilk/statuses/53801...,https://twitter.com/andrewdrew_s/status/538019...,,
2014-11-28 09:06:00,2534421182,ABIGAILSSILK,Excuse me terribly sorry to bother you but I w...,United States,English,11/28/2014 9:06,158,78,265,,Hashtager,0,HashtagGamer,0,2534421182,538257515796893696,http://twitter.com/abigailssilk/statuses/53825...,,,
2014-11-28 09:06:00,2534421182,ABIGAILSSILK,"I ain't shoulder leaning, I ain't snapping and...",United States,English,11/28/2014 9:06,158,78,264,,Hashtager,0,HashtagGamer,0,2534421182,538257494787620865,http://twitter.com/abigailssilk/statuses/53825...,,,


In [4]:
# Rolling mean columns for plotting
# The window is a fixed count of 1000 consecutive rows of df (all accounts
# mixed together), not a span of time. With the default min_periods the first
# 999 rows have no complete window and are therefore NaN.
df['updates1000ma'] = df['updates'].rolling(1000).mean()

In [5]:
# List the distinct account categories that occur in the English-language subset
df['account_category'].unique()

array(['HashtagGamer', 'LeftTroll', 'RightTroll', 'NonEnglish', 'Unknown',
       'Fearmonger', 'NewsFeed', 'Commercial'], dtype=object)

In [6]:
# Build one rolling-mean column per account category, aligned on publish date.
#
# Each category's series is computed on its own rows and all series are then
# combined with an outer join, so no category loses observations. (Joining
# everything onto the first category's index keeps only that category's
# timestamps, and joining on repeated timestamps multiplies rows.)
category_means = {}

for category in df['account_category'].unique():
    print(category)
    # Select just the column that is needed for this category instead of
    # copying the whole frame on every pass.
    category_updates = df.loc[df['account_category'] == category, 'updates']
    rolling_mean = category_updates.rolling(1000).mean()
    # Several tweets can share one publish_date value; average them so every
    # timestamp appears once and the index is unique for alignment.
    category_means[f'{category}_updates1000ma'] = rolling_mean.groupby(level=0).mean()

if category_means:
    graph_df = pd.concat(category_means, axis=1).sort_index()
    # After the outer join a column is NaN wherever only other categories
    # tweeted. Fill those interior gaps by interpolating along time so each
    # category draws as a continuous line; leading and trailing NaNs (before a
    # category's first full window / after its last tweet) are left untouched.
    graph_df = graph_df.interpolate(method='time', limit_area='inside')
else:
    # No categories at all (empty df): keep the original empty-frame result.
    graph_df = pd.DataFrame()

HashtagGamer
LeftTroll
RightTroll
NonEnglish
Unknown
Fearmonger
NewsFeed
Commercial


In [7]:
# Draw every per-category rolling-mean column of graph_df on one set of axes.
# The Axes is kept so the figure can be titled and labelled and stands on its
# own when the notebook is skimmed; the trailing semicolon suppresses the
# object repr that would otherwise be echoed below the cell.
ax = graph_df.plot(figsize=(8, 5))
ax.set(
    title="Rolling mean of `updates` by account category",
    xlabel="Publish date",
    ylabel="updates (1000-row rolling mean)",
);

<matplotlib.axes._subplots.AxesSubplot at 0x116761438>

In [8]:
# Group the tweets by account category
gb = df.groupby("account_category")
# Peek at the first rows of a single category's group
gb.get_group("RightTroll").head()

Unnamed: 0_level_0,external_author_id,author,content,region,language,harvested_date,following,followers,updates,post_type,...,retweet,account_category,new_june_2018,alt_external_id,tweet_id,article_url,tco1_step1,tco2_step1,tco3_step1,updates1000ma
publish_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-02-20 12:19:00,2497991305,AUSTINLOVESBEER,"coffee is my passion…well, at least now",United States,English,2/20/2015 12:19,53,46,9,,...,0,RightTroll,0,2497991305,568746843816853504,http://twitter.com/AustinLovesBeer/statuses/56...,,,,267.564
2015-02-20 12:22:00,2497991305,AUSTINLOVESBEER,This is how music should make you feel. I’m in...,United States,English,2/20/2015 12:22,53,46,13,,...,0,RightTroll,0,2497991305,568747427374747648,http://twitter.com/AustinLovesBeer/statuses/56...,,,,267.541
2015-02-20 12:22:00,2497991305,AUSTINLOVESBEER,Love is when the other person's happiness is m...,United States,English,2/20/2015 12:22,53,46,15,,...,0,RightTroll,0,2497991305,568747570207596545,http://twitter.com/AustinLovesBeer/statuses/56...,,,,267.519
2015-02-20 12:22:00,2497991305,AUSTINLOVESBEER,"If God had really intended men to fly, he'd ma...",United States,English,2/20/2015 12:22,53,46,13,,...,0,RightTroll,0,2497991305,568747540738379776,http://twitter.com/AustinLovesBeer/statuses/56...,,,,267.494
2015-02-20 12:23:00,2497991305,AUSTINLOVESBEER,They say everything happens for a reason. Some...,United States,English,2/20/2015 12:24,53,46,16,,...,0,RightTroll,0,2497991305,568747709152276481,http://twitter.com/AustinLovesBeer/statuses/56...,,,,267.467


In [13]:
# Summary statistics for the numeric columns of df. Identifier columns
# (external_author_id, alt_external_id, tweet_id) are stored as numbers, so
# they are summarised too, but their mean/std are not meaningful quantities.
df.describe()

Unnamed: 0,external_author_id,following,followers,updates,retweet,new_june_2018,alt_external_id,tweet_id,updates1000ma
count,190252.0,190252.0,190252.0,190252.0,190252.0,190252.0,190252.0,190252.0,189253.0
mean,2.711996e+17,2336.039926,2649.535658,6664.817894,0.557061,0.104256,2.712156e+17,7.980574e+17,6693.132338
std,4.090159e+17,3994.378085,5264.29198,8963.273711,0.496735,0.305594,4.090404e+17,9.539194e+16,5625.911033
min,34976400.0,0.0,0.0,1.0,0.0,0.0,34976400.0,5.379085e+17,196.662
25%,1679279000.0,77.0,133.0,886.0,0.0,0.0,1679279000.0,7.445466e+17,1854.023
50%,2570250000.0,1007.0,744.0,2956.0,1.0,0.0,2570250000.0,8.160522e+17,5436.794
75%,8.91e+17,2335.0,2466.0,8649.0,1.0,0.0,8.912025e+17,8.946032e+17,10020.101
max,9.06e+17,21843.0,23890.0,69979.0,1.0,1.0,9.058747e+17,9.766389e+17,24378.556


In [17]:
# Column-wise maximum within each (account_category, author) group. Every
# column's maximum is taken independently, so a row of accounts_max does not
# correspond to any single tweet.
# NOTE(review): df also contains text columns with missing values; newer pandas
# releases may raise on .max() over those instead of skipping them — confirm
# against the pandas version in use.
accounts_max = df.groupby(['account_category', 'author']).max()

In [19]:
# Summary statistics of the per-group maxima (one row per account_category/author pair)
accounts_max.describe()

Unnamed: 0,alt_external_id,external_author_id,followers,following,new_june_2018,retweet,tweet_id,updates,updates1000ma
count,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0,249.0
mean,3.024771e+17,3.025013e+17,705.257028,981.393574,0.285141,0.823293,7.740919e+17,2263.919679,8041.356811
std,4.179341e+17,4.17968e+17,2273.605614,1991.073836,0.452391,0.382189,1.074534e+17,5652.591184,6458.641149
min,34976400.0,34976400.0,0.0,0.0,0.0,0.0,5.985325e+17,3.0,491.221
25%,2176657000.0,2176657000.0,51.0,60.0,0.0,1.0,6.717084e+17,336.0,2720.573
50%,2592433000.0,2592433000.0,135.0,284.0,0.0,1.0,7.459857e+17,630.0,6335.768
75%,8.886125e+17,8.89e+17,448.0,1026.0,1.0,1.0,8.920644e+17,2526.0,10726.432
max,9.058747e+17,9.06e+17,23890.0,21843.0,1.0,1.0,9.766389e+17,69979.0,24378.556


In [22]:
# Pairwise Pearson correlation between the numeric columns of df.
# The numeric columns are selected explicitly because df also holds text
# columns: older pandas dropped those silently inside corr(), newer releases
# raise on them. Selecting first gives the same matrix on either.
# Note that identifier columns (external_author_id, alt_external_id, tweet_id)
# are numeric and therefore included, although their correlations carry little
# meaning.
df.select_dtypes(include='number').corr()

Unnamed: 0,external_author_id,following,followers,updates,retweet,new_june_2018,alt_external_id,tweet_id,updates1000ma
external_author_id,1.0,-0.217678,-0.299906,-0.40213,-0.653855,-0.103442,1.0,0.660903,-0.531791
following,-0.217678,1.0,0.94257,0.153299,-0.049104,0.542493,-0.217665,-0.124083,0.046523
followers,-0.299906,0.94257,1.0,0.190446,-0.02921,0.464374,-0.299907,-0.155571,0.109666
updates,-0.40213,0.153299,0.190446,1.0,0.368913,-0.075519,-0.402123,0.012039,0.612405
retweet,-0.653855,-0.049104,-0.02921,0.368913,1.0,0.012801,-0.65386,-0.315213,0.454864
new_june_2018,-0.103442,0.542493,0.464374,-0.075519,0.012801,1.0,-0.103448,-0.172069,0.004372
alt_external_id,1.0,-0.217665,-0.299907,-0.402123,-0.65386,-0.103448,1.0,0.660904,-0.531806
tweet_id,0.660903,-0.124083,-0.155571,0.012039,-0.315213,-0.172069,0.660904,1.0,0.013792
updates1000ma,-0.531791,0.046523,0.109666,0.612405,0.454864,0.004372,-0.531806,0.013792,1.0


## Tweets by Category

In [None]:
# Define count_entries()
def count_entries(df, *args):
    """Count how often each value occurs in the named columns of ``df``.

    Values from all requested columns are tallied into one shared
    dictionary, so a value that appears in two columns contributes to the
    same key.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame the columns are read from (accessed as ``df[col_name]``).
    *args : str
        Names of the columns to count. When no name is given the result is
        an empty dictionary.

    Returns
    -------
    dict
        Maps each distinct entry to its number of occurrences; keys are in
        order of first appearance.

    Raises
    ------
    KeyError
        Propagated from ``df[col_name]`` when a requested column is missing.
    """
    # Counter does the "add 1 or start at 1" bookkeeping for us.
    cols_count = Counter()

    for col_name in args:
        # iter() makes Counter treat the column strictly as a sequence of
        # entries to tally, exactly like looping over it by hand.
        cols_count.update(iter(df[col_name]))

    # Hand back a plain dict, as callers of this function expect.
    return dict(cols_count)

# Call count_entries() on the account_category column: cat_count
cat_count = count_entries(df, 'account_category')

# Show the tally (account category -> number of rows in df)
cat_count

# Basic Visualizations

Not all of the tweets in this dataset are in English. For the purposes of this project, we are only interested in the English tweets.

# Network Analysis

We are interested in the structure of the social network within which these accounts reside. 

Degree, centrality, pagerank?

# Key Influencers 

We can see that there are a few accounts which stand out as being influential. What are their common characteristics?

# Credibility Score 



# Sentiment Analysis

Previous studies have shown that influential Twitter accounts tend to have a distinct sentiment.

# Things to Investigate / Incorporate

    Strong positive correlation between the followers and following counts (r ≈ 0.94).
    
    There is a hashtagger account named "Andy Hashtagger"