In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import re
import json
import datetime
import copy

In [None]:
%matplotlib inline

In [None]:
# tweet datasets
# later cells read the 'content', 'date' and 'retweets' columns of this frame
rdt_tweets = pd.read_csv("data/realdonaldtrump.csv")

# articles datasets (must convert .7z into csv https://www.ezyzip.com/convert-7z-to-csv.html)   
# later cells read the 'content', 'publication' and 'date' columns of these frames
articles1 = pd.read_csv("data/articles1.csv")
articles2 = pd.read_csv("data/articles2.csv")
articles3 = pd.read_csv("data/articles3.csv")

In [None]:
def keyword_search(keyword, df, column):
    """Return the rows of ``df`` whose ``column`` text contains ``keyword``.

    The match is a case-insensitive plain substring test (no regex).

    Args:
        keyword: text to look for.
        df: DataFrame to search.
        column: name of the column holding the text.

    Returns:
        A new DataFrame with the matching rows, keeping their original index
        labels, columns and dtypes. When nothing matches the result is empty
        but still has the columns of ``df``.
    """
    needle = keyword.upper()
    # Non-string cells (e.g. NaN for missing text) can never match; skipping
    # them avoids calling .upper() on a float.
    mask = [isinstance(text, str) and needle in text.upper() for text in df[column]]
    # A single boolean selection replaces the row-by-row DataFrame.append,
    # which was quadratic and is no longer available in pandas 2.x.
    return df.loc[mask].copy()

def sw_to_df(file_name):
    """Convert a storywrangling JSON export into a DataFrame.

    The file must hold an object with a ``"data"`` mapping whose values are
    lists of points; the first element of each point is stored as ``date``
    and the second as ``rank``.

    Args:
        file_name: path of the JSON file to read.

    Returns:
        DataFrame with columns ``date`` (datetime), ``frequency`` and
        ``rank``, one row per point. ``frequency`` is never filled from the
        file and is all-NaN; it is kept only because callers in this notebook
        reference that column name.
    """
    with open(file_name, "rt") as f:
        data = json.load(f)

    # Collect every point first and build the frame once; the previous
    # row-by-row DataFrame.append was quadratic and is gone in pandas 2.x.
    records = [
        {"date": point[0], "rank": point[1]}
        for series in data["data"].values()
        for point in series
    ]

    # Passing `columns` fixes the layout even when there are no points, and
    # leaves the unfilled "frequency" column as NaN.
    df = pd.DataFrame(records, columns=["date", "frequency", "rank"])
    df["date"] = pd.to_datetime(df["date"])

    return df


def gt_to_df(filename):
    """Load a google trends CSV as a two-column DataFrame.

    After ``reset_index`` the first column is renamed ``date`` and the second
    ``search_popularity``. The row labelled 0 is discarded (presumably the
    export's own header row -- confirm against the CSV files), dates are
    parsed, popularity is cast to int32 and the index is renumbered from 0.
    """
    raw = pd.read_csv(filename).reset_index()

    date_col, popularity_col = raw.columns[0], raw.columns[1]
    trends = raw.rename(columns={date_col: "date", popularity_col: "search_popularity"})

    # remove the first row before any type conversion is attempted
    trends = trends.drop([0])
    trends["date"] = pd.to_datetime(trends["date"])
    trends["search_popularity"] = trends["search_popularity"].astype('int32')

    return trends.reset_index(drop=True)

def get_df_by_daterange(dataframe, start_date, end_date):
    """Keep only the rows whose ``date`` lies in [start_date, end_date].

    Both bounds are inclusive and are compared directly against the ``date``
    column. date format: YYYY-MM-DD

    Returns a new frame with a fresh 0..n-1 index; the input is not modified.
    """
    dates = dataframe["date"]
    not_before_start = dates >= start_date
    not_after_end = dates <= end_date
    selected = dataframe.loc[not_before_start & not_after_end]
    return selected.reset_index(drop=True)


def group_days_by_week(dataframe, agg):
    """Return a copy of ``dataframe`` with its ``date`` column moved back 7 days.

    Args:
        dataframe: frame with a ``date`` column that ``pd.to_datetime`` accepts.
        agg: name of the column summed in the weekly grouping below.

    Returns:
        The date-shifted copy, still one row per input row.

    NOTE(review): the weekly sums are computed into ``dfe`` but ``df`` is what
    is returned, so no grouping reaches the caller. Callers in this notebook
    plot the ``rank`` column of the result, which ``dfe`` would not contain --
    confirm the intended output before changing the return value.
    """
    df = copy.copy(dataframe)
    # parse the dates and shift every row one week earlier
    df['date'] = pd.to_datetime(df['date']) - pd.to_timedelta(7, unit='d')
    # weekly (freq='W-MON') sum of the `agg` column, sorted by date; currently unused
    dfe = df.groupby(pd.Grouper(key='date', freq='W-MON'))[agg].sum().reset_index().sort_values('date')
    return df

In [None]:
# "Crooked Hillary" Term Analysis
# tweets whose 'content' contains the phrase (case-insensitive), plus the google trends series
ch_tweets = keyword_search("Crooked Hillary", rdt_tweets, 'content')
ch_gt = gt_to_df("crooked_hillary_google_trends.csv")

# gets Trump's first tweet containing "Crooked Hillary"
# (ascending sort on 'date', then the first row)
first_ch_tweet = ch_tweets.sort_values("date").iloc[0] 

# shortens date range of google trends data
ch_2016_gt = get_df_by_daterange(ch_gt, "2016-01-01", "2017-01-01")

# plots Trump's first tweet of "Crooked Hillary" with google trends data around same time
ch_2016_gt.plot(x="date", y="search_popularity", legend=None)
# NOTE(review): axvline's ymin/ymax are axes fractions (0-1), so ymax=100 is out of range -- confirm
plt.axvline(x=first_ch_tweet.date, ymin=0, ymax=100, color="red")
plt.title("Crooked Hillary Google Searches")
plt.ylabel("search popularity")

Donald Trump demonstrates a strong ability to create and amplify trends for media bytes such as "Crooked Hillary". He first tweeted "Crooked Hillary" in April 2016 (red line), and search popularity skyrocketed over the following months before dropping back down after the November election.

In [None]:
# trumps tweets containing "Crooked Hillary" compared to twitter trends
ch_tt = sw_to_df("crooked_hillary_twitter_data.json")
# restrict both the tweets and the trend data to the same window
ch_16_tweets = get_df_by_daterange(ch_tweets, "2016-01-01", "2017-06-01")
ch_16_tt = get_df_by_daterange(ch_tt, "2016-01-01", "2017-06-01")
# NOTE(review): group_days_by_week returns the date-shifted rows, not weekly sums,
# which is why the 'rank' column is still available to plot below
ch_16_tt_weeks = group_days_by_week(ch_16_tt, 'frequency')

ch_16_tt_weeks.plot(x="date", y="rank", legend=None)
# one faint red line per tweet; overlapping lines appear darker
for _, row in ch_16_tweets.iterrows():
    plt.axvline(x=row.date, color="red", alpha=.05)
plt.title("Crooked Hillary Twitter Trends")
plt.ylabel("rank")
# flip the y axis so that smaller rank values are drawn higher
plt.gca().invert_yaxis()

In [None]:
# preview the first rows of the parsed "Crooked Hillary" twitter trend data
ch_tt.head()

Donald Trump also has the ability to sustain trends. The red lines signify all of Donald Trump's tweets containing "Crooked Hillary". These tweets align with the highest-ranked months for "Crooked Hillary" trends on Twitter. This doesn't necessarily imply causation, but there is a high correlation between the two, indicating that he is at least using his influence to contribute to the phenomenon.

In [None]:
# Fake News Term Analysis
# tweets whose 'content' contains the phrase (case-insensitive), plus the google trends series
fn_tweets = keyword_search("fake news", rdt_tweets, 'content')
fn_gt = gt_to_df("fake_news_google_trends.csv")

# gets Trump's first tweet containing "Fake News"
# (ascending sort on 'date', then the first row)
first_fn_tweet = fn_tweets.sort_values("date").iloc[0]

# plots Trump's first tweet & other tweets of "Fake News" with google trends data around same time
fn_gt.plot(x="date", y="search_popularity", legend=None)
# solid red line marks the first tweet; every tweet then adds a faint red line
# NOTE(review): axvline's ymin/ymax are axes fractions (0-1), so ymax=100 is out of range -- confirm
plt.axvline(x=first_fn_tweet.date, ymin=0, ymax=100, color="red")
for _, row in fn_tweets.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.05)
plt.title("Fake News Google Trends")
plt.ylabel("search popularity")

"Fake News" is an interesting case, as there was a solid amount of precedent for the term prior to Donald Trump tweeting about it. However, there is a strong case to be made that Donald Trump popularized it. His initial tweet (dark red line) is an immediate precursor to a massive spike in Google searches involving the term. As you may notice, there is a slight spike prior to Donald Trump's initial tweet, but it is important to note that Donald Trump has other media outlets to utilize, especially as this was the time of the 2016 election. It is also clear that many of Donald Trump's tweets in the following months correlate with spikes in searches (note: these are the first few months of his presidency).

In [None]:
# trumps tweets containing "fake news" compared to twitter trends
fn_tt = sw_to_df("fake_news_twitter_data.json")
fn_16_17_tt = get_df_by_daterange(fn_tt, "2016-06-01", "2017-06-01")
# NOTE(review): group_days_by_week returns the date-shifted rows, not weekly sums,
# which is why the 'rank' column is still available to plot below
fn_16_17_tt_weeks = group_days_by_week(fn_16_17_tt, 'frequency')

fn_16_17_tt_weeks.plot(x="date", y="rank", legend=None)
# flip the y axis so that smaller rank values are drawn higher
plt.gca().invert_yaxis()
# one faint red line per tweet; fn_tweets is not date-filtered here, unlike the trend data
for _, row in fn_tweets.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.05)
plt.title("Fake News Twitter Trends")
plt.ylabel("rank")

The Twitter trends for "Fake News" began their spike in ranking in November 2016 (election month) and maintained their top-ranking throughout the following months. The red lines represent Trump's tweets during this time. Again there is no clear indicator that Trump caused this sustained spike in rankings, but there is strong evidence that he contributed to its perpetuation.

In [None]:
# Lamestream Media Term Analysis
# tweets whose 'content' contains the phrase (case-insensitive), plus the google trends series
lm_tweets = keyword_search("lamestream media", rdt_tweets, 'content')
lm_gt = gt_to_df("lamestream_media_google_trends.csv")

# gets Trump's first tweet containing "Lamestream Media"
# (ascending sort on 'date', then the first row)
first_lm_tweet = lm_tweets.sort_values("date").iloc[0]

# plots Trump's first tweet & other tweets of "Lamestream Media" with google trends data around same time
lm_gt.plot(x="date", y="search_popularity", legend=None)
# solid red line marks the first tweet; every tweet then adds a faint red line
# NOTE(review): axvline's ymin/ymax are axes fractions (0-1), so ymax=100 is out of range -- confirm
plt.axvline(x=first_lm_tweet.date, ymin=0, ymax=100, color="red")
for _, row in lm_tweets.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.05)
plt.title("Lamestream Media Google Trends")
plt.ylabel("search popularity")

"Lamestream Media" appears to be a media byte that didn't resonate as much as some of the others Trump has used during his time. His first tweet does appear to be the catalyst for the spikes in Google searches, and he tweeted it during many of the same months as those spikes. However, this media byte never really reached the same level of magnitude as others have.

In [None]:
# trumps tweets containing "lamestream media" compared to twitter trends
lm_tt = sw_to_df("lamestream_media_twitter_data.json")
lm_19_20_tt = get_df_by_daterange(lm_tt, "2019-06-01", "2020-06-30")
# NOTE(review): group_days_by_week returns the date-shifted rows, not weekly sums,
# which is why the 'rank' column is still available to plot below
lm_19_20_tt_weeks = group_days_by_week(lm_19_20_tt, 'frequency')

lm_19_20_tt_weeks.plot(x="date", y="rank", legend=None)
# flip the y axis so that smaller rank values are drawn higher
plt.gca().invert_yaxis()
# one faint red line per tweet; lm_tweets is not date-filtered here, unlike the trend data
for _, row in lm_tweets.iterrows():
    plt.axvline(x=row.date, color="red", alpha=.05)
plt.title("Lamestream Media Twitter Trends")
plt.ylabel("rank")

Again, "Lamestream Media" appears to be a failed attempt to create a viral media byte. Trump had tweeted the term around the same time as the most prevalent Twitter trends, but these trends were weak and short-lived (plus the data is rather noisy). This demonstrates that while Trump's influence may be powerful, not every attempt to guide the national discussion is successful. 

In [None]:
# gets a df of all articles containing "fake news"
# search each articles file separately on its 'content' column
fn_articles1_df = keyword_search("fake news", articles1, "content")
fn_articles2_df = keyword_search("fake news", articles2, "content")
fn_articles3_df = keyword_search("fake news", articles3, "content")
# stack the three result sets, drop fully identical rows and renumber the index
fn_articles = pd.concat([fn_articles1_df, fn_articles2_df, fn_articles3_df])
fn_articles = fn_articles.drop_duplicates().reset_index(drop=True)
fn_articles.head()

In [None]:
# total articles containing "fake news"
# len() counts rows (one per article); DataFrame.size would be rows * columns
fake_news_count = len(fn_articles)

# number of articles containing "fake news" by publisher
fn_articles_by_pub = fn_articles.groupby(["publication"]).size().reset_index(name="count")
# largest publishers first so the bar chart reads left to right in descending order
srtd_fn_articles_by_pub = fn_articles_by_pub.sort_values("count", ascending=False)

srtd_fn_articles_by_pub.plot.bar(x="publication", y="count", legend=None)
plt.title("Published Articles Containing 'Fake News'")
plt.ylabel("# of articles")

In addition to seeing how Donald Trump creates and amplifies media trends, we wanted to explore how other media outlets play into this equation. First, we examined how many articles containing "Fake News" were published by each media outlet. Initially, two outliers stand out: Breitbart, which published almost 350 articles, and Fox News, which published close to 0. There is too little data for us to explore our theories on why Fox News could be so low, so we decided to focus on Breitbart's publishing trends and how they relate to Donald Trump's tweeting trends.

In [None]:
# breitbart news articles containing "fake news" compared to Trump Tweets containing "fake news"

fn_tweets_16_17 = get_df_by_daterange(fn_tweets, "2016-01-01", "2017-12-31")

# get breitbart article count containing "fake news"
# one row per (date, publication) pair; 'count' is the number of articles in that pair
fn_articles_by_date = fn_articles.groupby(["date", "publication"]).size().reset_index(name="count")
breit_fn_articles_by_date = fn_articles_by_date[fn_articles_by_date.publication == "Breitbart"]
srtd_breit_fn_articles_by_date = breit_fn_articles_by_date.sort_values("date", ascending=True)
srtd_breit_fn_articles_by_date = get_df_by_daterange(srtd_breit_fn_articles_by_date, "2016-01-01", "2017-12-31")

# plot dates by month
srtd_breit_fn_articles_by_date['date'] = pd.to_datetime(srtd_breit_fn_articles_by_date['date'])
srtd_breit_fn_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of rows (dates with at least one article)
# per month, not the number of articles; .sum() would total 'count' -- confirm intent
srtd_breit_fn_articles_by_date.resample('1M').count()['count'].plot()

# plot trump tweets
# solid red line marks the first tweet; every tweet in the window adds a faint red line
plt.axvline(x=first_fn_tweet.date, color="red")
for _, row in fn_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.05)
plt.title("Published Breitbart Articles Containing 'Fake News'")
plt.ylabel("# of articles")
plt.ylim((0, 30))

Breitbart's publishing of articles containing "Fake News" skyrocketed starting in November 2016 (election month) and reached its peak in December 2016. Trump's tweets containing "Fake News" began around the same time, starting in December and continuing throughout the next several months at a steady pace. Whether Breitbart (a far-right conservative network) began matching Donald Trump's rhetoric or vice versa is unclear, but there is a clear link between the two and how the information is disseminated.

In [None]:
# plot trump tweets with all articles containing "fake news"
# fn_articles_by_date (one row per date/publication pair) comes from the Breitbart cell
srtd_fn_articles_by_date = fn_articles_by_date.sort_values("date", ascending=True)

# group dates by month
srtd_fn_articles_by_date['date'] = pd.to_datetime(srtd_fn_articles_by_date['date'])
srtd_fn_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of (date, publication) rows per month,
# not the number of articles; .sum() would total 'count' -- confirm intent
srtd_fn_articles_by_date.resample('1M').count()['count'].plot()

# solid red line marks the first tweet; every tweet in the window adds a faint red line
plt.axvline(x=first_fn_tweet.date, color="red")
for _, row in fn_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.05)
    
plt.ylim((0, 200))
plt.title("Published Articles Containing 'Fake News'")
plt.ylabel("# of articles")

Above are the number of articles containing "Fake News" published by all media outlets in our dataset. This is the first major piece of evidence that Donald Trump is the primary driving force in terms of media influence. His first tweet (in dark red) is the start of a trend of articles containing "Fake News" across many different publications, not just far-right media such as Breitbart. Regardless of which side the publications take, it is clear that Donald Trump has the ability to influence the discussion being held.

In [None]:
# gets a df of all articles containing "lamestream media"
# search each articles file separately on its 'content' column
lm_articles1_df = keyword_search("lamestream media", articles1, "content")
lm_articles2_df = keyword_search("lamestream media", articles2, "content")
lm_articles3_df = keyword_search("lamestream media", articles3, "content")
# stack the three result sets, drop fully identical rows and renumber the index
lm_articles = pd.concat([lm_articles1_df, lm_articles2_df, lm_articles3_df])
lm_articles = lm_articles.drop_duplicates().reset_index(drop=True)
lm_articles.head()

In [None]:
# total articles containing "lamestream media"
# len() counts rows (one per article); DataFrame.size would be rows * columns
lamestream_media_count = len(lm_articles)

# number of articles containing "lamestream media" by publisher
lm_articles_by_pub = lm_articles.groupby(["publication"]).size().reset_index(name="count")
# largest publishers first so the bar chart reads left to right in descending order
srtd_lm_articles_by_pub = lm_articles_by_pub.sort_values("count", ascending=False)

srtd_lm_articles_by_pub.plot.bar(x="publication", y="count", legend=None)
plt.title("Published Articles Containing 'Lamestream Media'")
plt.ylabel("# of articles")

"Lamestream Media" was a term that never really took off on Twitter or on Google, making it no surprise that there aren't many published articles containing the term.

In [None]:
# gets a df of all articles containing "crooked hillary"
# search each articles file separately on its 'content' column
ch_articles1_df = keyword_search("crooked hillary", articles1, "content")
ch_articles2_df = keyword_search("crooked hillary", articles2, "content")
ch_articles3_df = keyword_search("crooked hillary", articles3, "content")
# stack the three result sets, drop fully identical rows and renumber the index
ch_articles = pd.concat([ch_articles1_df, ch_articles2_df, ch_articles3_df])
ch_articles = ch_articles.drop_duplicates().reset_index(drop=True)
ch_articles.head()

In [None]:
# total articles containing "crooked hillary"
# len() counts rows (one per article); DataFrame.size would be rows * columns
crooked_hillary_count = len(ch_articles)

# number of articles containing "crooked hillary" by publisher
ch_articles_by_pub = ch_articles.groupby(["publication"]).size().reset_index(name="count")
# largest publishers first so the bar chart reads left to right in descending order
srtd_ch_articles_by_pub = ch_articles_by_pub.sort_values("count", ascending=False)

srtd_ch_articles_by_pub.plot.bar(x="publication", y="count", legend=None)
plt.title("Published Articles Containing 'Crooked Hillary'")
plt.ylabel("# of articles")

In [None]:
# breitbart news articles containing "crooked hillary" compared to Trump Tweets containing "crooked hillary"

ch_tweets_16_17 = get_df_by_daterange(ch_tweets, "2016-01-01", "2017-12-31")

# get breitbart article count containing "crooked hillary"
# one row per (date, publication) pair; 'count' is the number of articles in that pair
ch_articles_by_date = ch_articles.groupby(["date", "publication"]).size().reset_index(name="count")
breit_ch_articles_by_date = ch_articles_by_date[ch_articles_by_date.publication == "Breitbart"]
srtd_breit_ch_articles_by_date = breit_ch_articles_by_date.sort_values("date", ascending=True)
srtd_breit_ch_articles_by_date = get_df_by_daterange(srtd_breit_ch_articles_by_date, "2016-01-01", "2017-12-31")

# plot dates by month
srtd_breit_ch_articles_by_date['date'] = pd.to_datetime(srtd_breit_ch_articles_by_date['date'])
srtd_breit_ch_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of rows (dates with at least one article)
# per month, not the number of articles; .sum() would total 'count' -- confirm intent
srtd_breit_ch_articles_by_date.resample('1M').count()['count'].plot()

# plot trump tweets
# one faint red line per tweet; overlapping lines appear darker
for _, row in ch_tweets_16_17.iterrows():
    plt.axvline(x=row.date, color="red", alpha=.05)
plt.ylim((0, 16))
plt.title("Published Breitbart Articles Containing 'Crooked Hillary'")
plt.ylabel("# of articles")

Here is another example of Breitbart publishing a number of articles containing a phrase ("Crooked Hillary") around the same time Trump was tweeting about it. Trump had tweeted quite a bit during this time range as indicated by the darker red lines. Additionally, both Trump's tweeting of the phrase and Breitbart's publishing taper off at around the same time.

In [None]:
# plot trump tweets with all articles containing "crooked hillary"
# ch_articles_by_date (one row per date/publication pair) comes from the Breitbart cell
srtd_ch_articles_by_date = ch_articles_by_date.sort_values("date", ascending=True)

# group dates by month
srtd_ch_articles_by_date['date'] = pd.to_datetime(srtd_ch_articles_by_date['date'])
srtd_ch_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of (date, publication) rows per month,
# not the number of articles; .sum() would total 'count' -- confirm intent
srtd_ch_articles_by_date.resample('1M').count()['count'].plot()

# black line marks the first tweet; every tweet in the window adds a faint red line
plt.axvline(x=first_ch_tweet.date, color="black")
for _, row in ch_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.05)
plt.ylim((0,70))
plt.title("Published Articles Containing 'Crooked Hillary'")
plt.ylabel("# of articles")

Other media outlets don't appear to jump on the trends at quite the same speed as Breitbart, but there is strong evidence suggesting that they will cover what Donald Trump tweets. Many of these articles were published before Donald Trump took office, while he was campaigning for President. The term "Crooked Hillary", unlike "Fake News", was not in the common lexicon prior to the 2016 election, because it was invented as an insult and a means of undermining the Democratic candidate. Our data suggest that the media played into this plan by perpetuating and amplifying the term because of Donald Trump's continued use of it.

In [None]:
# "Witch Hunt" Term Analysis
# tweets whose 'content' contains the phrase (case-insensitive), plus the google trends series
witch_tweets = keyword_search("witch hunt", rdt_tweets, 'content')
witch_gt = gt_to_df("witch_hunt_google_trends.csv")

# gets Trump's first tweet containing "witch hunt"
# (ascending sort on 'date', then the first row)
first_witch_tweet = witch_tweets.sort_values("date").iloc[0] 

# shortens date range of google trends data
# (the window kept here is 2011-01-01 through 2017-01-01)
witch_2017_gt = get_df_by_daterange(witch_gt, "2011-01-01", "2017-01-01")

# plots Trump's first tweet of "witch hunt" with google trends data around same time
witch_2017_gt.plot(x="date", y="search_popularity")
# NOTE(review): axvline's ymin/ymax are axes fractions (0-1), so ymax=100 is out of range -- confirm
plt.axvline(x=first_witch_tweet.date, ymin=0, ymax=100, color="red")
plt.title("Google Trends for 'Witch Hunt'")
plt.ylabel("search frequency")

In [None]:
# trumps tweets containing "witch hunt" compared to twitter trends
witch_tt = sw_to_df("witch_hunt_twitter_data.json")
# restrict both the tweets and the trend data to the same window
witch_17_tweets = get_df_by_daterange(witch_tweets, "2017-01-01", "2018-01-01")
witch_17_tt = get_df_by_daterange(witch_tt, "2017-01-01", "2018-01-01")
# NOTE(review): group_days_by_week returns the date-shifted rows, not weekly sums,
# which is why the 'rank' column is still available to plot below
witch_17_tt_weeks = group_days_by_week(witch_17_tt, 'frequency')

witch_17_tt_weeks.plot(x="date", y="rank")
# flip the y axis so that smaller rank values are drawn higher
plt.gca().invert_yaxis()
# one half-transparent red line per tweet in the window
for _, row in witch_17_tweets.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.5)
plt.title("Twitter Trends for 'Witch Hunt'")
plt.ylabel("rank")

In [None]:
# "China Virus" Term Analysis
# tweets whose 'content' contains the phrase (case-insensitive), plus the google trends series
chinavirus_tweets = keyword_search("china virus", rdt_tweets, 'content')
chinavirus_gt = gt_to_df("china_virus_google_trends.csv")

# gets Trump's first tweet containing "China Virus"
first_chinavirus_tweet = chinavirus_tweets.sort_values("date").iloc[0]
print(first_chinavirus_tweet)

# gets Trump's most-retweeted tweet containing "China Virus"; sort descending so
# that iloc[0] is the highest retweet count (the default ascending sort gave the lowest)
top_chinavirus_tweet = chinavirus_tweets.sort_values("retweets", ascending=False).iloc[0]
print(top_chinavirus_tweet)

# shortens date range of google trends data
chinavirus_2020_gt = get_df_by_daterange(chinavirus_gt, "2020-01-01", "2021-01-01")

# plots Trump's first tweet of "China Virus" with google trends data around same time
chinavirus_2020_gt.plot(x="date", y="search_popularity")
plt.axvline(x=first_chinavirus_tweet.date, ymin=0, ymax=100, color="red")
plt.title("Google Trends for 'China Virus'")
plt.ylabel("search frequency")

In [None]:
# trumps tweets containing "China Virus" compared to twitter trends
cv_tt = sw_to_df("china_virus_twitter_data.json")
# restrict both the tweets and the trend data to the same window
cv_20_tweets = get_df_by_daterange(chinavirus_tweets, "2020-03-15", "2020-07-01")
cv_20_tt = get_df_by_daterange(cv_tt, "2020-03-15", "2020-07-01")
# NOTE(review): group_days_by_week returns the date-shifted rows, not weekly sums,
# which is why the 'rank' column is still available to plot below
cv_20_tt_weeks = group_days_by_week(cv_20_tt, 'frequency')

cv_20_tt_weeks.plot(x="date", y="rank")

# flip the y axis so that smaller rank values are drawn higher
plt.gca().invert_yaxis()
# one half-transparent red line per tweet in the window
for _, row in cv_20_tweets.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.5)
plt.title("Twitter Trends for 'China Virus'")
plt.ylabel("rank")

In [None]:
# "Collusion" Term Analysis
# tweets whose 'content' contains the word (case-insensitive), plus both trend series
collusion_tweets = keyword_search("collusion", rdt_tweets, 'content')
collusion_gt = gt_to_df("collusion_google_trends.csv")
collusion_tt = sw_to_df("collusion_twitter_data.json")

# gets Trump's first tweet containing "collusion"
first_collusion_tweet = collusion_tweets.sort_values("date").iloc[0]
# gets Trump's most-retweeted tweet containing "collusion"; sort descending so
# that iloc[0] is the highest retweet count (the default ascending sort gave the lowest)
top_collusion_tweet = collusion_tweets.sort_values("retweets", ascending=False).iloc[0]
print(top_collusion_tweet)

# shortens date range of google trends data
collusion_gt = get_df_by_daterange(collusion_gt, "2016-01-01", "2019-01-01")

# plots Trump's first tweet of "Collusion" with google trends data around same time
collusion_gt.plot(x="date", y="search_popularity")
plt.axvline(x=first_collusion_tweet.date, ymin=0, ymax=100, color="red")

# plots Trump's top tweet of "Collusion" with google trends data around same time
plt.axvline(x=top_collusion_tweet.date, ymin=0, ymax=100, color="green")
plt.title("Google Trends for 'Collusion'")
plt.ylabel("search frequency")

In [None]:
# trumps tweets containing "collusion" compared to twitter trends
col_tt = sw_to_df("collusion_twitter_data.json")
# restrict both the tweets and the trend data to the same window (2017-01-01 to 2018-01-01)
col_20_tweets = get_df_by_daterange(collusion_tweets, "2017-01-01", "2018-01-01")
col_20_tt = get_df_by_daterange(col_tt, "2017-01-01", "2018-01-01")
# NOTE(review): group_days_by_week returns the date-shifted rows, not weekly sums,
# which is why the 'rank' column is still available to plot below
col_20_tt_weeks = group_days_by_week(col_20_tt, 'frequency')

col_20_tt_weeks.plot(x="date", y="rank")

# flip the y axis so that smaller rank values are drawn higher
plt.gca().invert_yaxis()
# one half-transparent red line per tweet in the window
for _, row in col_20_tweets.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.5)
plt.title("Twitter Trends for 'Collusion'")
plt.ylabel("rank")

In [None]:
# gets a df of all articles containing "collusion"
# search each articles file separately on its 'content' column
col_articles1_df = keyword_search("collusion", articles1, "content")
col_articles2_df = keyword_search("collusion", articles2, "content")
col_articles3_df = keyword_search("collusion", articles3, "content")
# stack the three result sets, drop fully identical rows and renumber the index
col_articles = pd.concat([col_articles1_df, col_articles2_df, col_articles3_df])
col_articles = col_articles.drop_duplicates().reset_index(drop=True)
col_articles.head()

In [None]:
# total articles containing "collusion"
# count the collusion frame (the "fake news" frame was used here by mistake);
# len() counts rows (one per article), DataFrame.size would be rows * columns
collusion_count = len(col_articles)

# number of articles containing "collusion" by publisher
col_articles_by_pub = col_articles.groupby(["publication"]).size().reset_index(name="count")
# largest publishers first so the bar chart reads left to right in descending order
srtd_col_articles_by_pub = col_articles_by_pub.sort_values("count", ascending=False)

srtd_col_articles_by_pub.plot.bar(x="publication", y="count")
plt.title("Published Articles Containing 'Collusion'")
plt.ylabel("# of articles")

In [None]:
# breitbart news articles containing "collusion" compared to Trump Tweets containing "collusion"

# tweets from 2016-01-01 through 2018-06-01
col_tweets_16_17 = get_df_by_daterange(collusion_tweets, "2016-01-01", "2018-06-01")

# get breitbart article count containing "collusion"
# one row per (date, publication) pair; 'count' is the number of articles in that pair
col_articles_by_date = col_articles.groupby(["date", "publication"]).size().reset_index(name="count")
breit_col_articles_by_date = col_articles_by_date[col_articles_by_date.publication == "Breitbart"]
srtd_breit_col_articles_by_date = breit_col_articles_by_date.sort_values("date", ascending=True)
srtd_breit_col_articles_by_date = get_df_by_daterange(srtd_breit_col_articles_by_date, "2016-01-01", "2018-06-01")

# plot dates by month
srtd_breit_col_articles_by_date['date'] = pd.to_datetime(srtd_breit_col_articles_by_date['date'])
srtd_breit_col_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of rows (dates with at least one article)
# per month, not the number of articles; .sum() would total 'count' -- confirm intent
srtd_breit_col_articles_by_date.resample('1M').count()['count'].plot()

# plot trump tweets
# green line marks the first tweet; every tweet in the window adds an opaque red line
plt.axvline(x=first_collusion_tweet.date, ymin=0, ymax=100, color="green")
for _, row in col_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=1)
plt.title("Breitbart Articles & Trump Tweets Containing 'Collusion'")
plt.ylabel("# of articles")

In [None]:
# plot trump tweets with all articles containing "collusion"
# col_articles_by_date (one row per date/publication pair) comes from the Breitbart cell
srtd_col_articles_by_date = col_articles_by_date.sort_values("date", ascending=True)

# group dates by month
srtd_col_articles_by_date['date'] = pd.to_datetime(srtd_col_articles_by_date['date'])
srtd_col_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of (date, publication) rows per month,
# not the number of articles; .sum() would total 'count' -- confirm intent
srtd_col_articles_by_date.resample('1M').count()['count'].plot()

# green line marks the first tweet; every tweet in the window adds a faint red line
plt.axvline(x=first_collusion_tweet.date, ymin=0, ymax=100, color="green")
for _, row in col_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.1)
plt.title("All Articles & Trump Tweets Containing 'Collusion'")
plt.ylabel("# of articles")

In [None]:
# gets a df of all articles containing "witch hunt"
# search each articles file separately on its 'content' column
wh_articles1_df = keyword_search("witch hunt", articles1, "content")
wh_articles2_df = keyword_search("witch hunt", articles2, "content")
wh_articles3_df = keyword_search("witch hunt", articles3, "content")
# stack the three result sets, drop fully identical rows and renumber the index
wh_articles = pd.concat([wh_articles1_df, wh_articles2_df, wh_articles3_df])
wh_articles = wh_articles.drop_duplicates().reset_index(drop=True)
wh_articles.head()

In [None]:
# total articles containing "witch hunt"
# len() counts rows (one per article); DataFrame.size would be rows * columns
witch_media_count = len(wh_articles)

# number of articles containing "witch hunt" by publisher
wh_articles_by_pub = wh_articles.groupby(["publication"]).size().reset_index(name="count")
# largest publishers first so the bar chart reads left to right in descending order
srtd_wh_articles_by_pub = wh_articles_by_pub.sort_values("count", ascending=False)

srtd_wh_articles_by_pub.plot.bar(x="publication", y="count")
plt.title("All Articles Containing 'Witch Hunt'")
plt.ylabel("# of articles")

In [None]:
# Washington Post news articles containing "witch hunt" compared to Trump Tweets containing "witch hunt"

# tweets from 2016-01-01 through 2018-01-01
wh_tweets_16_17 = get_df_by_daterange(witch_tweets, "2016-01-01", "2018-01-01")

# get Washington Post article count containing "witch hunt"
# one row per (date, publication) pair; 'count' is the number of articles in that pair
wh_articles_by_date = wh_articles.groupby(["date", "publication"]).size().reset_index(name="count")
wp_wh_articles_by_date = wh_articles_by_date[wh_articles_by_date.publication == "Washington Post"]
srtd_wp_wh_articles_by_date = wp_wh_articles_by_date.sort_values("date", ascending=True)
srtd_wp_wh_articles_by_date = get_df_by_daterange(srtd_wp_wh_articles_by_date, "2016-01-01", "2018-01-01")

# plot dates by month
srtd_wp_wh_articles_by_date['date'] = pd.to_datetime(srtd_wp_wh_articles_by_date['date'])
srtd_wp_wh_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of rows (dates with at least one article)
# per month, not the number of articles; .sum() would total 'count' -- confirm intent
srtd_wp_wh_articles_by_date.resample('1M').count()['count'].plot()

# plot trump tweets
# one opaque red line per tweet in the window
for _, row in wh_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=1)
plt.title("Washington Post Articles & Trump Tweets Containing 'Witch Hunt'")
plt.ylabel("# of articles")

In [None]:
# plot trump tweets with all articles containing "witch hunt"
# wh_articles_by_date (one row per date/publication pair) comes from the Washington Post cell
srtd_wh_articles_by_date = wh_articles_by_date.sort_values("date", ascending=True)

# group dates by month
srtd_wh_articles_by_date['date'] = pd.to_datetime(srtd_wh_articles_by_date['date'])
srtd_wh_articles_by_date.set_index('date', inplace = True)
# NOTE(review): .count() gives the number of (date, publication) rows per month,
# not the number of articles; .sum() would total 'count' -- confirm intent
srtd_wh_articles_by_date.resample('1M').count()['count'].plot()

# green line marks the first tweet; every tweet in the window adds a half-transparent red line
plt.axvline(x=first_witch_tweet.date, ymin=0, ymax=100, color="green")
for _, row in wh_tweets_16_17.iterrows():
    plt.axvline(x=row.date, ymin=0, ymax=100, color="red", alpha=.5)
plt.title("All Articles & Trump Tweets Containing 'Witch Hunt'")
plt.ylabel("# of articles")