In [None]:
# Reload imported modules before each execution so edits to local packages
# take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2
from __future__ import division, print_function, absolute_import
import numpy as np
import matplotlib.pyplot as plt
# Figure width; the plotting cells further down reassign it to 13
# before passing it to figsize.
fig_width = 12
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [None]:
import bayesianchangepoint as bcp



## performing inference: application to experimental data


Get the Trump tweet archive from:

https://github.com/bpb27/trump_tweet_data_archive

In [None]:
from io import BytesIO
from zipfile import ZipFile
import urllib.request
import json

# Local cache of the merged tweet list, so the archives are downloaded only once.
datafile = '/tmp/trumpets.json'
try:
    with open(datafile, 'r') as f:
        data = json.load(f)
except (OSError, ValueError):
    # OSError: the cache is missing or unreadable.
    # ValueError: the cache is not valid JSON (json.JSONDecodeError is a subclass).
    # Other exceptions (e.g. KeyboardInterrupt) are deliberately not caught, so
    # they no longer trigger a silent re-download.
    # Last digit of the year (201x). An earlier draft listed 2..7; only 6 and 7 are fetched.
    years = [6, 7]
    data = []
    for year in years:
        url = "https://github.com/bpb27/trump_tweet_data_archive/blob/master/master_201{0}.json.zip?raw=true".format(str(year))
        # Print the URL itself rather than the repr of the HTTP response object.
        print('Downloading ', url, '...')
        # Read the whole zip into memory and close the connection right away.
        with urllib.request.urlopen(url) as response:
            archive = BytesIO(response.read())
        with ZipFile(archive) as my_zip_file:
            for contained_file in my_zip_file.namelist():
                with my_zip_file.open(contained_file) as f:
                    data.extend(json.load(f))
    # Write the merged list so the next run takes the fast path above.
    with open(datafile, 'w') as f:
        json.dump(data, f)
                

In [None]:
# List /tmp sorted by modification time, newest last — presumably to check
# that the cached data file has been written.
!ls -ltr /tmp/


In [None]:
# Number of loaded tweets; the cells below use it as their loop bound.
n_tweets = len(data)

## example tweet

In [None]:
# Pick one arbitrary record and display it in full to inspect its fields.
d = data[34]
d  # e.g. d.keys() lists the available fields

## learning to handle datetimes

https://docs.python.org/3/library/datetime.html


In [None]:
# Raw 'created_at' strings, one per tweet.
datetimes = [tweet['created_at'] for tweet in data[:n_tweets]]
# Surround the first one with '|' so that stray leading/trailing whitespace shows up.
print('|{}|'.format(datetimes[0]))

In [None]:
# strptime's %a and %b directives match day and month names in the current
# locale, so display the active locale before parsing the timestamps.
import locale
locale.getlocale()

In [None]:
from datetime import datetime

# Layout of 'created_at': weekday, month, day, time, UTC offset, year.
created_at_format = '%a %b %d %H:%M:%S %z %Y'
datetimes = [
    datetime.strptime(tweet['created_at'], created_at_format)
    for tweet in data[:n_tweets]
]
first_datetime = datetimes[0]
print('Timestamp=', first_datetime.timestamp())

In [None]:
from datetime import datetime

# POSIX timestamps (floats) for every tweet. They are kept under their own name
# so `datetimes` is not rebound to a different type: the sorting and plotting
# cells expect it to hold datetime objects.
timestamps = np.array([
    datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y').timestamp()
    for tweet in data
])
print('Timestamp=', timestamps[0])

In [None]:
from datetime import datetime

# (Re)build the list of datetime objects that the plotting cells below consume.
datetimes = []
for tweet in data[:n_tweets]:
    datetimes.append(datetime.strptime(tweet['created_at'], '%a %b %d %H:%M:%S %z %Y'))
print('Timestamp=', datetimes[0].timestamp())

In [None]:
# Creation date of each tweet, in the order the records appear in `data`.
fig_width = 13
fig, ax = plt.subplots(figsize=(fig_width, fig_width/1.6180))
ax.plot(datetimes)
# Label the axes so the figure can be read on its own.
ax.set_xlabel('tweet index (order in the archive)')
ax.set_ylabel('creation date');

It seems that within each year, tweets are stored in reverse chronological order.
Let's sort them:

In [None]:
# Convert to an array so it can be fancy-indexed, then compute the permutation
# that puts the tweets in chronological order.
datetimes = np.array(datetimes)
ind_tweets = np.argsort(datetimes)

# After sorting, the curve should be monotonically increasing.
fig, ax = plt.subplots(figsize=(fig_width, fig_width/1.6180))
ax.plot(datetimes[ind_tweets])
ax.set_xlabel('tweet index (chronological order)')
ax.set_ylabel('creation date');

To format the date axes, we will follow the matplotlib date demo: https://matplotlib.org/examples/api/date_demo.html

## evolution of followers

In [None]:
# Follower count stored with each tweet record (under its 'user' entry).
followers = np.array([tweet['user']['followers_count'] for tweet in data])

fig_width = 13
fig, ax = plt.subplots(figsize=(fig_width, fig_width/1.6180))
# Plot in chronological order using the permutation computed when sorting the dates.
ax.plot(datetimes[ind_tweets], followers[ind_tweets])
ax.set_xlabel('date')
ax.set_ylabel('followers_count');

In [None]:
# Index of the tweet whose lower-cased text is displayed at the end of the cell.
i = 42
data_texts = []
# Iterate over the records directly: the previous loop reused `i` as its index,
# so the final line always showed the last tweet instead of tweet 42.
for tweet in data:
    # Use 'full_text' when the record has it, otherwise fall back to 'text'.
    raw_text = tweet['full_text'] if 'full_text' in tweet else tweet['text']
    data_texts.append(raw_text.lower())

data_texts[i]

In [None]:
# Substring searched for in the lower-cased tweet texts.
word = 'america'
# Boolean indicator per tweet: True where the text contains `word`.
contains_word = np.array([word in text for text in data_texts])
n_hits = sum(contains_word)
print(n_hits, 'tweets contain the word "', word, '" on a total of ', n_tweets, 'tweets')
# Empirical frequency of the word over all tweets.
p0 = n_hits / n_tweets
print('That is, an average probability of p0= ', p0)

## detecting change points



In [None]:
# NOTE(review): h is presumably the hazard rate, i.e. the prior probability of a
# change point at each tweet — confirm against bcp.inference.
h=1/1500
# This replaces the empirical frequency assigned to p0 in the word-count cell.
p0=.5
# Run the inference on the word indicator, ordered chronologically.
p_bar, r, beliefs = bcp.inference(contains_word[ind_tweets], h=h, p0=p0)

In [None]:
# Plot the inference outputs against the chronologically ordered word indicator;
# the second positional argument is deliberately left as None.
fig, axs = bcp.plot_inference(contains_word[ind_tweets], None, p_bar, r, beliefs, mode='max', max_run_length=2500)

## wrapping things up

In [None]:
# NOTE: this rebinds `datetime` to the module; the parsing cells above bind it
# to the class, but each of them re-imports it before use.
import datetime
import matplotlib.dates as mdates
import matplotlib.cbook as cbook

# Tick placement: a labelled major tick per year, an unlabelled minor tick per month.
years = mdates.YearLocator()   # every year
months = mdates.MonthLocator()  # every month
yearsFmt = mdates.DateFormatter('%Y')

# Read out one estimate per tweet from the inference results
# (pass mode='max' to readout to try the alternative read-out).
p_hat, r_hat = bcp.readout(p_bar, r, beliefs)

fig_width = 13
fig, ax = plt.subplots(figsize=(fig_width, fig_width/1.6180))
ax.plot(datetimes[ind_tweets], p_hat)

ax.xaxis.set_major_locator(years)
ax.xaxis.set_major_formatter(yearsFmt)
ax.xaxis.set_minor_locator(months)
# Full date in the interactive cursor read-out.
ax.format_xdata = mdates.DateFormatter('%Y-%m-%d')
# Label the axes so the figure can be read on its own.
ax.set_xlabel('date')
ax.set_ylabel('p_hat for the word "{}"'.format(word));
