In [18]:
import pandas
import numpy as np
from bokeh.charts import Area, output_notebook, show
from bokeh.palettes import Inferno11, Inferno3
from bokeh.models import Range1d, HoverTool
from bokeh.models.sources import ColumnDataSource
from bokeh.plotting import figure
from datetime import datetime

In [16]:
sent_mapping = {
    "pos": 1,
    "neu": 0,
    "neg": -1
}

factuality_weight = {
    "yes": 1,
    "no": 0.2
}

agree_weight = 0.5

x_range = Range1d(np.datetime64('2011', 'Y'), np.datetime64('2018', 'Y'), bounds='auto')

# Average per post!!!

In [3]:
df = pandas.read_csv(
    open('../data/treatment_detected_linewise.csv', 'r'),
    usecols=['subforum', 'post_id', 'timestamp', 'sentence', 'treatments', 'thread_id', 'sentiment', 'factuality', 'agrees'],
    index_col=None,
    parse_dates=['timestamp'],
    infer_datetime_format=True
)
df = df.drop_duplicates()

In [4]:
df['month'] = df['timestamp'].values.astype('<M8[M]')

In [5]:
df['sentiment'] = df['sentiment'].apply(lambda s: sent_mapping[s])

In [6]:
df['weight'] = df.apply(lambda s: factuality_weight[s['factuality']] * (1 + (s['agrees'] * agree_weight)), axis=1)

In [7]:
df["weighted_sentiment"] = df["weight"] * df["sentiment"]

In [78]:
def post_group(group):
    group["sentiment"] = group["weighted_sentiment"].sum() / group["weight"].sum()
    group["weight"] = group["weight"].max()
    return group


def month_group(group):
    group["score"] = group["weighted_sentiment"].sum() / group["weight"].sum()
    group["pos_cnt"] = len(group[group["weighted_sentiment"] > 0])
    group["neu_cnt"] = len(group[group["weighted_sentiment"] == 0])
    group["neg_cnt"] = len(group[group["weighted_sentiment"] < 0])
    del group["weighted_sentiment"]
    del group["weight"]
    return group

month_data = {}

for label, treatment_group in df.groupby("treatments"):
    post_scores = treatment_group.groupby('post_id')[['post_id', 'month', 'weight', 'weighted_sentiment']].apply(post_group).drop_duplicates()
    post_scores["weighted_sentiment"] = post_scores["weight"] * post_scores["sentiment"]
    month_groups = post_scores.groupby('month')['month', 'weight', 'weighted_sentiment']
    month_scores = month_groups.apply(month_group).drop_duplicates().sort_values('month')
    month_scores.set_index('month', inplace=True)
    month_scores = month_scores.reindex(pandas.DatetimeIndex(np.arange('2011-01', '2017-12', dtype='datetime64[M]')), fill_value=0)
    month_scores['month'] = month_scores.index
    print(month_scores)
    month_data[label] = month_scores
    break

               score  pos_cnt  neu_cnt  neg_cnt      month
2011-01-01  0.000000        0        0        0 2011-01-01
2011-02-01  0.000000        0        0        0 2011-02-01
2011-03-01 -0.500000        0        1        1 2011-03-01
2011-04-01  0.000000        0        0        0 2011-04-01
2011-05-01  0.000000        0        0        0 2011-05-01
2011-06-01  0.000000        0        0        0 2011-06-01
2011-07-01  0.000000        0        0        0 2011-07-01
2011-08-01  0.000000        0        0        0 2011-08-01
2011-09-01  0.000000        0        0        0 2011-09-01
2011-10-01  0.000000        0        0        0 2011-10-01
2011-11-01  0.000000        0        0        0 2011-11-01
2011-12-01  0.000000        0        0        0 2011-12-01
2012-01-01  0.000000        0        0        0 2012-01-01
2012-02-01  0.333333        2        0        1 2012-02-01
2012-03-01  0.000000        0        0        0 2012-03-01
2012-04-01  0.000000        0        1        0 2012-04-

In [68]:
def stacked(df, categories):
    areas = dict()
    last = np.zeros(len(df[categories[0]]))
    for cat in categories:
        next = last + df[cat]
        areas[cat] = np.hstack((last[::-1], next))
        last = next
    return areas

In [73]:
data = month_data["AM-101"]
data['pos+neu_cnt'] = data['pos_cnt'] + data['neu_cnt']
data['all_cnt'] = data['pos+neu_cnt'] + data['neg_cnt']
categories = ["pos_cnt", "neu_cnt", "neg_cnt"]
areas = stacked(data, categories)
months = list(data["month"])
x2 = np.hstack((months[::-1], months))
p = figure(
    y_axis_label='score',
    x_axis_type='datetime',
    x_range=x_range,
    tools='xwheel_zoom,xpan,reset,save',
    active_scroll='xwheel_zoom',
    active_drag='xpan',
    plot_width=800,
    plot_height=500
    # sizing_mode='stretch_both',
)
hover = HoverTool(names=["pos", "neu", "neg"])
hover.tooltips = [
    ("month", "@month{%B %Y}"),
    ("negative", "@neg_cnt"),
    ("positive", "@pos_cnt"),
    ("neutral", "@neu_cnt"),
    ("total", "@all_cnt")
]
hover.formatters = {"month": "datetime"}
p.add_tools(hover)
p.patches(
    [x2] * len(areas), [areas[cat] for cat in categories],
    color=["#5fad56", "#f2c14e", "#df2935"], alpha=0.8, line_color=None)
p.line("month", "pos_cnt", color="#5fad56", legend="positive", name="pos", source=ColumnDataSource.from_df(data))
p.line("month", "pos+neu_cnt", color="#f2c14e", legend="neutral", name="neu", source=ColumnDataSource.from_df(data))
p.line("month", "all_cnt", color="#df2935", legend="negative", name="neg", source=ColumnDataSource.from_df(data))
show(p)

In [6]:
len(df)

79231

In [20]:
output_notebook()

In [36]:
df.head(30)

Unnamed: 0,subforum,post_id,timestamp,thread_id,agrees,sentence,treatments,sentiment,factuality,weight
0,collaboration-space.109,post-185994,2016-06-03 10:59:00,what-is-this-section-about.15627,1,Like info on masking and help with anxiety and...,masking,0,no,0.3
1,collaboration-space.109,post-185994,2016-06-03 10:59:00,what-is-this-section-about.15627,1,People should probably quickly go on corticost...,steroids,0,no,0.3
2,introduce-yourself.11,post-249993,2017-05-18 22:38:00,new-to-tt.21785,0,"Accept it and use your enviroment for you, ,th...",masking,0,no,0.2
3,support.2,post-55450,2014-07-11 21:58:00,i-am-confused-about-habituation.5435,0,Gave me Steroids and antivert.,steroids,0,no,0.2
4,support.2,post-55538,2014-07-12 14:36:00,i-am-confused-about-habituation.5435,0,This time I am using 'distraction' sounds (I p...,masking,0,no,0.2
5,support.2,post-55570,2014-07-12 18:25:00,i-am-confused-about-habituation.5435,0,"I'm not on any blood pressure drugs, and I tak...",magnesium,0,yes,1.0
6,support.2,post-55587,2014-07-12 19:34:00,i-am-confused-about-habituation.5435,0,A goog thing today was that I did not do any m...,masking,0,yes,1.0
7,support.2,post-55587,2014-07-12 19:34:00,i-am-confused-about-habituation.5435,0,Masking is not really possible anyway.,masking,0,yes,1.0
8,introduce-yourself.11,post-610,2011-11-13 20:08:00,new-here-and-to-tt-and-alternative-treatments.165,0,I purchased a CD from Universal Sound Therapy ...,soundcure,0,no,0.2
9,introduce-yourself.11,post-610,2011-11-13 20:08:00,new-here-and-to-tt-and-alternative-treatments.165,0,I decided on that as another course of action ...,masking,0,no,0.2


In [44]:
treatment_groups = df.groupby("treatments")
one_year_ago = np.datetime64(datetime.now(), 'D') - np.timedelta64(365, 'D')
two_years_ago = one_year_ago - np.timedelta64(365, 'D')
for label, group in treatment_groups:
    data = {}
    data["last_year_cnt"] = len(group[
        group["timestamp"] > one_year_ago
    ])
    data["previous_year_cnt"] = len(group[
        (group["timestamp"] > two_years_ago) & (group["timestamp"] < one_year_ago)
    ])
    data["most_popular_thread"] = group['thread_id'].value_counts().idxmax(3)
    print(data)

{'last_year_cnt': 324, 'most_popular_thread': 'am-101-clinical-trial-%E2%80%94-participants-updates-and-discussion.6558', 'previous_year_cnt': 1065}
{'last_year_cnt': 313, 'most_popular_thread': 'vitamin-b12-link-between-b12-deficiency-and-tinnitus.11036', 'previous_year_cnt': 346}
{'last_year_cnt': 19, 'most_popular_thread': 'acamprosate-campral.394', 'previous_year_cnt': 59}
{'last_year_cnt': 299, 'most_popular_thread': 'acoustic-cr%C2%AE-neuromodulation-do-it-yourself-guide.1469', 'previous_year_cnt': 372}
{'last_year_cnt': 262, 'most_popular_thread': 'acupuncture-for-tinnitus-%E2%80%94-treatment-protocols.322', 'previous_year_cnt': 230}
{'last_year_cnt': 1908, 'most_popular_thread': 'antidepressants-ssris-snris-maos-tcas-tecas.768', 'previous_year_cnt': 1853}
{'last_year_cnt': 19, 'most_popular_thread': 'aripiprazole-abilify-case-reports-of-diminishing-lowering-tinnitus.21736', 'previous_year_cnt': 2}
{'last_year_cnt': 72, 'most_popular_thread': 'back-to-silence.7172', 'previous_ye

In [9]:
tr_mon = pandas.crosstab(df["treatments"], df["month"])
tr_mon["sum"] = tr_mon.sum(axis=1)
tr_mon.sort_values("sum", ascending=False, inplace=True)
tr_mon = tr_mon.drop("sum", 1)
head = tr_mon.head(10)
tail = tr_mon.tail(len(tr_mon.index) - 10).sum(axis=0)
tail.name = "Rest"
tr_mon = head.append(tail)
tr_mon = tr_mon.T
ColumnDataSource(tr_mon)

In [10]:
area = Area(
    tr_mon,
    # x=list(tr_mon),
    # y=list(tr_mon["treatments"]),
    # x_axis_type="datetime",
    #y_range=Range1d(0,2500, bounds='auto'),
    x_range=Range1d(np.datetime64('2011', 'Y'), np.datetime64('2018', 'Y'), bounds='auto'),
    tools='xwheel_zoom,xpan,reset,save',
    active_scroll='xwheel_zoom',
    active_drag='xpan',
    palette=Inferno11,
    stack=True,
    plot_width=1000
)

In [11]:
show(area)

You can access Timestamp as pandas.Timestamp
  if pd and isinstance(obj, pd.tslib.Timestamp):
