In [39]:
import pandas
import numpy as np
from bokeh.charts import Area, output_notebook, show
from bokeh.palettes import Inferno11
from bokeh.models import Range1d, HoverTool
from bokeh.models.sources import ColumnDataSource
from bokeh.plotting import figure
from datetime import datetime

In [28]:
sent_mapping = {
    "pos": 1,
    "neu": 0,
    "neg": -1
}

factuality_weight = {
    "yes": 1,
    "no": 0.2
}

agree_weight = 0.5

# Average per post!!!

In [29]:
df = pandas.read_csv(
    open('../data/treatment_detected_linewise.csv', 'r'),
    usecols=['subforum', 'post_id', 'timestamp', 'sentence', 'treatments', 'thread_id', 'sentiment', 'factuality', 'agrees'],
    index_col=None,
    parse_dates=['timestamp'],
    infer_datetime_format=True
)
df = df.drop_duplicates()

In [30]:
df['month'] = df['timestamp'].values.astype('<M8[M]')

In [31]:
df['sentiment'] = df['sentiment'].apply(lambda s: sent_mapping[s])

In [32]:
df['weight'] = df.apply(lambda s: factuality_weight[s['factuality']] * (1 + (s['agrees'] * agree_weight)), axis=1)

In [33]:
df["weighted_sentiment"] = df["weight"] * df["sentiment"]

In [47]:
def post_group(group):
    group["sentiment"] = group["weighted_sentiment"].sum() / group["weight"].sum()
    group["weight"] = group["weight"].max()
    return group

for label, treatment_group in df.groupby("treatments"):
    post_scores = treatment_group.groupby('post_id')[['post_id', 'month', 'weight', 'weighted_sentiment']].apply(post_group).drop_duplicates()
    post_scores["weighted_sentiment"] = post_scores["weight"] * post_scores["sentiment"]
    month_groups = post_scores.groupby('month')
    month_scores = month_groups.apply(lambda r: r["weighted_sentiment"].sum() / r["weight"].sum()).to_frame().reset_index()
    month_scores.columns = ["month", "score"]
    p = figure(
        y_axis_label='score',
        x_axis_type='datetime',
        x_range=Range1d(np.datetime64('2011', 'Y'), np.datetime64('2018', 'Y'), bounds='auto'),
        tools='xwheel_zoom,xpan,reset,save',
        active_scroll='xwheel_zoom',
        active_drag='xpan',
        # sizing_mode='stretch_both',
    )
    p.line("month", "score", line_width=2, source=month_scores)
    show(p)
    break

In [6]:
len(df)

79231

In [44]:
output_notebook()

In [36]:
df.head(30)

Unnamed: 0,subforum,post_id,timestamp,thread_id,agrees,sentence,treatments,sentiment,factuality,weight
0,collaboration-space.109,post-185994,2016-06-03 10:59:00,what-is-this-section-about.15627,1,Like info on masking and help with anxiety and...,masking,0,no,0.3
1,collaboration-space.109,post-185994,2016-06-03 10:59:00,what-is-this-section-about.15627,1,People should probably quickly go on corticost...,steroids,0,no,0.3
2,introduce-yourself.11,post-249993,2017-05-18 22:38:00,new-to-tt.21785,0,"Accept it and use your enviroment for you, ,th...",masking,0,no,0.2
3,support.2,post-55450,2014-07-11 21:58:00,i-am-confused-about-habituation.5435,0,Gave me Steroids and antivert.,steroids,0,no,0.2
4,support.2,post-55538,2014-07-12 14:36:00,i-am-confused-about-habituation.5435,0,This time I am using 'distraction' sounds (I p...,masking,0,no,0.2
5,support.2,post-55570,2014-07-12 18:25:00,i-am-confused-about-habituation.5435,0,"I'm not on any blood pressure drugs, and I tak...",magnesium,0,yes,1.0
6,support.2,post-55587,2014-07-12 19:34:00,i-am-confused-about-habituation.5435,0,A goog thing today was that I did not do any m...,masking,0,yes,1.0
7,support.2,post-55587,2014-07-12 19:34:00,i-am-confused-about-habituation.5435,0,Masking is not really possible anyway.,masking,0,yes,1.0
8,introduce-yourself.11,post-610,2011-11-13 20:08:00,new-here-and-to-tt-and-alternative-treatments.165,0,I purchased a CD from Universal Sound Therapy ...,soundcure,0,no,0.2
9,introduce-yourself.11,post-610,2011-11-13 20:08:00,new-here-and-to-tt-and-alternative-treatments.165,0,I decided on that as another course of action ...,masking,0,no,0.2


In [44]:
treatment_groups = df.groupby("treatments")
one_year_ago = np.datetime64(datetime.now(), 'D') - np.timedelta64(365, 'D')
two_years_ago = one_year_ago - np.timedelta64(365, 'D')
for label, group in treatment_groups:
    data = {}
    data["last_year_cnt"] = len(group[
        group["timestamp"] > one_year_ago
    ])
    data["previous_year_cnt"] = len(group[
        (group["timestamp"] > two_years_ago) & (group["timestamp"] < one_year_ago)
    ])
    data["most_popular_thread"] = group['thread_id'].value_counts().idxmax(3)
    print(data)

{'last_year_cnt': 324, 'most_popular_thread': 'am-101-clinical-trial-%E2%80%94-participants-updates-and-discussion.6558', 'previous_year_cnt': 1065}
{'last_year_cnt': 313, 'most_popular_thread': 'vitamin-b12-link-between-b12-deficiency-and-tinnitus.11036', 'previous_year_cnt': 346}
{'last_year_cnt': 19, 'most_popular_thread': 'acamprosate-campral.394', 'previous_year_cnt': 59}
{'last_year_cnt': 299, 'most_popular_thread': 'acoustic-cr%C2%AE-neuromodulation-do-it-yourself-guide.1469', 'previous_year_cnt': 372}
{'last_year_cnt': 262, 'most_popular_thread': 'acupuncture-for-tinnitus-%E2%80%94-treatment-protocols.322', 'previous_year_cnt': 230}
{'last_year_cnt': 1908, 'most_popular_thread': 'antidepressants-ssris-snris-maos-tcas-tecas.768', 'previous_year_cnt': 1853}
{'last_year_cnt': 19, 'most_popular_thread': 'aripiprazole-abilify-case-reports-of-diminishing-lowering-tinnitus.21736', 'previous_year_cnt': 2}
{'last_year_cnt': 72, 'most_popular_thread': 'back-to-silence.7172', 'previous_ye

In [9]:
tr_mon = pandas.crosstab(df["treatments"], df["month"])
tr_mon["sum"] = tr_mon.sum(axis=1)
tr_mon.sort_values("sum", ascending=False, inplace=True)
tr_mon = tr_mon.drop("sum", 1)
head = tr_mon.head(10)
tail = tr_mon.tail(len(tr_mon.index) - 10).sum(axis=0)
tail.name = "Rest"
tr_mon = head.append(tail)
tr_mon = tr_mon.T
ColumnDataSource(tr_mon)

In [10]:
area = Area(
    tr_mon,
    # x=list(tr_mon),
    # y=list(tr_mon["treatments"]),
    # x_axis_type="datetime",
    #y_range=Range1d(0,2500, bounds='auto'),
    x_range=Range1d(np.datetime64('2011', 'Y'), np.datetime64('2018', 'Y'), bounds='auto'),
    tools='xwheel_zoom,xpan,reset,save',
    active_scroll='xwheel_zoom',
    active_drag='xpan',
    palette=Inferno11,
    stack=True,
    plot_width=1000
)

In [11]:
show(area)

You can access Timestamp as pandas.Timestamp
  if pd and isinstance(obj, pd.tslib.Timestamp):
