In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from pandas.plotting import register_matplotlib_converters
# Register pandas' date/time converters with matplotlib so pandas datetime
# values can be placed directly on plot axes.
register_matplotlib_converters()
# Use seaborn's "whitegrid" style for every figure created in this notebook.
sns.set(style="whitegrid")

In [None]:
def save_to_pdf(filename, ax):
    """Save the figure that owns ``ax`` to ``filename`` with a tight bounding box.

    Parameters
    ----------
    filename : str, path-like or file-like
        Destination passed straight to ``Figure.savefig``. When it is a path,
        any missing parent directory is created first so that saving into a
        not-yet-existing folder (e.g. ``./plots``) does not fail.
    ax : matplotlib axes
        Any axes belonging to the figure that should be written.

    Returns
    -------
    None
    """
    import os  # local import keeps the helper usable without touching the import cell

    # Only path-like destinations have a parent directory; file-like objects
    # are handed to savefig untouched.
    if isinstance(filename, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(filename))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    fig = ax.get_figure()
    fig.savefig(filename, bbox_inches='tight')

In [None]:
# Load the activity table. Later cells read its min_timestamp, max_timestamp,
# count_posts and author_id columns.
df = pd.read_csv('../data/activity/daily-mail.csv')

In [None]:
# Show the first row as a quick check of the loaded columns.
df.head(1)

In [None]:
# Show the dtype of every column (the timestamp columns are parsed to
# datetimes in the next cell).
df.dtypes

In [None]:
# Parse both interval-boundary columns into pandas datetimes.
for boundary_col in ('min_timestamp', 'max_timestamp'):
    df[boundary_col] = pd.to_datetime(df[boundary_col])

In [None]:
# Midpoint of each [min_timestamp, max_timestamp] interval:
# the start plus half of the interval length.
df['mean_timestamp'] = df['min_timestamp'] + df['max_timestamp'].sub(df['min_timestamp']).div(2)

In [None]:
# Start of the calendar week (pandas 'W' period) containing each interval
# boundary. The vectorised `.dt.start_time` accessor replaces a per-row
# Python `apply`, which is much slower on large frames.
df['min_week_start'] = df['min_timestamp'].dt.to_period('W').dt.start_time
df['max_week_start'] = df['max_timestamp'].dt.to_period('W').dt.start_time

In [None]:
# Keep rows with more than 5 posts, ordered by the interval midpoint.
# Filtering and sorting are chained so the sort does not run in place on a
# filtered slice (which triggers pandas' SettingWithCopyWarning).
df = df[df['count_posts'] > 5].sort_values('mean_timestamp')

In [None]:
# Distinct week starts seen among interval starts (first line printed)
# and among interval ends (second line printed).
for week_col in ('min_week_start', 'max_week_start'):
    print('number of weeks', df[week_col].nunique())

In [None]:
# Number of distinct authors among rows with more than 5 posts.
print('Unique users', df.loc[df['count_posts'] > 5, 'author_id'].nunique())

In [None]:
# Summary statistics of the current contents of df.
df.describe()

In [None]:
# Report the observation window and the number of calendar weeks it touches.
# The week count is derived from the data instead of being a hard-coded
# number that silently goes stale when the input file changes.
first_ts = df['min_timestamp'].min()
last_ts = df['max_timestamp'].max()
print(first_ts)
print(last_ts)

first_week_start = first_ts.to_period('W').start_time
last_week_start = last_ts.to_period('W').start_time
# Week starts are exactly 7 days apart, so the day difference is a multiple of 7.
span_weeks = (last_week_start - first_week_start).days // 7 + 1
print('therefore', span_weeks, 'weeks')

In [None]:
from datetime import timedelta, date

def daterange(date1, date2):
    """Yield ``date1``, ``date1`` + 1 day, ... in one-day steps.

    The number of steps is the whole-day part of ``date2 - date1`` (its
    ``.days`` attribute), so the last value yielded is at most ``date2``.
    Nothing is yielded when ``date2`` is more than a fraction of a day
    before ``date1``.
    """
    whole_days = int((date2 - date1).days)
    offset = 0
    while offset <= whole_days:
        yield date1 + timedelta(offset)
        offset += 1

# One entry per calendar day of the observation window.
start_dt = df['min_timestamp'].min()
end_dt = df['max_timestamp'].max()
# Step from midnight of the first day. daterange advances in whole days from
# its start value, so starting at an arbitrary time of day could stop on the
# day *before* end_dt and miss end_dt's calendar week entirely. That would
# later surface as a KeyError when weeks are mapped to heatmap columns.
dates = list(daterange(start_dt.normalize(), end_dt))

In [None]:
# Turn the list of days into a frame with the running position in column
# 'index' and the parsed datetime in column 0.
dates_df = pd.to_datetime(pd.DataFrame(dates)[0]).reset_index()

In [None]:
# Start of the calendar week (pandas 'W' period) that each day belongs to,
# computed with the vectorised accessor rather than a per-row Python apply.
dates_df['week_of_date'] = dates_df[0].dt.to_period('W').dt.start_time

In [None]:
# Sorted array of the distinct week starts in the observation window.
# drop_duplicates/sort_values/to_numpy always yields a plain NumPy array,
# whereas Series.unique() may return a pandas DatetimeArray, which has no
# in-place .sort() method.
unqiue_dates = (
    pd.to_datetime(dates_df['week_of_date'])
    .drop_duplicates()
    .sort_values()
    .to_numpy()
)

In [None]:
# Number of distinct week starts; each one is mapped to an integer position
# in the next cell.
len(unqiue_dates)

In [None]:
# Lookup from week-start timestamp to its position in the sorted week list.
map_timestamp_to_index = {
    week_start: position
    for position, week_start in enumerate(pd.Series(unqiue_dates))
}

In [None]:
# Relabel rows 0..len(df)-1 so row labels equal row positions after the
# earlier filter and sort.
df.index = pd.RangeIndex(len(df))

In [None]:
# Build the activity heatmap: one row per row of df, one column per calendar
# week. Every week from a row's first to its last active week (inclusive) is
# filled with that row's count_posts.
#
# The shape is derived from the data: len(df) rather than a non-null count of
# author_id (which can be smaller than the number of rows), and the number of
# known weeks rather than a hard-coded width.
n_rows = len(df)
n_week_columns = len(unqiue_dates)
heatmap = np.zeros((n_rows, n_week_columns))

week_spans = zip(df['min_week_start'], df['max_week_start'], df['count_posts'])
for row_pos, (first_week, last_week, n_posts) in enumerate(week_spans):
    # A missing week raises KeyError here instead of being skipped silently.
    start_index = map_timestamp_to_index[first_week]
    end_index = map_timestamp_to_index[last_week]
    heatmap[row_pos, start_index:end_index + 1] = n_posts

print('Finished building heatmap of shape', heatmap.shape)

In [None]:
# Persist the heatmap matrix to disk in NumPy's .npy format.
np.save('../data/activity/heatmap_dailymail_ordered_max.npy', heatmap)

In [None]:
# Reload the heatmap saved by the previous cell. The original line loaded the
# Guardian matrix, which replaced the Daily Mail data just built and no
# longer matched the week axis (unqiue_dates) used by the plots below.
heatmap = np.load('../data/activity/heatmap_dailymail_ordered_max.npy')

In [None]:
from sklearn import preprocessing

In [None]:
# Rescale every week column to [0, 1], then flatten all positive values to 1
# so the matrix only records whether a cell is active.
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1), copy=True)
scaled = scaler.fit_transform(heatmap)
scaled[scaled > 0] = 1

In [None]:
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel
try:
    from sklearn.metrics import jaccard_similarity_score
except ImportError:
    # jaccard_similarity_score was removed in scikit-learn 0.23. It is not
    # called in the cells visible here, so a missing import must not stop the
    # notebook. NOTE(review): switch to sklearn.metrics.jaccard_score if a
    # Jaccard measure is still needed; its semantics differ, so do not alias.
    jaccard_similarity_score = None

# Standardise each week column, then compare the first row with every row.
# A plain dot product (linear_kernel) equals cosine similarity only for
# L2-normalised rows, which `scale` does not produce, so use
# cosine_similarity to match what the variable name promises.
x = preprocessing.scale(heatmap)
cosine_similarities = cosine_similarity(x[0:1], x)

In [None]:
# Display the similarities between the first heatmap row and every row.
cosine_similarities

In [None]:
import matplotlib.dates as mdates
myFmt = mdates.DateFormatter('%Y')

# Binary activity heatmap: rows of `scaled` on the y axis, weeks on the x axis.
fig, ax = plt.subplots(figsize=(13,6))
ax.imshow(scaled, interpolation="nearest", cmap='Blues', aspect='auto', vmin=0, vmax=1)

# Column position -> week-start timestamp, used to label the x axis by year.
index_to_date = pd.DataFrame(unqiue_dates).to_dict()[0]

# Pin the tick positions explicitly (every 50th week) before labelling them.
# Setting labels alone depends on whichever tick positions matplotlib happens
# to choose for the current figure size and data width.
n_labelled_columns = min(scaled.shape[1], len(index_to_date))
tick_positions = list(range(0, n_labelled_columns, 50))
ax.set_xticks(tick_positions)
ax.set_xticklabels([index_to_date[week_index].year for week_index in tick_positions])

save_to_pdf('./plots/dailymail-daily_heatmap.pdf', ax)

In [None]:
# Column totals of the binarised matrix: number of active rows per week.
sum_histogram = scaled.sum(axis=0)

In [None]:
# Weekly activity curve with a lightly shaded area underneath.
fig, ax = plt.subplots(figsize=(8,6))
sns.lineplot(x=unqiue_dates, y=sum_histogram, linewidth=2.5, markers=True, dashes=True, ax=ax)
l1 = ax.lines[0]

# Get the xy data from the line so that the area below it can be shaded.
x1 = l1.get_xydata()[:,0]
y1 = l1.get_xydata()[:,1]
ax.fill_between(x1,y1, color="blue", alpha=0.1)

# This notebook analyses the Daily Mail data set, so the title says so (it
# previously read 'Guardian').
# NOTE(review): sum_histogram counts active rows of the binarised matrix per
# week, so 'Number of Comments' may not describe the y axis - confirm.
ax.set(xlabel='Date', ylabel='Number of Comments', title='Daily Mail')
save_to_pdf('./plots/dailymail-daily_user_count.pdf', ax)