# Start

In [None]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
# warnings.simplefilter(action='ignore', category=SettingWithCopyWarning)
import pandas as pd
import seaborn as sns
import numpy as np
from scipy.stats import pearsonr
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display_html
sns.set_style('darkgrid')

## Functions

In [None]:

def display_dfs(dfs, gap=50, justify='center'):
    """Render several DataFrames next to each other in the notebook output.

    Parameters
    ----------
    dfs : dict
        Maps a heading to the DataFrame shown beneath it; panels appear in
        the dict's iteration order.
    gap : int
        Horizontal spacing between panels, in pixels.
    justify : str
        Value for the CSS ``justify-content`` property of the flex container.
    """
    # One <div> per frame: heading followed by the frame's own HTML repr.
    panels = "".join(
        f'<div> <h3>{title}</h3> {frame._repr_html_()}</div>'
        for title, frame in dfs.items()
    )
    # Wrap the panels in a flex row so they sit side by side.
    page = f"""
    <div style="display:flex; gap:{gap}px; justify-content:{justify};">
        {panels}
    </div>
    """
    display_html(page, raw=True)

def mynorm(arr):
    """Min-max scale `arr` so its smallest value maps to 0 and its largest to 1.

    Works on anything exposing `.min()` / `.max()` and arithmetic (NumPy
    arrays, Series, DataFrames). For a DataFrame the scaling is per column.
    A constant input has zero range, so the division yields NaN.
    """
    lowest = arr.min()
    value_range = arr.max() - lowest
    return (arr - lowest) / value_range


def remove_outliers(d,cols_=['y'],whisker=2):
    """Drop rows that are IQR outliers in any of `cols_`, then min-max rescale `y`.

    Parameters
    ----------
    d : pandas.DataFrame
        Input frame; it is not modified. Must contain a `y` column.
    cols_ : str or list of str
        Column(s) checked for outliers. A single column name is accepted as
        a bare string.
    whisker : float
        A row is dropped when a checked value lies more than `whisker` * IQR
        below the first or above the third quartile. Defaults to 2.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows, with `y` rescaled to
        [0, 1] via `mynorm` (regardless of which columns were checked).
    """
    # Always index with a list so d[cols] is a DataFrame and .any(axis=1) is valid.
    cols = [cols_] if isinstance(cols_, str) else list(cols_)

    q1 = d[cols].quantile(0.25)
    q3 = d[cols].quantile(0.75)
    iqr = q3 - q1

    too_low = d[cols] < (q1 - whisker * iqr)
    too_high = d[cols] > (q3 + whisker * iqr)
    is_outlier = (too_low | too_high).any(axis=1)

    # Explicit copy: assigning into a boolean-filtered slice would otherwise
    # raise SettingWithCopyWarning.
    kept = d.loc[~is_outlier].copy()
    kept['y'] = mynorm(kept['y'])
    return kept


def plot_corr(df,var):
    """Scatter `df[var]` against `df.y` (both min-max scaled) with a linear fit."""
    x_scaled = mynorm(df[var])
    y_scaled = mynorm(df.y)
    sns.regplot(x=x_scaled, y=y_scaled, order=1)
    plt.show()



def calculate_pvalues(df):
    """Pairwise Pearson p-values for every pair of columns in `df`.

    For each pair, rows where either column is null are dropped before the
    test is run.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed and labelled by `df.columns`; the cell at
        row `c`, column `r` holds the p-value for columns `r` and `c`,
        rounded to 4 decimals.
    """
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for r in df.columns:
        for c in df.columns:
            both_present = df[r].notnull() & df[c].notnull()
            tmp = df.loc[both_present]
            # Write through .loc: the chained form pvalues[r][c] = ... assigns
            # into a temporary under pandas copy-on-write and is silently lost.
            pvalues.loc[c, r] = round(pearsonr(tmp[r], tmp[c])[1], 4)
    return pvalues

## Load

In [None]:
# Columns carried forward into the analysis.
kept_columns = [
    'id', 'seo_complex2', 'seo_veracity', 'seo_loaded', 'seo_mb',
    'seo_pers', 'seo_sento', 'seo_subj',
    'publish_date_cest', 'channel', 'access_level', 'seo_title', 'y', 'body_mb', 'body_pers',
]

d = pd.read_csv('../datasets/datasets_final.csv')
# Replace seo_subj with the row-wise mean of the two subjectivity columns.
d['seo_subj'] = d[['seo_subj', 'seo_subj2']].mean(axis=1)
d = d[kept_columns]

In [None]:
# Raw column name -> readable name used in the rest of the notebook.
readable_names = {
    'seo_complex2': 'language_complexity',
    'seo_veracity': 'inversed_veracity',
    'seo_loaded': 'loaded_language',
    'seo_mb': 'generalized_bias',
    'seo_pers': 'persuasivness',
    'seo_sento': 'sentiment',
    'seo_subj': 'subjective_bias',
    'body_mb': 'body_bias_score',
    'body_pers': 'body_persuasive_score',
}
d = d.rename(columns=readable_names)

In [None]:
# Working frame: every channel, minus articles whose access level is 'conditional'.
# The former top-14-channel filter on this line was a dead store (it was overwritten
# by `d.copy()` on the very next line), so it has been removed; the per-topic
# functions further down select the 14 largest channels themselves.
# `.copy()` makes df independent of `d`, so the column assignments in later cells
# (month, day, *_bias_metric) do not raise SettingWithCopyWarning.
df = d[d.access_level != 'conditional'].copy()


# Different setups

## granularity

In [None]:
# Composite scores: unweighted row-wise means of the individual title / body measures.
title_bias_parts = ['loaded_language','subjective_bias','generalized_bias','persuasivness','inversed_veracity']
body_bias_parts = ['body_bias_score','body_persuasive_score']

df['title_bias_metric'] = df[title_bias_parts].mean(axis=1)
df['body_bias_metric'] = df[body_bias_parts].mean(axis=1)

In [None]:
# Calendar keys sliced out of the timestamp string
# (assumes publish_date_cest starts with 'YYYY-MM-DD' -- TODO confirm).
publish_text = df.publish_date_cest.astype(str)
df['month'] = publish_text.str[5:7]
df['day'] = publish_text.str[:10]

df_sorted=df.sort_values(by='day')
# numeric_only=True: the frame still holds text columns (channel, seo_title, ...);
# pandas >= 2.0 raises on them instead of silently dropping them.
df_sorted_f=df_sorted[df_sorted.access_level=='free'].groupby('day').mean(numeric_only=True)
df_sorted_p=df_sorted[df_sorted.access_level=='paid'].groupby('day').mean(numeric_only=True)

In [None]:
# Articles of the 'Ausland' channel, ordered by publication day.
aus=df[df.channel == 'Ausland'].sort_values(by='day')

In [None]:
# Running bucket index for the Ausland rows (already ordered by day): the counter
# advances on the first row of every run of rows whose day-of-month is a multiple of 7.
# NOTE(review): this is only a rough stand-in for calendar weeks -- a day-of-month
# divisible by 7 is not a fixed weekday, and a boundary day without any article
# never advances the counter.
day_of_month = aus['day'].str[-2:].astype(int)
on_boundary = day_of_month % 7 == 0
opens_bucket = on_boundary & ~on_boundary.shift(1, fill_value=False)
weeks = opens_bucket.cumsum().tolist()

In [None]:
# Attach the bucket index built in the previous cell; `weeks` holds one entry per row of `aus`, in row order.
aus['week'] = weeks

In [None]:
# Min-max rescale y within the Ausland subset.
aus['y'] = mynorm(aus.y)

In [None]:
# Per-bucket means of the numeric columns, min-max scaled per column and ordered by
# inversed_veracity. numeric_only=True because `aus` still carries text columns,
# which pandas >= 2.0 refuses to average.
mynorm(aus.groupby('week').mean(numeric_only=True).sort_values(by='inversed_veracity'))

In [None]:
# Inspect the Ausland rows that fall into buckets 20-22.
# NOTE(review): the reason for picking these three buckets is not recorded -- confirm.
aus[aus.week.isin([20,21,22])]

## detour

In [None]:
# Monthly mean of title language complexity with a normal-approximation 95% band
# (mean +/- 1.96 * standard error).
# Only numeric columns are aggregated: mean/sem are undefined for the text columns.
stats = df.select_dtypes('number').groupby(df['month']).agg(['mean','sem'])
# The column was renamed from 'seo_complex2' to 'language_complexity' in the load section.
# .copy() so the ci95_* columns are added to an independent frame, not a slice of `stats`.
d1 = stats['language_complexity'].copy()
d1['ci95_hi'] = d1['mean'] + 1.96* d1['sem']
d1['ci95_lo'] = d1['mean'] - 1.96* d1['sem']
month_pos = np.arange(len(d1['mean']))
sns.lineplot(x=month_pos,y=d1['mean'],ci=None)
plt.fill_between(month_pos,d1['ci95_hi'], d1['ci95_lo'], color='blue', alpha=0.1)


## grouping

In [None]:
# Site-wide aggregates. Naming: gb + granularity (d=day, w=week, m=month)
# + optional access level (f=free, p=paid).
# numeric_only=True: df still holds text columns, which pandas >= 2.0 refuses to average.
df_free = df[df.access_level=='free']
df_paid = df[df.access_level=='paid']

gbd_all = df.groupby(by='day').mean(numeric_only=True)
gbm_all = df.groupby(by='month').mean(numeric_only=True)
gbmp_all = df_paid.groupby(by='month').mean(numeric_only=True)
gbdp_all = df_paid.groupby(by='day').mean(numeric_only=True)
gbmf_all = df_free.groupby(by='month').mean(numeric_only=True)
gbdf_all = df_free.groupby(by='day').mean(numeric_only=True)

# "Weeks" are buckets of 7 consecutive days-with-data. gbw_all used to bucket 7
# consecutive *article rows* of df_sorted; it now starts from the daily means so it
# matches the free/paid weekly frames. The deprecated axis=0 argument is dropped
# (it is the default).
gbw_all = gbd_all.groupby(np.arange(len(gbd_all))//7).mean()
gbwf_all = df_sorted_f.groupby(np.arange(len(df_sorted_f.index))//7).mean()
gbwp_all = df_sorted_p.groupby(np.arange(len(df_sorted_p.index))//7).mean()

In [None]:
def plot_by_topic(d):
    """Per channel, show how weekly feature averages correlate with `y`.

    For each of the 14 channels with the most rows, the articles are averaged
    per day, bucketed into runs of 7 consecutive days-with-data, cleaned with
    `remove_outliers`, and the correlation of every column with `y` is shown
    side by side for free, paid and all articles. Also prints the row count,
    the free share, the number of surviving weekly buckets and the correlation
    between the free and paid correlation profiles.

    Parameters
    ----------
    d : pandas.DataFrame
        Article-level frame with at least `id`, `channel`, `access_level`,
        `publish_date_cest` and `y`. It is not modified.
    """
    # Derive the calendar keys on a new frame so the caller's DataFrame stays untouched.
    d = d.assign(
        month=d.publish_date_cest.apply(lambda x: str(x)[5:7]),
        day=d.publish_date_cest.apply(lambda x: str(x)[:10]),
    )

    # Ascending by row count, so the slice keeps the 14 largest channels.
    topics = d.groupby('channel').count().sort_values(by='id').reset_index()['channel'][-14:]

    for topic in topics:
        print(topic)

        topic_rows = d[d.channel == topic].sort_values(by='day')
        print("Full length:",len(topic_rows))

        free_rows = topic_rows[topic_rows.access_level=='free']
        paid_rows = topic_rows[topic_rows.access_level=='paid']

        # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises).
        daily_free = free_rows.groupby('day').mean(numeric_only=True)
        daily_paid = paid_rows.groupby('day').mean(numeric_only=True)
        daily_both = topic_rows.groupby('day').mean(numeric_only=True)

        # Guard against a topic without any free/paid article.
        n_split = len(free_rows) + len(paid_rows)
        print("Free to paid ratio:",len(free_rows)/n_split if n_split else float('nan'))

        # Weekly buckets, with outliers removed once per frame instead of on every use.
        gbwf = remove_outliers(daily_free.groupby(np.arange(len(daily_free))//7).mean())
        gbwp = remove_outliers(daily_paid.groupby(np.arange(len(daily_paid))//7).mean())
        gbw = remove_outliers(daily_both.groupby(np.arange(len(daily_both))//7).mean())

        free_profile = gbwf.corr().y
        paid_profile = gbwp.corr().y
        both_profile = gbw.corr().y

        # The frames are weekly buckets, so the count is reported in weeks.
        print("Final number of weeks:",len(gbw))

        print(np.corrcoef(free_profile,paid_profile)[0][1])

        dfs = {
            'free': free_profile.sort_values().reset_index(),
            'paid': paid_profile.sort_values().reset_index(),
            'both': both_profile.sort_values().reset_index(),
        }
        display_dfs(dfs, justify='flex-start')

In [None]:
def plot_by_topic2(d):
    """Daily, smoothed variant of the per-topic correlation report.

    For 'Wissenschaft', 'Ausland' and 'Psychologie', the articles are averaged
    per day, cleaned with `remove_outliers`, smoothed with whichever `bla` is
    currently defined, and the correlation of every column with `y` is shown
    side by side for free, paid and all articles.

    Parameters
    ----------
    d : pandas.DataFrame
        Article-level frame with at least `channel`, `access_level`,
        `publish_date_cest` and `y`. It is not modified.
    """
    # Derive the calendar keys on a new frame so the caller's DataFrame stays untouched.
    d = d.assign(
        month=d.publish_date_cest.apply(lambda x: str(x)[5:7]),
        day=d.publish_date_cest.apply(lambda x: str(x)[:10]),
    )

    topics = ['Wissenschaft','Ausland','Psychologie']

    for topic in topics:
        print(topic)

        topic_rows = d[d.channel == topic].sort_values(by='day')
        print("Full length:",len(topic_rows))

        free_rows = topic_rows[topic_rows.access_level=='free']
        paid_rows = topic_rows[topic_rows.access_level=='paid']

        # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises).
        gbwf = bla(remove_outliers(free_rows.groupby('day').mean(numeric_only=True)))
        gbwp = bla(remove_outliers(paid_rows.groupby('day').mean(numeric_only=True)))

        # Guard against a topic without any free/paid article.
        n_split = len(free_rows) + len(paid_rows)
        print("Free to paid ratio:",len(free_rows)/n_split if n_split else float('nan'))

        gbw = bla(remove_outliers(topic_rows.groupby('day').mean(numeric_only=True)))

        free_profile = gbwf.corr().y
        paid_profile = gbwp.corr().y
        both_profile = gbw.corr().y

        # The frames hold one row per day, so the count is reported in days.
        print("Final number of days:",len(gbw))

        print(np.corrcoef(free_profile,paid_profile)[0][1])

        dfs = {
            'free': free_profile.sort_values().reset_index(),
            'paid': paid_profile.sort_values().reset_index(),
            'both': both_profile.sort_values().reset_index(),
        }
        display_dfs(dfs, justify='flex-start')

In [None]:
def get_corefs(d):
    """Similarity of the free and paid correlation profiles, per channel.

    For each of the 14 channels with the most rows, free and paid articles are
    averaged per day, bucketed into runs of 7 consecutive days-with-data and
    cleaned with `remove_outliers`. The correlation of every column with `y`
    is computed for both groups, and the two profiles are correlated with each
    other.

    Parameters
    ----------
    d : pandas.DataFrame
        Article-level frame with at least `id`, `channel`, `access_level`,
        `publish_date_cest` and `y`. It is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per channel with columns `topic` and `corr`.
    """
    # Derive the calendar keys on a new frame so the caller's DataFrame stays untouched.
    d = d.assign(
        month=d.publish_date_cest.apply(lambda x: str(x)[5:7]),
        day=d.publish_date_cest.apply(lambda x: str(x)[:10]),
    )

    # Ascending by row count, so the slice keeps the 14 largest channels.
    topics = d.groupby('channel').count().sort_values(by='id').reset_index()['channel'][-14:]
    corefs=[]
    for topic in topics:
        topic_rows = d[d.channel == topic].sort_values(by='day')
        # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises).
        daily_free = topic_rows[topic_rows.access_level=='free'].groupby('day').mean(numeric_only=True)
        daily_paid = topic_rows[topic_rows.access_level=='paid'].groupby('day').mean(numeric_only=True)

        gbwf = daily_free.groupby(np.arange(len(daily_free))//7).mean()
        gbwp = daily_paid.groupby(np.arange(len(daily_paid))//7).mean()

        free_profile = remove_outliers(gbwf).corr().y
        paid_profile = remove_outliers(gbwp).corr().y
        corefs.append({'topic':topic,'corr':np.corrcoef(free_profile,paid_profile)[0][1]})

    return pd.DataFrame(corefs)

In [None]:
# Bar chart of the free-vs-paid profile correlation per channel, ascending.
# NOTE(review): `[:-1]` drops whatever sorts last (NaN rows sort last, otherwise the
# largest value) -- confirm which of the two is meant to be excluded.
sns.barplot(data=get_corefs(df).sort_values(by='corr')[:-1],x='topic',y='corr')
plt.xticks(rotation=45)

In [None]:
def get_topic_week(df,topic):
    """Weekly, outlier-free averages for one channel.

    Articles of `topic` are averaged per day, then bucketed into runs of 7
    consecutive days-with-data, and each result is cleaned with
    `remove_outliers`.

    Parameters
    ----------
    df : pandas.DataFrame
        Article-level frame that already has a `day` column.
    topic : str
        Value of `channel` to select.

    Returns
    -------
    tuple of pandas.DataFrame
        (free, paid, all) weekly frames.
    """
    topic_rows = df[df.channel == topic].sort_values(by='day')

    # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises).
    daily_free = topic_rows[topic_rows.access_level=='free'].groupby('day').mean(numeric_only=True)
    daily_paid = topic_rows[topic_rows.access_level=='paid'].groupby('day').mean(numeric_only=True)
    daily_both = topic_rows.groupby('day').mean(numeric_only=True)

    # Deprecated axis=0 argument dropped; it is the default.
    gbwf = remove_outliers(daily_free.groupby(np.arange(len(daily_free))//7).mean())
    gbwp = remove_outliers(daily_paid.groupby(np.arange(len(daily_paid))//7).mean())
    gbw = remove_outliers(daily_both.groupby(np.arange(len(daily_both))//7).mean())

    return gbwf,gbwp,gbw

In [None]:
def get_topic_month(df,topic):
    """Monthly, outlier-free averages for one channel.

    Parameters
    ----------
    df : pandas.DataFrame
        Article-level frame that already has a `month` column.
    topic : str
        Value of `channel` to select.

    Returns
    -------
    tuple of pandas.DataFrame
        (free, paid, all) monthly frames, each passed through `remove_outliers`.
    """
    topic_rows = df[df.channel == topic]

    # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises).
    gbwf = remove_outliers(topic_rows[topic_rows.access_level=='free'].groupby('month').mean(numeric_only=True))
    gbwp = remove_outliers(topic_rows[topic_rows.access_level=='paid'].groupby('month').mean(numeric_only=True))
    gbw = remove_outliers(topic_rows.groupby("month").mean(numeric_only=True))

    return gbwf,gbwp,gbw

In [None]:
def get_topic_day(df,topic):
    """Daily, outlier-free, smoothed averages for one channel.

    Each daily frame is cleaned with `remove_outliers` and then passed through
    whichever `bla` smoother is currently defined in the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Article-level frame that already has a `day` column.
    topic : str
        Value of `channel` to select.

    Returns
    -------
    tuple of pandas.DataFrame
        (free, paid, all) daily frames.
    """
    topic_rows = df[df.channel == topic]

    # numeric_only=True: text columns cannot be averaged (pandas >= 2.0 raises).
    gbwf = bla(remove_outliers(topic_rows[topic_rows.access_level=='free'].groupby('day').mean(numeric_only=True)))
    gbwp = bla(remove_outliers(topic_rows[topic_rows.access_level=='paid'].groupby('day').mean(numeric_only=True)))
    gbw = bla(remove_outliers(topic_rows.groupby("day").mean(numeric_only=True)))

    return gbwf,gbwp,gbw

In [None]:
    def bla(d, window=10):
        """Smooth `d` with a rolling mean over `window` rows, then min-max scale it.

        The first `window - 1` rows have no full window and come out as NaN.
        `bla` is redefined further down the notebook with exponential
        smoothing; whichever definition ran last is the one used by
        `get_topic_day` and `plot_by_topic2`.
        """
        return mynorm(d.rolling(window=window).mean())

In [None]:
# Daily correlation report for the three hand-picked channels, using the rolling-mean `bla` defined just above.
plot_by_topic2(df)

## Ausland

In [None]:
def bla(d, alpha=0.1):
    """Smooth `d` with an exponentially weighted mean, then min-max scale it.

    `alpha` is the smoothing factor handed to `ewm` (larger = less smoothing).
    This overrides the earlier rolling-mean `bla`; functions that call `bla`
    (`get_topic_day`, `plot_by_topic2`) use this version from here on.
    """
    return mynorm(d.ewm(alpha=alpha).mean())

In [None]:
# Correlation heat-map of the daily Ausland frame; cells whose Pearson p-value
# exceeds 0.05 are masked out.
gbwf, gbwp, gbw = get_topic_day(df, 'Ausland')
ausland_pvalues = calculate_pvalues(gbw)
mask = (ausland_pvalues > 0.05).astype(int)
sns.heatmap(gbw.corr(), mask=mask, annot=True)

In [None]:
# Correlation of every column with y, ascending.
gbw.corr().y.sort_values()

In [None]:
# Per-column correlation with y for free vs. paid articles; y's self-correlation is dropped.
free_profile = gbwf.corr().y.drop('y')
paid_profile = gbwp.corr().y.drop('y')
sns.lineplot(data=free_profile, marker='o')
sns.lineplot(data=paid_profile, marker='o')
plt.xticks(rotation=45)


In [None]:
# y against subjective bias over time for the current `gbw` frame.
y_series = gbw['y']
bias_series = gbw['subjective_bias']
sns.lineplot(data=y_series)
sns.lineplot(data=bias_series, color='#F29492')
plt.xticks([])
plt.ylabel(None)
plt.xlabel('days')

In [None]:
# Columns available in the current `gbw` frame.
gbw.columns

In [None]:
# Linear fit of y on subjective bias for free articles, both axes min-max scaled.
x_scaled = mynorm(gbwf['subjective_bias'])
y_scaled = mynorm(gbwf.y)
sns.regplot(x=x_scaled, y=y_scaled, order=1)
plt.ylabel('#pageviews')
plt.xlabel("bias of the article")

In [None]:
# y against the body bias metric over time for free articles, on a dashed grid.
sns.set_style("darkgrid", {'grid.linestyle': '--'})
y_series = gbwf['y']
body_bias_series = gbwf['body_bias_metric']
sns.lineplot(data=y_series)
sns.lineplot(data=body_bias_series, color='#F29492')
plt.xticks([])

plt.show()

## Wissenschaft

In [None]:
# Wissenschaft: y against the body persuasiveness score over time.
gbwf, gbwp, gbw = get_topic_day(df, 'Wissenschaft')

y_series = gbw['y']
persuasive_series = gbw['body_persuasive_score']
sns.lineplot(data=y_series)
sns.lineplot(data=persuasive_series, color='#F29492')
plt.xticks([])
plt.ylabel(None)
plt.xlabel(None)

In [None]:
# Wissenschaft: y against title language complexity over time.
gbwf, gbwp, gbw = get_topic_day(df, 'Wissenschaft')

y_series = gbw['y']
complexity_series = gbw['language_complexity']
sns.lineplot(data=y_series)
sns.lineplot(data=complexity_series, color='#F29492')
plt.xticks([])
plt.ylabel(None)
plt.xlabel(None)

In [None]:
# Psychologie: y against title language complexity (rescaled once more) over time.
gbwf, gbwp, gbw = get_topic_day(df, 'Psychologie')

y_series = gbw['y']
complexity_scaled = mynorm(gbw['language_complexity'])
sns.lineplot(data=y_series)
sns.lineplot(data=complexity_scaled, color='#F29492')
plt.xticks([])

In [None]:
# Psychologie: y against title language complexity over time.
gbwf, gbwp, gbw = get_topic_day(df, 'Psychologie')

y_series = gbw['y']
complexity_series = gbw['language_complexity']
sns.lineplot(data=y_series)
sns.lineplot(data=complexity_series, color='#F29492')
plt.xticks([])

## Psychologie

In [None]:
# Psychologie: y against the body bias metric, each smoothed with an extra 5-row rolling mean.
gbwf, gbwp, gbw = get_topic_day(df, 'Psychologie')

y_rolled = gbw['y'].rolling(window=5).mean()
body_bias_rolled = gbw['body_bias_metric'].rolling(window=5).mean()
sns.lineplot(data=y_rolled)
sns.lineplot(data=body_bias_rolled)
plt.xticks([])

d

In [None]:
# Per-column correlation with y for free vs. paid articles; y's self-correlation is dropped.
free_profile = gbwf.corr().y.drop('y')
paid_profile = gbwp.corr().y.drop('y')
sns.lineplot(data=free_profile, marker='o')
sns.lineplot(data=paid_profile, marker='o')
plt.xticks(rotation=45)

In [None]:
# Paid articles: y against title language complexity over time.
y_series = gbwp['y']
complexity_series = gbwp['language_complexity']
sns.lineplot(data=y_series)
sns.lineplot(data=complexity_series)
plt.xticks([])

In [None]:
# All articles of the current topic: y against the body bias metric over time.
y_series = gbw['y']
body_bias_series = gbw['body_bias_metric']
sns.lineplot(data=y_series)
sns.lineplot(data=body_bias_series)
plt.xticks([])

In [None]:
# y against inversed veracity, each exponentially smoothed (alpha=0.1) and min-max scaled.
# The column is called 'inversed_veracity' since the rename in the load section;
# the old name 'seo_veracity' raised a KeyError here.
sns.lineplot(mynorm(gbw['y'].ewm(alpha=0.1).mean()))
sns.lineplot(mynorm(gbw['inversed_veracity'].ewm(alpha=0.1).mean()))


In [None]:
# Site-wide daily means: y against inversed veracity, heavily smoothed (alpha=0.01) and min-max scaled.
y_smoothed = mynorm(gbd_all['y'].ewm(alpha=0.01).mean())
veracity_smoothed = mynorm(gbd_all['inversed_veracity'].ewm(alpha=0.01).mean())
sns.lineplot(data=y_smoothed)
sns.lineplot(data=veracity_smoothed)


In [None]:
def bla(d, alpha=0.01):
    """Smooth `d` with an exponentially weighted mean, then min-max scale it.

    `alpha` is the smoothing factor handed to `ewm`; the default 0.01 smooths
    far more strongly than the alpha=0.1 version defined earlier, which this
    definition replaces for every later call (including those made inside
    `get_topic_day`).
    """
    return mynorm(d.ewm(alpha=alpha).mean())

In [None]:
# Smoothed and min-max scaled version of the site-wide daily means.
gbd_all_bla = bla(gbd_all)

In [None]:
# Correlation of every smoothed column with y, ascending.
gbd_all_bla.corr().y.sort_values()

In [None]:
# Drop outlier days (by y) and rescale y.
# NOTE(review): this overwrites its own input, so re-running the cell filters the
# already-filtered frame again; the correlation cell above sees the unfiltered version.
gbd_all_bla = remove_outliers(gbd_all_bla)

In [None]:
# Site-wide smoothed daily series: y against title language complexity.
y_series = gbd_all_bla['y']
complexity_series = gbd_all_bla['language_complexity']
sns.lineplot(data=y_series)
sns.lineplot(data=complexity_series, legend=True, color='#F29492')
plt.xticks([])
plt.ylabel('')
plt.xlabel('days')


In [None]:
# Scratch check of the kernel's working directory (the data path above is relative to it).
import os
os.getcwd()

In [None]:
# Linear fit of y on title language complexity for the site-wide smoothed daily
# series, both axes min-max scaled.
# The x-axis shows language_complexity, so it is labelled as such (the old label
# "bias of the article" was carried over from the subjective-bias plot).
sns.regplot(x=mynorm(gbd_all_bla['language_complexity']),y=mynorm(gbd_all_bla.y),order=1)
plt.ylabel('#pageviews')
plt.xlabel("language complexity of the title")

In [None]:
# NOTE(review): `blabla` is not defined anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel -- restore its definition or delete the cell.
blabla.columns

In [None]:
# y against the body bias score, skipping the first 10 rows.
# NOTE(review): depends on `blabla`, which is not defined anywhere in the visible notebook.
sns.lineplot(blabla['y'][10:])
sns.lineplot(blabla['body_bias_score'][10:])


In [None]:
# Article-level (not daily) smoothing of the free articles.
# Only the numeric columns are smoothed: an exponentially weighted mean is undefined
# for the text columns still present in df. The smoothed frame is computed once and
# reused for both lines.
# NOTE(review): rows are in df order, not sorted by date -- confirm that is intended.
free_smoothed = bla(df[df.access_level=='free'].select_dtypes('number'))
sns.lineplot(free_smoothed['y'])
sns.lineplot(free_smoothed['body_bias_score'])


In [None]:
# Daily mean y of paid articles, exponentially smoothed (alpha=0.1) and min-max scaled.
mynorm(gbdp_all['y'].ewm(alpha=0.1).mean())

In [None]:
# Correlation with y after smoothing the article-level rows. Restricted to numeric
# columns: smoothing and correlation are undefined for the text columns in df.
bla(df.select_dtypes('number')).corr().y.sort_values()

In [None]:
# Correlation of every smoothed site-wide daily column with y, ascending.
(bla(gbd_all)).corr().y.sort_values()

In [None]:
# Smoothed site-wide daily series with the first and last day trimmed: y against language complexity.
daily_smoothed = bla(gbd_all)
sns.lineplot(data=daily_smoothed['y'][1:-1])
sns.lineplot(data=daily_smoothed['language_complexity'][1:-1])
plt.xticks([])

In [None]:
# Correlation of every smoothed paid-article daily column with y, ascending.
(bla(gbdp_all)).corr().y.sort_values()

In [None]:
# NOTE(review): exact repeat of the previous cell -- possibly meant for the free articles (gbdf_all); confirm.
(bla(gbdp_all)).corr().y.sort_values()

In [None]:
# Correlation-with-y profile of the paid articles for each of the 14 largest
# channels, one column per channel.
topics = d.groupby('channel').count().sort_values(by='id').reset_index()['channel'][-14:]

paid_profiles = {}
for t in topics:
    paid_daily = get_topic_day(df, t)[1]
    paid_profiles[t] = paid_daily.corr().y
ddd = pd.DataFrame(paid_profiles)


In [None]:
# Mean absolute correlation with y across the columns of the site-wide paid daily means.
gbdp_all.corr().y.abs().mean()

In [None]:
# Mean absolute correlation over every (column, channel) cell; NaN if any cell is NaN.
# ndarray.mean() already averages over all elements, so the hard-coded
# reshape(12*14) -- which fails as soon as the frame has a different shape -- is dropped.
ddd.abs().to_numpy().mean()

In [None]:
# Same average with missing correlations counted as 0. The hard-coded
# reshape(12*14) is dropped; ndarray.mean() already averages over all elements.
ddd.abs().fillna(0).to_numpy().mean()

In [None]:
# 2x2 correlation matrix between the free and the paid correlation-with-y profiles (site-wide, smoothed daily means).
np.corrcoef(bla(gbdf_all).corr().y,bla(gbdp_all).corr().y)

In [None]:
# How similar the per-channel correlation profiles in `ddd` are to one another.
sns.heatmap(ddd.corr(),annot=True)

In [None]:
# Display the site-wide daily means.
gbd_all

## Fitting

In [None]:
# Columns used for model fitting. The names follow the rename done in the load
# section; the raw 'seo_*' / 'body_mb' / 'body_pers' names no longer exist in df
# and raised a KeyError here.
# NOTE(review): 'seo_title_multibias' has no counterpart in df; 'title_bias_metric'
# is assumed to be the intended column -- confirm.
d1 = df[['language_complexity', 'inversed_veracity', 'loaded_language', 'generalized_bias',
       'persuasivness', 'sentiment', 'subjective_bias', 'channel',
       'access_level', 'y', 'body_bias_score', 'body_persuasive_score',
       'title_bias_metric', 'body_bias_metric']]

In [None]:
# Channel label of every row in the modelling frame.
d1.channel

In [None]:
# Modelling data: Wissenschaft rows only, without the two categorical columns.
data=d1[d1.channel=='Wissenschaft'].drop(columns=['access_level','channel'])

In [None]:
from sklearn.tree import DecisionTreeRegressor
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_squared_error
from copy import deepcopy

# Min-max scaled predictors and target for the Wissenschaft subset.
features = mynorm(data.drop(columns=['y']))
labels = mynorm(data['y'])

# Try 1000 random 70/15/15 train/dev/test splits and keep the depth-3 tree with the
# best dev R^2.
# NOTE(review): picking the best of many random splits by dev score is optimistic;
# treat the printed test score of the winner as a rough indication only.
max_score = 0
best_clf = None
for i in tqdm(range(1000)):
    # Seed the splits with i as well, so every iteration is reproducible.
    X_train,X_rest,y_train,y_rest = train_test_split(features,labels,test_size=0.3,random_state=i)
    X_dev,X_test,y_dev,y_test = train_test_split(X_rest,y_rest,test_size=0.5,random_state=i)
    clf = DecisionTreeRegressor(max_depth=3,random_state=i)
    # Fit on the training part only. Fitting on the full data (as before) let the
    # model see the dev and test rows, so both scores were leaked.
    clf.fit(X_train,y_train)

    result = r2_score(y_dev,clf.predict(X_dev))
    if result > max_score:
        print(result)
        max_score = result
        best_clf = deepcopy(clf)
        print("on test",r2_score(y_test,clf.predict(X_test)))
            

    

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import LeaveOneOut

def LOOCV(clf,train_data):
    """Leave-one-out cross-validated R^2 of `clf` on `train_data`.

    Each row is predicted by a model fitted on all other rows; the R^2 is then
    computed once over the pooled predictions (it is not an average of
    per-fold scores, which would be undefined for single-row folds).

    Parameters
    ----------
    clf : estimator with `fit` / `predict`
        Refitted in place on every fold; after the call it holds the model of
        the last fold.
    train_data : pandas.DataFrame
        Must contain the target column `y`; all other columns are predictors.

    Returns
    -------
    float
        R^2 between the held-out targets and their predictions.
    """
    X = train_data.drop(columns=["y"])
    y = train_data.y

    preds = []
    gt = []
    # split() is a generator without a length, so tell tqdm how many folds to expect
    # (leave-one-out yields one fold per row).
    for train_index, test_index in tqdm(LeaveOneOut().split(X), total=len(X)):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        clf.fit(X_train, y_train)
        # Each fold holds out exactly one row: store plain scalars so r2_score
        # receives two flat sequences.
        preds.append(clf.predict(X_test)[0])
        gt.append(y_test.iloc[0])

    return r2_score(gt,preds)

In [None]:
# Explicit seed instead of the loop variable `i` leaked from the split-search cell
# (999 is the value it has after that loop finishes), so this cell no longer
# depends on that loop having run.
# NOTE(review): `gbwp` here is whatever the last get_topic_* call produced; the
# weekly Ausland frames are only built in the next cell -- confirm the intended input.
clf = DecisionTreeRegressor(max_depth=3,random_state=999)
LOOCV(clf,gbwp)


In [None]:
# Weekly free / paid / all frames for the Ausland channel.
gbwf,gbwp,gbw = get_topic_week(df,'Ausland')

In [None]:
# Display the weekly frame of free Ausland articles.
gbwf