# Visualizations of the Wikipedia Talk Pages corpus

Source: https://convokit.cornell.edu/documentation/wiki.html


## Import modules

In [None]:
# Environment setup: install ConvoKit and fetch the project repository.
# NOTE(review): the later `from script.data_viz.utils import *` presumably
# needs the cloned repository's root as the working directory (or on
# sys.path) — confirm, e.g. when running on a fresh hosted runtime.
!pip install convokit
!git clone https://github.com/jbruneaubongard/linguistic_patterns_wikipedia.git

In [None]:
from convokit import Corpus, download
from datetime import datetime, timedelta

from script.data_viz.utils import *

## Preprocessing

In [None]:
# Load the corpus and flatten its utterances into a DataFrame.
# NOTE(review): `pd` is not imported explicitly in this notebook; it is
# presumably provided by the star import from script.data_viz.utils — confirm.
corpus = Corpus(filename=download('wiki-corpus'))
df_utt = corpus.get_utterances_dataframe()

# Cast timestamps to float in one vectorized step (no per-row lambda)
df_utt['timestamp'] = df_utt['timestamp'].astype(float)

# Convert timestamp (seconds since the epoch) to datetime
df_utt["date"] = pd.to_datetime(df_utt["timestamp"], unit='s')

# Delete data with inconsistent date: keep only years 2001-2013 inclusive.
# The year is extracted once and reused for both bounds.
utt_year = df_utt['date'].dt.year
df_utt = df_utt[(utt_year < 2014) & (utt_year > 2000)]

# Dataframes of admin / non-admin, split with a single boolean mask
is_admin = df_utt['meta.is-admin'].astype(bool)
df_admin = df_utt[is_admin]
df_non_admin = df_utt[~is_admin]

## Histogram of utterances

In [None]:
# Date range and bin width passed to the admin / non-admin histogram.
# NOTE(review): the bin width is presumably expressed in months, as the
# variable name suggests — confirm against get_histogram.
start_date, end_date = '2003-04-01', '2011-08-09'
bin_size_months = 3

get_histogram(
    df_admin,
    df_non_admin,
    start_date,
    end_date,
    bin_size_months,
)

## Sankey diagram for speakers

In [None]:
# Date range passed to the get_sankey_plot calls in the cells below.
start_date, end_date = '2003-04-01', '2011-08-09'

In [None]:
# Sankey diagram for speakers over the selected date range.
# NOTE(review): 92 is presumably a period length in days (about three
# months) — confirm against get_sankey_plot.
get_sankey_plot(
    df_utt,
    start_date,
    end_date,
    92,
    [1, 2, 3, 4, 5],
)

In [None]:
# Same Sankey diagram with a longer fourth argument.
# NOTE(review): 183 is presumably a period length in days (about six
# months) — confirm against get_sankey_plot.
get_sankey_plot(
    df_utt,
    start_date,
    end_date,
    183,
    [1, 2, 3, 4, 5],
)