In [1]:
import dask
import dask.dataframe as ddf
import os
from glob import glob
import re
import pandas as pd
import numpy as np
import time
import mwparserfromhell
import string
import lda
from sklearn.feature_extraction.text import CountVectorizer
from numpy import savetxt
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from itertools import combinations, product, combinations_with_replacement
import functools

  import pandas.util.testing as tm


In [2]:
parquetdir = '../../tcm-columns-add-main/'

In [3]:
page_title = ddf.read_parquet(os.path.join(parquetdir,'page.title')).compute().drop(columns={'dir0'})
# for each article, get the most recent content
revision_text = ddf.read_parquet(os.path.join(parquetdir,'revision.text')).compute().drop(columns={'dir0'})
timestamp = ddf.read_parquet(os.path.join(parquetdir,'revision.timestamp')).compute().drop(columns={'dir0'})
page_info = page_title.join(revision_text).join(timestamp)

# Get and process the most curent content for each page

In [4]:
max_index = page_info.groupby('page.title').agg({'revision.timestamp':'max'})

In [5]:
current_page = page_info.merge(max_index.reset_index(),on=['page.title','revision.timestamp'])

In [6]:
current_page = current_page.drop(columns={'revision.timestamp'})

In [7]:
def is_talk(text):
    return re.search('Talk:', text) != None

current_page['is_talk'] = current_page['page.title'].apply(is_talk)

In [8]:
current_articles = current_page[current_page['is_talk'] == False]
current_articles = current_articles.drop(columns={'is_talk'})

In [9]:
def get_content(text):
    parsed_wikicode = mwparserfromhell.parse(text)
    text = parsed_wikicode.strip_code()
    # replace hyphen with space
    text = text.replace('-',' ') 
    text = [t for t in text if t not in string.punctuation]
    return ''.join([i for i in text if not i.isdigit()])

In [10]:
current_articles['processed_text'] = current_articles['revision.text'].map(get_content)

# create lda model

In [12]:
n_topics = 100
model = lda.LDA(n_topics=n_topics, n_iter=100,alpha=50/n_topics,eta=0.1)

In [13]:
def my_tokenizer(text):
    text = text.replace('\n','')
    text = re.split(r'([a-zA-Z1-9]+)', text)
    # remove words whose length is less than 3
    text = [t.strip() for t in text if len(t.strip()) > 2]
    return text

In [14]:
vectorizer = CountVectorizer(min_df=5/len(current_articles),stop_words='english',tokenizer=lambda text: my_tokenizer(text))

In [15]:
content = current_articles['processed_text'].to_list()

In [16]:
X = vectorizer.fit_transform(content)

In [17]:
vocab = vectorizer.get_feature_names()



In [18]:
model.fit(X)

INFO:lda:n_documents: 459
INFO:lda:vocab_size: 6709
INFO:lda:n_words: 234114
INFO:lda:n_topics: 100
INFO:lda:n_iter: 100
INFO:lda:<0> log likelihood: -3152707
INFO:lda:<10> log likelihood: -2176113
INFO:lda:<20> log likelihood: -2094866
INFO:lda:<30> log likelihood: -2072047
INFO:lda:<40> log likelihood: -2059507
INFO:lda:<50> log likelihood: -2052623
INFO:lda:<60> log likelihood: -2046361
INFO:lda:<70> log likelihood: -2044037
INFO:lda:<80> log likelihood: -2041060
INFO:lda:<90> log likelihood: -2036966
INFO:lda:<99> log likelihood: -2034937


<lda.lda.LDA at 0x7f7e20fa1580>

In [19]:
topic_word = model.topic_word_

In [20]:
n_top_words = 10

In [21]:
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: zhang wang day journal sichuan having district myths county given
Topic 1: legs water group company brand cooling singapore ken wen product
Topic 2: tea sinensis leaves small chinese assam leaf camellia caffeine teas
Topic 3: cultivation says practice zhuan teachings universe human body science ownby
Topic 4: ginkgo trees tree short years fossil shoots slow living environments
Topic 5: skin powder pearl blister beetles preparations beetle book aphrodisiac treat
Topic 6: acupuncture medicine oriental certification commission national state nccaom professional program
Topic 7: deer penis velvet spray new thought annually consumed athletes banned
Topic 8: seeds crop seed bean lotus cultivated india nepal foods various
Topic 9: falun gong china practitioners chinese communist practice qigong persecution rights
Topic 10: goji snake products united traditional barbarum food chinese china fda
Topic 11: vine honeysuckle silver alternative glycosides herb significantly slightly continu

In [22]:
doc_topic = model.doc_topic_

In [23]:
savetxt('../../intermediate-result/TCM/topic_distribution.csv', doc_topic, delimiter=',')

In [24]:
titles = current_articles['page.title'].to_list()

In [25]:
with open("../../intermediate-result/TCM/titles.txt", "wb") as fp:
    pickle.dump(titles, fp)

# get cc score

In [27]:
c = pd.read_parquet('../../result/TCM/TCM-CScore-article.parquet')['estimated_crc'].to_dict()
c_n = pd.read_parquet('../../result/TCM/TCM-normalized-CScore-article.parquet')['estimated_crc_normalized'].to_dict()

In [28]:
article_info = pd.read_parquet('../../intermediate-result/TCM/TCM-article-info.parquet')
revision_count = article_info['revision.count'].to_dict()

In [29]:
# use sig contributions

In [30]:
n = len(titles)
page_similarity = np.zeros([n,n])
for i in range(len(doc_topic)):
    topic = doc_topic[i]
    page_similarity[i,:] = cosine_similarity([topic],doc_topic)[0]

In [31]:
page_similarity_df = pd.DataFrame(page_similarity,index=titles,columns=titles)

In [32]:
user_contribution = pd.read_parquet('../../intermediate-result/TCM/sig-contrib-info')

In [33]:
def is_talk(text):
    return re.search('Talk:', text) != None

In [34]:
user_contribution = user_contribution[~user_contribution['page.title'].apply(is_talk)]

In [35]:
user_contribution = user_contribution.set_index(['contributor.username','page.title'])

In [36]:
user_contribution = user_contribution.groupby(level=0).apply(lambda df: df.xs(df.name).to_dict()['sig.contributions']).to_dict()

In [37]:
l = pd.DataFrame(user_contribution).fillna(0).T
ln = pd.DataFrame(user_contribution).fillna(0).T

In [38]:
for title in l.columns:
    l[title] = l[title] / revision_count[title] * c[title]
    ln[title] = ln[title] / revision_count[title] * c_n[title]

In [39]:
clust_n = {}
clust_d = {}
clust_n_norm = {}
clust_d_norm = {}
def compute_clust(t_pairs,title,user):        
    n = l[t_pairs[0]][user] * l[t_pairs[1]][user] * page_similarity_df[title][t_pairs[0]] \
        * page_similarity_df[title][t_pairs[1]] * page_similarity_df[t_pairs[0]][t_pairs[1]]
    d = l[t_pairs[0]][user] * l[t_pairs[1]][user] * page_similarity_df[title][t_pairs[0]] \
        * page_similarity_df[title][t_pairs[1]]
    
    n_norm = ln[t_pairs[0]][user] * ln[t_pairs[1]][user] * page_similarity_df[title][t_pairs[0]] \
        * page_similarity_df[title][t_pairs[1]] * page_similarity_df[t_pairs[0]][t_pairs[1]]
    d_norm = ln[t_pairs[0]][user] * ln[t_pairs[1]][user] * page_similarity_df[title][t_pairs[0]] \
        * page_similarity_df[title][t_pairs[1]]
    if title not in clust_n:
        clust_n[title] = n
        clust_d[title] = d
        clust_n_norm[title] = n_norm
        clust_d_norm[title] = d_norm
    else:
        clust_n[title] += n
        clust_d[title] += d
        
        clust_n_norm[title] += n_norm
        clust_d_norm[title] += d_norm

In [40]:
selected_editors = pd.read_parquet('../../intermediate-result/TCM/editors-with-sig-contrib-at-least-10').index

In [41]:
cc_score = {}
cc_score_norm = {}
for user in selected_editors:
    # user does not contribute to page
    if user not in user_contribution:
        cc_score[user] = 0
        cc_score_norm[user] = 0
        continue
    titles_of_user = l.loc[user]
    # only look at titles with l greater than 0, since if l is 0, then product is also 0
    # so it does not affect the cc score of current user
    titles_of_user = titles_of_user[titles_of_user > 0]

    # get all possible pairs of articles
    all_possible_combinations  = [p for p in product(titles_of_user.index, repeat=2)]
    for title in titles_of_user.index:     
        compute_title = functools.partial(compute_clust,title = title,user = user)
        list(map(compute_title,all_possible_combinations))

        
    clust = {k: float(clust_n[k])/clust_d[k] for k in clust_n}


    clust_df = pd.DataFrame(clust.items(),columns=['title','clust']).set_index('title')
    clust_df['cc'] = clust_df['clust'] * l.loc[user][clust_df.index]
    cc_score[user] = clust_df['cc'].sum()

    clust_n = {}
    clust_d = {}
    
    
    
    clust_norm = {k: float(clust_n_norm[k])/clust_d_norm[k] for k in clust_n_norm}


    clust_norm_df = pd.DataFrame(clust_norm.items(),columns=['title','clust']).set_index('title')
    clust_norm_df['cc_norm'] = clust_norm_df['clust'] * ln.loc[user][clust_df.index]
    cc_score_norm[user] = clust_norm_df['cc_norm'].sum()
    
    clust_n_norm = {}
    clust_d_norm = {}


In [42]:
cc_score_df = pd.DataFrame(cc_score.items(), columns=['contributor', 'cc_score']).set_index('contributor')
cc_score_norm_df = pd.DataFrame(cc_score_norm.items(), columns=['contributor', 'cc_score_norm']).set_index('contributor')

In [44]:
cc_score_df.to_csv("../../result/TCM/TCM-CCScore-user.tsv", sep="\t",encoding='utf-16')

In [45]:
cc_score_norm_df.to_csv("../../result/TCM/TCM-normalized-CCScore-user.tsv", sep="\t",encoding='utf-16')

In [46]:
cc_score_df.to_parquet('../../result/TCM/TCM-CC-Score')

In [47]:
cc_score_norm_df.to_parquet('../../result/TCM/TCM-normalized-CC-Score')