In [1]:
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
articles = pd.read_csv('../data/articles.csv')
df = pd.read_csv('../data/articles-with-topics.csv')

In [3]:
articles['time'] = pd.to_datetime(articles['time'])
df['time'] = pd.to_datetime(df['time'])

In [4]:
df.head()

Unnamed: 0,by,descendants,id,kids,score,time,title,type,url,topic
0,todsacerdoti,607,24563698,24563821 24563987 24565829 24564488 24563843 2...,711,2020-09-23 08:38:33,Firefox usage is down 85% despite Mozilla's to...,story,http://calpaterson.com/mozilla.html,People words
1,themodelplumber,126,24562850,24563705 24563838 24563276 24563478 24565992 2...,310,2020-09-23 06:04:45,"How to say no, for the people pleaser who alwa...",story,https://www.npr.org/2020/09/15/913207918/how-t...,Linux
2,luu,27,24553085,24563230 24562887 24562049 24564895 24562235 2...,227,2020-09-22 13:04:22,Small Tech,story,https://scattered-thoughts.net/writing/small-t...,People and ideas
3,pseudolus,56,24553369,24563122 24565813 24562283 24562375 24563752 2...,94,2020-09-22 13:47:02,You Talkin’ to Me? The Unruly History of New Y...,story,https://literaryreview.co.uk/tawk-of-the-town,"Literature, culture, philosophy"
4,jeanyang,24,24561119,24561495 24562538 24565366 24564334 24561573 2...,87,2020-09-23 01:13:58,Catch Breaking Changes by Diffing API Traffic,story,https://www.akitasoftware.com/blog/2020/9/22/f...,People and ideas


# Number of articles in time

In [57]:
plot_data_articles = articles.groupby([pd.Grouper(key = 'time', freq = 'M')])['score'].count().reset_index()
plot_data_articles_topic = df.groupby([pd.Grouper(key = 'time', freq = 'M')])['score'].count().reset_index()

fig = go.Figure()

fig.add_trace(
    go.Scatter(
        x = plot_data_articles['time'],
        y = plot_data_articles['score'],
        name = 'Articles'
    )
)

fig.add_trace(
    go.Scatter(
        x = plot_data_articles_topic['time'],
        y = plot_data_articles_topic['score'],
        name = 'Articles with content'
    )
)

fig.update_layout(
    title = 'Number of articles in time'
)

fig.write_image('results/plots-articles-with-topics/articles-in-time.png')

# Number of articles per topic

In [56]:
plot_data = df['topic'].value_counts().reset_index()

fig = px.bar(
    plot_data,
    x = 'index',
    y = 'topic',
    title = 'Number of articles per topic'
)

fig.update_layout(
    xaxis = {
        'title': 'Topic',
        'categoryorder': 'total descending'
    },
    yaxis = {
        'title': 'Number of articles'
    }
)

fig.write_image('results/plots-articles-with-topics/article-number-topic.png')

# Avg. number of upvotes by topic

In [37]:
plot_data = df.groupby(['topic'])['score'].mean().reset_index()

fig = px.bar(
    plot_data,
    x = 'topic',
    y = 'score',
    title = 'Average number of upvotes by topic'
)

fig.update_layout(
    xaxis = {
        'title': 'Topic',
        'categoryorder': 'total descending'
    },
    yaxis = {
        'title': 'Average number of upvotes'
    }
)

fig.write_image('results/plots-articles-with-topics/avg-upvotes.png')

# Avg. number of comments by topic

In [38]:
plot_data = df.groupby(['topic'])['descendants'].mean().reset_index()

fig = px.bar(
    plot_data,
    x = 'topic',
    y = 'descendants',
    title = 'Average number of comments by topic'
)

fig.update_layout(
    xaxis = {
        'title': 'Topic',
        'categoryorder': 'total descending'
    },
    yaxis = {
        'title': 'Average number of comments'
    }
)

fig.write_image('results/plots-articles-with-topics/avg-comments.png')

In [68]:
df['by'].value_counts().head(20)

todsacerdoti    82
pseudolus       79
luu             57
rbanffy         54
ingve           49
tosh            47
bookofjoe       45
hhs             43
lelf            37
dnetesn         34
zdw             31
prostoalex      31
apsec112        27
caution         27
pcr910303       26
polm23          26
samizdis        24
doener          23
Hooke           22
fortran77       21
Name: by, dtype: int64

In [85]:
df.groupby(['by'])['topic'].nunique().reset_index()['topic'].value_counts()

1     2619
2      338
3      117
4       52
5       25
6       19
7       18
9       10
8        9
12       5
10       5
13       4
11       4
15       1
16       1
14       1
17       1
Name: topic, dtype: int64

In [86]:
df

Unnamed: 0,by,descendants,id,kids,score,time,title,type,url,topic
0,todsacerdoti,607,24563698,24563821 24563987 24565829 24564488 24563843 2...,711,2020-09-23 08:38:33,Firefox usage is down 85% despite Mozilla's to...,story,http://calpaterson.com/mozilla.html,People words
1,themodelplumber,126,24562850,24563705 24563838 24563276 24563478 24565992 2...,310,2020-09-23 06:04:45,"How to say no, for the people pleaser who alwa...",story,https://www.npr.org/2020/09/15/913207918/how-t...,Linux
2,luu,27,24553085,24563230 24562887 24562049 24564895 24562235 2...,227,2020-09-22 13:04:22,Small Tech,story,https://scattered-thoughts.net/writing/small-t...,People and ideas
3,pseudolus,56,24553369,24563122 24565813 24562283 24562375 24563752 2...,94,2020-09-22 13:47:02,You Talkin’ to Me? The Unruly History of New Y...,story,https://literaryreview.co.uk/tawk-of-the-town,"Literature, culture, philosophy"
4,jeanyang,24,24561119,24561495 24562538 24565366 24564334 24561573 2...,87,2020-09-23 01:13:58,Catch Breaking Changes by Diffing API Traffic,story,https://www.akitasoftware.com/blog/2020/9/22/f...,People and ideas
...,...,...,...,...,...,...,...,...,...,...
5907,laurex,7,24529486,24529930 24529758 24534269,19,2020-09-19 20:29:33,PalanThiel: The Uncola,story,https://www.profgalloway.com/palanthiel-the-un...,Software
5908,anarbadalov,11,24486664,24488089 24487725 24489507 24489373 24488362,49,2020-09-15 23:01:01,The Magical Art of Selling Soap,story,https://www.laphamsquarterly.org/roundtable/ma...,"Literature, culture, philosophy"
5909,herval,171,23543043,23545567 23545267 23545039 23543552 23544350 2...,804,2020-06-16 21:26:47,Drive through cities in the browser while list...,story,https://driveandlisten.herokuapp.com/,JavaScript
5910,mmastrac,21,23466431,23466816 23467054 23468305 23467350 23471899 2...,111,2020-06-09 15:23:35,Microship.com: The Library of Technomadics,story,https://microship.com/,Wordpress


In [147]:
unique_authors_topics = df[['by', 'topic']].drop_duplicates()

In [148]:
unique_authors_topics

Unnamed: 0,by,topic
0,todsacerdoti,People words
1,themodelplumber,Linux
2,luu,People and ideas
3,pseudolus,"Literature, culture, philosophy"
4,jeanyang,People and ideas
...,...,...
5904,palo3,Linux
5905,cube00,Compilers
5907,laurex,Software
5909,herval,JavaScript


In [151]:
authors_number_different_topics = unique_authors_topics.groupby(['by'])['topic'].nunique().reset_index()

In [155]:
authors_with_one_topic = authors_number_different_topics[authors_number_different_topics['topic'] == 1]['by']
authors_with_more_topics = authors_number_different_topics[authors_number_different_topics['topic'] > 1]['by']

# Number of people with one / one + topics

In [167]:
authors_with_one_topic.shape

(2619,)

In [168]:
authors_with_more_topics.shape

(610,)

# Most popular single topic

In [164]:
unique_authors_topics[unique_authors_topics['by'].isin(authors_with_one_topic)]['topic'].value_counts()

People and ideas                   282
Github repositories                230
Deployments and databases          207
Compilers                          202
US Politics and administration     149
Wordpress                          147
Weird letters                      143
Literature, culture, philosophy    143
Linux                              127
Software                           125
JavaScript                         122
People words                       112
Machine Learning                   102
Weird words                         98
Hardware                            89
Medicine                            81
Government committees               70
Renewable energy                    69
Texts with codes                    66
Programming words                   55
Name: topic, dtype: int64

# Most popular combination topic

In [166]:
a = pd.crosstab(unique_authors_topics['by'], unique_authors_topics['topic'])

array(['People words', 'Linux', 'People and ideas',
       'Literature, culture, philosophy', 'Compilers',
       'US Politics and administration', 'Wordpress', 'Texts with codes',
       'Software', 'Renewable energy', 'JavaScript', 'Machine Learning',
       'Deployments and databases', 'Medicine', 'Weird words',
       'Government committees', 'Github repositories',
       'Programming words', 'Hardware', 'Weird letters'], dtype=object)

In [211]:
for col_1, col_2 in combinations(a.columns, 2):
    print(col_1, col_2)

Compilers Deployments and databases
Compilers Github repositories
Compilers Government committees
Compilers Hardware
Compilers JavaScript
Compilers Linux
Compilers Literature, culture, philosophy
Compilers Machine Learning
Compilers Medicine
Compilers People and ideas
Compilers People words
Compilers Programming words
Compilers Renewable energy
Compilers Software
Compilers Texts with codes
Compilers US Politics and administration
Compilers Weird letters
Compilers Weird words
Compilers Wordpress
Deployments and databases Github repositories
Deployments and databases Government committees
Deployments and databases Hardware
Deployments and databases JavaScript
Deployments and databases Linux
Deployments and databases Literature, culture, philosophy
Deployments and databases Machine Learning
Deployments and databases Medicine
Deployments and databases People and ideas
Deployments and databases People words
Deployments and databases Programming words
Deployments and databases Renewable ener

# Recommendations

In [209]:
for column in a.columns:
    res = a[a[column] == 1].drop([column], axis = 1).sum().sort_values().index[-1]
    print(f'{column} - {res}')

Compilers - People and ideas
Deployments and databases - People and ideas
Github repositories - People and ideas
Government committees - Wordpress
Hardware - Compilers
JavaScript - Wordpress
Linux - Wordpress
Literature, culture, philosophy - US Politics and administration
Machine Learning - People and ideas
Medicine - Literature, culture, philosophy
People and ideas - Compilers
People words - People and ideas
Programming words - US Politics and administration
Renewable energy - US Politics and administration
Software - US Politics and administration
Texts with codes - US Politics and administration
US Politics and administration - Literature, culture, philosophy
Weird letters - US Politics and administration
Weird words - US Politics and administration
Wordpress - People and ideas


In [179]:
unique_authors_topics[unique_authors_topics['by'].isin(authors_with_more_topics)]

Unnamed: 0,by,topic
0,todsacerdoti,People words
2,luu,People and ideas
3,pseudolus,"Literature, culture, philosophy"
5,beagle3,Compilers
6,sohkamyung,US Politics and administration
...,...,...
5889,seigando,Programming words
5893,Vigier,JavaScript
5899,ZnZirconium,Linux
5900,apsec112,Renewable energy


In [None]:
unique_authors_topics[unique_authors_topics['by'].isin(authors_with_more_topics)]['topic'].value_counts()

In [129]:
special_df = df.drop_duplicates(['topic', 'by'])

df_temp = special_df.groupby(['by'])['topic'].nunique()
authors_with_more_than_two_topics = (df_temp[df_temp > 1]).index.values
articles_with_authors_with_two_topics = special_df[special_df['by'].isin(authors_with_more_than_two_topics)]

In [130]:
from itertools import combinations

In [131]:
results = articles_with_authors_with_two_topics.groupby(['by'])['topic'].apply(lambda x: list(combinations(x, 2)))

In [132]:
all_bigrams_of_topics = [', '.join(el) for author in results.values for el in author]

In [138]:
plot_data = pd.Series(all_bigrams_of_topics).value_counts().head(10).reset_index()

In [139]:
plot_data

Unnamed: 0,index,0
0,"Compilers, People and ideas",29
1,"US Politics and administration, Weird letters",26
2,"People and ideas, Wordpress",26
3,"US Politics and administration, Literature, cu...",26
4,"Literature, culture, philosophy, Wordpress",25
5,"US Politics and administration, Software",23
6,"US Politics and administration, Wordpress",23
7,"US Politics and administration, People and ideas",22
8,"US Politics and administration, Medicine",22
9,"Linux, Wordpress",21


In [143]:
fig = px.bar(
    plot_data,
    x = 0,
    y = 'index',
    orientation='h'
)

fig.show()

In [44]:
plot_data = df.groupby([pd.Grouper(key = 'time', freq = 'M'), 'topic'])['score'].sum().reset_index()

fig = px.line(
    x = plot_data['time'],
    y = plot_data['score'],
    color = plot_data['topic'],
    title = 'Overall number of upvotes in time'
)

fig.show()

In [45]:
plot_data = df.groupby([pd.Grouper(key = 'time', freq = 'M'), 'topic'])['score'].mean().reset_index()

fig = px.line(
    x = plot_data['time'],
    y = plot_data['score'],
    color = plot_data['topic'],
    title = 'Average number of upvotes in time'
)

fig.show()