In [254]:
import warnings
# Install a filter that ignores every warning raised after this point,
# presumably to keep cell output free of library notices.
# NOTE(review): this also hides deprecation warnings that would signal
# breakage after a dependency upgrade — consider narrowing the filter.
warnings.filterwarnings('ignore')

from itertools import combinations

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Read the two input tables, in this order: the articles table, and the
# articles table that additionally carries a `topic` column.
articles, df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/articles.csv', '../data/articles-with-topics.csv')
)

In [3]:
# Convert the `time` column of both tables to datetimes so the rows can be
# grouped by calendar period further down.
for frame in (articles, df):
    frame['time'] = pd.to_datetime(frame['time'])

In [4]:
# Preview the first rows of the topic-annotated table.
df.head()

Unnamed: 0,by,descendants,id,kids,score,time,title,type,url,topic
0,todsacerdoti,607,24563698,24563821 24563987 24565829 24564488 24563843 2...,711,2020-09-23 08:38:33,Firefox usage is down 85% despite Mozilla's to...,story,http://calpaterson.com/mozilla.html,People words
1,themodelplumber,126,24562850,24563705 24563838 24563276 24563478 24565992 2...,310,2020-09-23 06:04:45,"How to say no, for the people pleaser who alwa...",story,https://www.npr.org/2020/09/15/913207918/how-t...,Linux
2,luu,27,24553085,24563230 24562887 24562049 24564895 24562235 2...,227,2020-09-22 13:04:22,Small Tech,story,https://scattered-thoughts.net/writing/small-t...,People and ideas
3,pseudolus,56,24553369,24563122 24565813 24562283 24562375 24563752 2...,94,2020-09-22 13:47:02,You Talkin’ to Me? The Unruly History of New Y...,story,https://literaryreview.co.uk/tawk-of-the-town,"Literature, culture, philosophy"
4,jeanyang,24,24561119,24561495 24562538 24565366 24564334 24561573 2...,87,2020-09-23 01:13:58,Catch Breaking Changes by Diffing API Traffic,story,https://www.akitasoftware.com/blog/2020/9/22/f...,People and ideas


# Number of articles over time

In [57]:
def _monthly_article_counts(frame):
    """Count the articles of `frame` per calendar month.

    Rows are binned by month on the `time` column and the non-null `score`
    values in each bin are counted, so the returned `score` column holds a
    number of articles, not a score.
    NOTE(review): rows with a missing `score` are not counted — confirm that
    `score` is never null, otherwise count rows instead.
    """
    return frame.groupby([pd.Grouper(key = 'time', freq = 'M')])['score'].count().reset_index()


plot_data_articles = _monthly_article_counts(articles)
plot_data_articles_topic = _monthly_article_counts(df)

fig = go.Figure()

# One line per table; both traces share the same x/y column layout.
for trace_name, monthly_counts in (
    ('Articles', plot_data_articles),
    ('Articles with content', plot_data_articles_topic),
):
    fig.add_trace(
        go.Scatter(
            x = monthly_counts['time'],
            y = monthly_counts['score'],
            name = trace_name
        )
    )

# Label both axes so the exported image can be read on its own, like the
# other figures written by this notebook.
fig.update_layout(
    title = 'Number of articles in time',
    xaxis = {
        'title': 'Month'
    },
    yaxis = {
        'title': 'Number of articles'
    }
)

fig.write_image('results/plots-articles-with-topics/articles-in-time.png')

# Number of articles per topic

In [56]:
# Count the articles assigned to each topic.
# Both output columns are named explicitly because the labels produced by
# `value_counts().reset_index()` depend on the pandas version: releases before
# 2.0 yield `index`/`topic`, while 2.x yields `topic`/`count`.
plot_data = (
    df['topic']
    .value_counts()
    .rename_axis('topic')
    .reset_index(name = 'n_articles')
)

fig = px.bar(
    plot_data,
    x = 'topic',
    y = 'n_articles',
    title = 'Number of articles per topic'
)

# Show the topics from the most to the least frequent and label both axes.
fig.update_layout(
    xaxis = {
        'title': 'Topic',
        'categoryorder': 'total descending'
    },
    yaxis = {
        'title': 'Number of articles'
    }
)

fig.write_image('results/plots-articles-with-topics/article-number-topic.png')

# Avg. number of upvotes by topic

In [37]:
# Mean `score` of the articles in each topic, one row per topic.
mean_score_by_topic = df.groupby(['topic'])['score'].mean()
plot_data = mean_score_by_topic.reset_index()

fig = px.bar(plot_data, x = 'topic', y = 'score', title = 'Average number of upvotes by topic')

# Sort the bars from the highest to the lowest average and label both axes.
axis_layout = {
    'xaxis': {'title': 'Topic', 'categoryorder': 'total descending'},
    'yaxis': {'title': 'Average number of upvotes'},
}
fig.update_layout(**axis_layout)

fig.write_image('results/plots-articles-with-topics/avg-upvotes.png')

# Avg. number of comments by topic

In [263]:
# Mean `descendants` value of the articles in each topic, one row per topic;
# the figure presents it as the average number of comments.
mean_descendants_by_topic = df.groupby(['topic'])['descendants'].mean()
plot_data = mean_descendants_by_topic.reset_index()

fig = px.bar(plot_data, x = 'topic', y = 'descendants', title = 'Average number of comments by topic')

# Sort the bars from the highest to the lowest average and label both axes.
axis_layout = {
    'xaxis': {'title': 'Topic', 'categoryorder': 'total descending'},
    'yaxis': {'title': 'Average number of comments'},
}
fig.update_layout(**axis_layout)

fig.write_image('results/plots-articles-with-topics/avg-comments.png')

# Number of authors with one topic vs. more than one topic

In [147]:
# One row per distinct (author, topic) pair; surviving rows keep their
# original row labels.
author_topic_columns = ['by', 'topic']
unique_authors_topics = df.loc[:, author_topic_columns].drop_duplicates()

In [148]:
# Display the distinct (author, topic) pairs.
unique_authors_topics

Unnamed: 0,by,topic
0,todsacerdoti,People words
1,themodelplumber,Linux
2,luu,People and ideas
3,pseudolus,"Literature, culture, philosophy"
4,jeanyang,People and ideas
...,...,...
5904,palo3,Linux
5905,cube00,Compilers
5907,laurex,Software
5909,herval,JavaScript


In [151]:
# Number of distinct topics per author, as a two-column table (`by`, `topic`)
# in which `topic` now holds the count rather than a topic name.
topics_per_author = unique_authors_topics.groupby('by')['topic'].nunique()
authors_number_different_topics = topics_per_author.reset_index()

In [155]:
# Split the author names by how many distinct topics each author posted in:
# exactly one versus more than one.
n_topics = authors_number_different_topics['topic']
authors_with_one_topic = authors_number_different_topics.loc[n_topics == 1, 'by']
authors_with_more_topics = authors_number_different_topics.loc[n_topics > 1, 'by']

In [167]:
# Number of authors whose articles all fall into a single topic.
authors_with_one_topic.shape

(2619,)

In [168]:
# Number of authors whose articles span more than one topic.
authors_with_more_topics.shape

(610,)

# Most popular single topic

In [164]:
# Restrict to authors with exactly one topic, then count how many such
# authors each topic has.
is_single_topic_author = unique_authors_topics['by'].isin(authors_with_one_topic)
unique_authors_topics.loc[is_single_topic_author, 'topic'].value_counts()

People and ideas                   282
Github repositories                230
Deployments and databases          207
Compilers                          202
US Politics and administration     149
Wordpress                          147
Weird letters                      143
Literature, culture, philosophy    143
Linux                              127
Software                           125
JavaScript                         122
People words                       112
Machine Learning                   102
Weird words                         98
Hardware                            89
Medicine                            81
Government committees               70
Renewable energy                    69
Texts with codes                    66
Programming words                   55
Name: topic, dtype: int64

# Most popular combination of topics

In [242]:
# Author-by-topic table: one row per author, one column per topic. Each cell
# counts the matching rows of `unique_authors_topics`, whose (author, topic)
# pairs are distinct, so every cell is either 0 or 1.
a = pd.crosstab(
    index = unique_authors_topics['by'],
    columns = unique_authors_topics['topic']
)

In [243]:
# For every unordered pair of topic columns in `a`, count the authors whose
# row holds a 1 in both columns, i.e. authors who posted in both topics.
pair_counts = [
    (col_1, col_2, np.logical_and(a[col_1] == 1, a[col_2] == 1).sum())
    for col_1, col_2 in combinations(a.columns, 2)
]

# Column labels carry no surrounding whitespace, so each column can be
# selected by the exact name that is displayed.
results = pd.DataFrame(pair_counts, columns=['Topic 1', 'Topic 2', 'Users'])

In [244]:
# The three topic pairs shared by the most authors, largest last.
pairs_by_users = results.sort_values(by = 'Users')
pairs_by_users.tail(3)

Unnamed: 0,Topic 1,Topic 2,Users
153,People and ideas,Wordpress,43
120,"Literature, culture, philosophy",US Politics and administration,47
9,Compilers,People and ideas,48


# Recommendations

In [209]:
# For each topic, look only at the authors who posted in it, total their
# activity in every other topic, and print the other topic with the largest
# total.
for column in a.columns:
    authors_of_topic = a.loc[a[column] == 1]
    other_topic_totals = authors_of_topic.drop(columns = [column]).sum()
    res = other_topic_totals.sort_values().index[-1]
    print(f'{column} - {res}')

Compilers - People and ideas
Deployments and databases - People and ideas
Github repositories - People and ideas
Government committees - Wordpress
Hardware - Compilers
JavaScript - Wordpress
Linux - Wordpress
Literature, culture, philosophy - US Politics and administration
Machine Learning - People and ideas
Medicine - Literature, culture, philosophy
People and ideas - Compilers
People words - People and ideas
Programming words - US Politics and administration
Renewable energy - US Politics and administration
Software - US Politics and administration
Texts with codes - US Politics and administration
US Politics and administration - Literature, culture, philosophy
Weird letters - US Politics and administration
Weird words - US Politics and administration
Wordpress - People and ideas


# Average length of article

In [247]:
# Load the content table and join it to the topic-annotated articles on `url`.
# The inner join keeps only URLs present in both tables; `article_length` is
# the number of characters in each row's `text`.
content = pd.read_csv('../data/content-clean.csv')

content_with_articles = (
    content
    .merge(df, how = 'inner', on = 'url')
    .assign(article_length = lambda merged: merged['text'].str.len())
)

In [251]:
# Mean article length per topic, rounded to two decimals and ordered from the
# shortest to the longest average.
mean_length_by_topic = content_with_articles.groupby('topic')['article_length'].mean()
data_plot = (
    mean_length_by_topic
    .round(2)
    .reset_index()
    .sort_values(by = 'article_length')
)

In [261]:
# Horizontal bar chart: one bar per topic, bar length = average article length.
fig = px.bar(
    data_plot,
    x = 'article_length',
    y = 'topic',
    orientation = 'h',
    title = 'Average length of article'
)

# `categoryorder` only applies to a categorical axis. In this horizontal chart
# the topics sit on the y axis, so the ordering is declared there rather than
# on the numeric x axis. 'total ascending' puts the smallest average at the
# bottom and the largest at the top.
fig.update_layout(
    xaxis = {
        'title': 'Average length of article (char)'
    },
    yaxis = {
        'title': 'Topic',
        'categoryorder': 'total ascending'
    }
)

fig.write_image('results/plots-articles-with-topics/avg-length.png')