In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Apply matplotlib's 'fivethirtyeight' style sheet to the figures drawn below.
plt.style.use('fivethirtyeight')

In [None]:
# Load the two input tables; the paths are relative to the notebook's working directory.
subreddits = pd.read_csv('../data/subreddits.csv')
submissions = pd.read_csv('../data/submissions.csv')

# Subreddits

In [None]:
# Preview the first rows of the subreddit table.
subreddits.head()

In [None]:
# Translate the numeric `table_number` code into a readable `type` label.
# A label's position in the tuple is the code it stands for (0-6); codes that
# are not listed map to NaN.
subreddits['type'] = subreddits['table_number'].map(dict(enumerate((
    'Games and series',
    'Tabletop',
    'Nonspecific',
    'Genres',
    'Groups',
    'Platforms',
    'Companies',
))))

In [None]:
# Mean subscriber count per subreddit type, largest first, shown as a bar chart.
df = (
    subreddits
    .groupby('type', as_index=False)['subscribers']
    .mean()
    .sort_values(by='subscribers', ascending=False)
)
sns.barplot(x='type', y='subscribers', data=df)

In [None]:
# Express subscriber counts in thousands.
subreddits['subscribers_thousands'] = subreddits['subscribers'].div(1000.0)


# Submissions

In [None]:
# Preview the first rows of the submissions table.
submissions.head()

In [None]:
# Per-subreddit mean score and mean comment count, joined to the subreddit
# table (submissions.subreddit == subreddits.display_name) and ordered by
# subscriber count, largest first.
df = (
    submissions
    .groupby('subreddit', as_index=False)[['score', 'num_comments']]
    .mean()
    .rename(columns={'score': 'mean_score', 'num_comments': 'mean_num_comments'})
    .merge(subreddits, left_on='subreddit', right_on='display_name')
    .sort_values(by='subscribers', ascending=False)
    # Recomputed here so the column exists regardless of what the merge brought in.
    .assign(subscribers_thousands=lambda merged: merged['subscribers'] / 1000.00)
)

# Subscribers (in thousands) against mean comments per submission, coloured by type.
sns.scatterplot(x='subscribers_thousands', y='mean_num_comments', hue='type', data=df)
plt.show()

In [None]:
# Collapse the per-subreddit figures to one row of means per type.
df = (
    df
    .groupby('type', as_index=False)[['mean_score', 'mean_num_comments', 'subscribers_thousands']]
    .mean()
)

# Bar chart of mean comments per type, tallest bar first.
sns.barplot(x='type', y='mean_num_comments',
            data=df.sort_values(by='mean_num_comments', ascending=False))

## Language analysis

In [None]:
import nltk

In [None]:
# Subreddit-specific Self-Text Corpus
# Build one text blob per subreddit from its submissions' self-texts.
# Rows without a selftext are dropped first. The texts are joined with a
# newline rather than summed: summing concatenates with no separator, which
# fuses the last word of one post with the first word of the next and skews
# tokenization. `corpus_len` is the blob's length in characters (including
# the newline separators).
df = (
    submissions
    .dropna(subset=['selftext'])
    .groupby('subreddit', as_index=False)
    .agg({'selftext': '\n'.join})
    .assign(corpus_len=lambda corpus: corpus['selftext'].str.len())
    .sort_values('corpus_len', ascending=False)
)
# Both names refer to the same frame, as before.
subreddit_top10_corpus = df
subreddit_top10_corpus.head()

In [None]:
# Bar chart of corpus length for the first 20 rows of the corpus frame.
# Only `corpus_len` is selected: other cells add further columns (e.g.
# `token_ct`) to this frame in place, and an unrestricted DataFrame bar plot
# would also draw those whenever this cell is re-run afterwards.
ax = (
    subreddit_top10_corpus
    .set_index('subreddit')[['corpus_len']]
    .head(20)
    .plot.bar()
)
ax.set_ylabel('corpus length')
ax.set_title('Corpus Length of Subreddits\' Top 10 submissions')

In [None]:
# Split each subreddit's corpus into word tokens.
# NOTE(review): word_tokenize needs NLTK tokenizer data; the one-time setup
# call left disabled here was nltk.download('punkt') -- the resource name may
# differ by NLTK version, confirm before a fresh run.
subreddit_top10_corpus['tokens'] = subreddit_top10_corpus['selftext'].map(nltk.word_tokenize)
subreddit_top10_corpus.loc[:, ['subreddit', 'tokens']].head()

In [None]:
# Count the tokens in each corpus, then chart the 20 rows with the largest corpus_len.
subreddit_top10_corpus['token_ct'] = subreddit_top10_corpus['tokens'].map(len)
ax = (
    subreddit_top10_corpus
    .sort_values(by='corpus_len', ascending=False)
    .head(20)
    .set_index('subreddit')
    .plot.bar()
)
plt.show()

## Comments

In [None]:
# Load the comments table; the path is relative to the notebook's working directory.
comments = pd.read_csv('../data/comments.csv')

In [None]:
# Show which columns the comments table provides.
comments.columns

In [None]:
# Download NLTK's 'stopwords' corpus, which the stop-word set below is built from.
nltk.download('stopwords')

In [None]:
from nltk.corpus import stopwords
# English stop words held in a set for membership tests in the filters below.
stop_words = set(stopwords.words('english'))

In [None]:
# Tokenize every comment body into a list of word tokens.
# read_csv loads empty fields as NaN (a float), which word_tokenize cannot
# process, so a missing body is treated as empty text and yields [].
comments['body_tokens'] = comments['body'].fillna('').apply(nltk.word_tokenize)

In [None]:
# Drop English stop words. The membership test is case-insensitive, but the
# tokens that survive keep their original casing.
comments['body_tokens_filtered'] = comments['body_tokens'].map(
    lambda toks: [tok for tok in toks if tok.lower() not in stop_words]
)

In [None]:
# Lower-cased copy of every token list.
comments['body_tokens_lower'] = comments['body_tokens'].map(
    lambda toks: [tok.lower() for tok in toks]
)

In [None]:
# Lower-cased tokens with English stop words removed.
comments['body_tokens_lower_filtered'] = comments['body_tokens_lower'].map(
    lambda toks: [tok for tok in toks if tok not in stop_words]
)

Issues:
1. punctuation
2. case
3. links
4. markdown syntax tokens

Entire comment corpus:

In [None]:
# Print every filtered token across all comments as one flat list.
# The lists are flattened in a single pass; Series.sum() would instead add the
# lists pairwise, copying the growing result on every step (quadratic time).
print([tok for toks in comments['body_tokens_filtered'] for tok in toks])

Total corpus size:

In [None]:
# Total number of lower-cased, stop-word-filtered tokens across all comments.
# Per-row lengths are summed directly, so the full concatenated token list is
# never built, and an empty table gives 0 instead of an error.
int(comments['body_tokens_lower_filtered'].map(len).sum())