In [None]:
import numpy as np
import pandas as pd
import re
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
from nltk.corpus import stopwords
from pymongo import MongoClient
from scipy.ndimage import gaussian_gradient_magnitude
from wordcloud import WordCloud, ImageColorGenerator

# Matplotlib 3.6 renamed its bundled seaborn styles to "seaborn-v0_8-*" and
# later dropped the old names; use whichever spelling this install provides.
_colorblind_style = (
    "seaborn-v0_8-colorblind"
    if "seaborn-v0_8-colorblind" in plt.style.available
    else "seaborn-colorblind"
)
plt.style.use(_colorblind_style)
# Draw grid lines and ticks underneath the plotted artists.
plt.rcParams["axes.axisbelow"] = True

In [None]:
# Connect to the local MongoDB instance and grab the two collections.
conn = MongoClient("localhost", 27017)
db = conn.get_database("nl_words")
videos_coll = db.get_collection("videos")
trans_coll = db.get_collection("transcripts")
# find() hands back a cursor that can only be iterated once. Materialize it so
# the DataFrame cell below can be re-run without silently producing an empty
# frame from an exhausted cursor.
transcripts = list(trans_coll.find({}))


In [None]:
## Main stuff here

# One row per transcript document fetched from MongoDB.
main_df = pd.DataFrame(data=transcripts)

# Broadcast each year's mean word count back onto every video of that year.
main_df["word_count_mean_by_year"] = (
    main_df
    .groupby(main_df["date"].dt.year)["word_count"]
    .transform("mean")
)

# Channel-wide headline numbers.
total_videos = main_df.shape[0]
total_unique_dates = main_df["date"].unique().size
total_transcripts = main_df[main_df["text"].notna()].shape[0]
total_words = main_df["word_count"].sum()

In [None]:
## Solo stuff here

# Title fragments that mark a video as NOT solo. They are kept as a list and
# joined with "|" so a separator can never be dropped between two entries
# (the old hand-concatenated string fused "blood bowl" and "carry dan" into
# one alternative that matched neither).
exclude_terms = [
    "nlss", "team unity", "northernlion live super show", "roundtable", "mathas", "blood bowl",
    "carry dan", "champions of salt", "multiplayer", "geoguessr", "with friends", "factorio",
    "tournament", "and friends", "and sinvicta", "and ohmwrecker", "with dangheesling",
    "w/ dangheesling", "rocket league", "podcast", "speed bowl", "spy vs spy", "tutty", "teaam unity",
]
exclude_string = "|".join(exclude_terms)

# na=False treats a missing title as "no match" instead of producing NaN,
# which would make the ~ inversion fail.
is_collab = main_df["title"].str.contains(exclude_string, flags=re.IGNORECASE, na=False)
# .copy() gives solo_df its own data so the column assignment below does not
# write to a slice of main_df (SettingWithCopyWarning).
solo_df = main_df[~is_collab].copy()
solo_df["word_count_mean_by_year"] = solo_df.groupby(solo_df["date"].dt.year)["word_count"].transform("mean")
total_solo_videos = len(solo_df)
total_solo_unique_dates = len(solo_df["date"].unique())
# Build the mask from solo_df itself; a main_df mask has a different index.
total_solo_transcripts = len(solo_df[solo_df["text"].notnull()])
total_solo_words = solo_df["word_count"].sum()

In [None]:
## Isaac stuff here

# Videos whose title mentions "binding of isaac" (case-insensitive).
# na=False treats a missing title as "no match" so the mask stays boolean.
is_isaac = main_df["title"].str.contains("binding of isaac", flags=re.IGNORECASE, na=False)
# .copy() gives isaac_df its own data so the column assignment below does not
# write to a slice of main_df (SettingWithCopyWarning).
isaac_df = main_df[is_isaac].copy()
isaac_df["word_count_mean_by_year"] = isaac_df.groupby(isaac_df["date"].dt.year)["word_count"].transform("mean")
total_isaac_videos = len(isaac_df)
total_isaac_unique_dates = len(isaac_df["date"].unique())
# Build the mask from isaac_df itself; a main_df mask has a different index.
total_isaac_transcripts = len(isaac_df[isaac_df["text"].notnull()])
total_isaac_words = isaac_df["word_count"].sum()

Date Range: 09/22/2010 to 02/20/2020

Channel Totals:

* Total videos: 11,087
* Unique Dates: 11,032
* Videos w/ Transcripts: 8,713
* Total Words: 51,718,622

Summary:

| |Word Count (per video)|
|---|---|
|Minimum|32|
|25%|3,749|
|Median|5,104|
|Mean|5,935.80|
|75%|6,639|
|Maximum|34,845|
|Std|4,445.04|

Solo Totals

* Total videos: 8,624
* Unique Dates: 8,595
* Videos w/ Transcripts: 7,127
* Total Words: 38,087,400

Summary:

| |Word Count (per video)|
|---|---|
|Minimum|32|
|25%|3,979.50|
|Median|5,258|
|Mean|5,344.10|
|75%|6,614|
|Maximum|22,603|
|Std|2,187.43|

*Because of course...*

Isaac Totals

* Total videos: 3,431
* Unique Dates: 3,431
* Videos w/ Transcripts: 2,922
* Total Words: 19,059,293

Summary:

| |Word Count (per video)|
|---|---|
|Minimum|1,689|
|25%|5,330|
|Median|6,289|
|Mean|6,522.69|
|75%|7,494|
|Maximum|16,154|
|Std|1,724.30|

Word Distribution (using solo transcripts)

* Total Words (before stopwords): 38,087,400
* Total Words (after stopwords): 11,413,612
* Unique Words (before stopwords): 138,290
* Unique Words (after stopwords): 106,156

Top 20 Words:



In [None]:
# Overlaid word-count histograms for the three video subsets, sharing one set
# of 50 evenly spaced bin edges from 0 to 36,000 words.
bins = np.linspace(0, 36000, 50)

plt.figure(figsize=(11, 11))
plt.grid(alpha=0.5)

for subset_label, subset_df in (("all", main_df), ("solo", solo_df), ("isaac", isaac_df)):
    plt.hist(subset_df["word_count"], bins, alpha=0.75, edgecolor="black", label=subset_label)

plt.legend(loc="upper right", prop={"size": 15})
plt.title(
    "Comparative Histogram: Word Counts of All Vids, Solo Vids, Isaac Vids",
    fontsize=16,
    pad=10,
)
plt.tick_params(labelsize=12)
plt.ylabel("Occurrences", fontsize=14, labelpad=20)
plt.xlabel("Word Count", fontsize=14, labelpad=20)
plt.savefig("histogram.pdf", bbox_inches="tight")

In [None]:
# One row per calendar year: the first row of each year supplies the x
# position ("date") and that year's mean word count.
main_time_df = main_df.groupby(main_df["date"].dt.year).first()
solo_time_df = solo_df.groupby(solo_df["date"].dt.year).first()
isaac_time_df = isaac_df.groupby(isaac_df["date"].dt.year).first()

plt.figure(figsize=(11, 11))
plt.grid(alpha=0.5)
# plt.plot handles datetime x values directly; plt.plot_date is deprecated in
# recent Matplotlib releases and draws the same thing with these arguments.
for time_df, marker, series_label in (
    (main_time_df, "^", "channel"),
    (solo_time_df, "*", "solo"),
    (isaac_time_df, ".", "isaac"),
):
    plt.plot(
        time_df["date"],
        time_df["word_count_mean_by_year"],
        marker=marker,
        linestyle="-",
        alpha=0.75,
        label=series_label,
    )
plt.legend(loc="upper right", prop={"size": 15})
plt.title("Mean Word Counts by Year", fontsize=16, pad=10)
plt.tick_params(labelsize=12)
plt.ylabel("Word Count", fontsize=14, labelpad=20)
plt.xlabel("Year", fontsize=14, labelpad=20)
plt.savefig("word_counts_by_year.pdf", bbox_inches="tight")

In [None]:
# Polar bar chart with four quarter-circle wedges. Each wedge's radius is that
# subset's word total as a percentage of the channel's word total.
theta = [0, np.pi/2, np.pi, 3 * np.pi/2]
radii = [
    100,
    total_solo_words/total_words * 100,
    total_isaac_words/total_words * 100,
    (total_words - total_solo_words)/total_words * 100,
]
width = np.pi/2
palette = sns.color_palette("colorblind", 4)

fig = plt.figure(figsize=(11, 11))
ax = plt.subplot(111, projection='polar')
ax.bar(theta, radii, width=width, color=palette, alpha=0.75)
ax.set_ylim(top=100)

# Restyle the grid line at index 4 (bold, black) before the ticks are replaced.
emphasized_ring = ax.get_ygridlines()[4]
emphasized_ring.set_linewidth(2)
emphasized_ring.set_color("black")

ax.spines["polar"].set_visible(False)
ax.set_title("Proportion Comparison of Word Counts", fontsize=16, pad=10)
ax.set_xticklabels([])
ax.set_rticks([20, 40, 60, 80, 100])
ax.set_yticklabels([])
ax.set_rlabel_position(20)

# The built-in radial labels are blanked above; write percentage captions by
# hand inside the first wedge instead.
for caption_radius, caption in ((8, "20%"), (25, "40%"), (45, "60%"), (65, "80%"), (85, "100%")):
    ax.text(np.pi/8, caption_radius, caption, color="white", fontsize=12)

# Legend handles, one per wedge, in palette order.
channel_patch, solo_patch, isaac_patch, collab_patch = (
    mpatches.Patch(color=wedge_color, label=wedge_label)
    for wedge_color, wedge_label in zip(palette, ("channel", "solo", "isaac", "collab"))
)
ax.legend(
    handles=[channel_patch, solo_patch, isaac_patch, collab_patch],
    loc="upper right",
    bbox_to_anchor=(1.07, 1.06),
    prop={"size": 15},
)
fig.savefig("proportion_comparison.pdf", bbox_inches="tight")

In [None]:
# Stop words: NLTK's English list, plus the project's custom list, plus "gonna".
stop_words = set(stopwords.words("english"))

with open("Stop_Words.txt", "r") as file:
    custom_stop_words = set(file.read().splitlines())

stop_words = stop_words.union(custom_stop_words)
stop_words.add("gonna")

# Count every lower-cased word across the solo videos.
solo_word_dist = {}

solo_word_lists = solo_df["words"] if "words" in solo_df.columns else []
for words in solo_word_lists:
    # Rows without a word list can surface as None or NaN. NaN is truthy, so a
    # plain truthiness check would let it through and fail on iteration.
    if not isinstance(words, (list, tuple)):
        continue
    for word in words:
        word = word.lower()
        solo_word_dist[word] = solo_word_dist.get(word, 0) + 1

# Most frequent first; dicts preserve insertion order, so this ordering is what
# later cells see when they slice the keys and values.
sorted_solo_word_dist = dict(sorted(solo_word_dist.items(), key=lambda x: x[1], reverse=True))
solo_initial_num_words = len(solo_word_dist)

# Drop stop words from the sorted distribution only; solo_word_dist keeps them.
for word in stop_words:
    sorted_solo_word_dist.pop(word, None)

solo_final_num_words = len(sorted_solo_word_dist)
total_solo_final_words = sum(sorted_solo_word_dist.values())

In [None]:
# Unique words before stop-word removal, unique words after, and total
# occurrences after, one value per line.
print(
    solo_initial_num_words,
    solo_final_num_words,
    total_solo_final_words,
    sep="\n",
)

In [None]:
# Preview only the 50 most frequent words; rendering the whole distribution
# would flood the notebook output.
dict(list(sorted_solo_word_dist.items())[:50])

In [None]:

# Bar chart of the most frequent solo-video words (up to 20).
# sorted_solo_word_dist is ordered by descending count, so the first items are
# the top words.
top_words = list(sorted_solo_word_dist.items())[:20]
top_labels = [word for word, _ in top_words]
top_counts = [count for _, count in top_words]
positions = range(len(top_counts))

fig, ax = plt.subplots(figsize=(11, 11))
ax.bar(positions, top_counts, align="center", color=sns.color_palette("colorblind", 5),
       alpha=0.75)
ax.grid(alpha=0.5, axis="y")
ax.set_xticks(positions)
ax.set_xticklabels(top_labels, rotation="vertical", fontsize=14)
# Only resize the y tick labels. Hard-coding their text without also pinning
# the tick positions can label the axis with values that do not match the data.
ax.tick_params(axis="y", labelsize=14)
ax.set_title("Solo Videos Top 20 Words", fontsize=16, pad=10)
# Write each bar's exact count vertically near the base of the bar.
for i, value in enumerate(top_counts):
    ax.text(i, 3000, f"{value:,}", color="white", rotation="vertical", horizontalalignment="center", fontweight="bold",
            fontsize=12)
ax.yaxis.get_label().set_fontsize(14)
fig.savefig("solo_top_20.pdf", bbox_inches="tight")



In [None]:
# Load the logo as a pixel array; a copy of it becomes the word-cloud mask.
logo_color = np.array(Image.open("logo.jpg"))
logo_mask = logo_color.copy()

# Edge strength: Gaussian gradient magnitude (sigma 3) of each of the first
# three channels, scaled by 1/50 and averaged across channels.
channel_gradients = [
    gaussian_gradient_magnitude(logo_color[:, :, channel] / 50., 3)
    for channel in range(3)
]
edges = np.mean(channel_gradients, axis=0)
# Zero out mask pixels that sit on strong edges.
logo_mask[edges > .09] = 0

# Build the cloud from the stop-word-filtered frequencies; random_state is
# fixed so the layout is reproducible.
wordcloud = WordCloud(
    max_words=8000,
    mask=logo_mask,
    max_font_size=30,
    random_state=40,
    relative_scaling=0,
).generate_from_frequencies(frequencies=sorted_solo_word_dist)

# Recolor the words using colors sampled from the original logo image.
wordcloud_colors = ImageColorGenerator(logo_color)
wordcloud.recolor(color_func=wordcloud_colors)

plt.figure(figsize=(9, 9))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.savefig("wordcloud.png", bbox_inches="tight", pad_inches=0)