In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import json

%matplotlib inline

In [6]:
from sklearn.metrics import silhouette_score
from sklearn.cluster import KMeans, DBSCAN, hierarchical, AgglomerativeClustering
from sklearn.manifold import TSNE



# Load Data

In [2]:
# Prefix that is string-concatenated onto the dataset paths in the loading
# cell; a non-empty value therefore needs its own trailing path separator.
root_path = ''

In [3]:
# The two hyperlink tables are tab-separated; the embeddings file is read with
# no header row, and its first column (label 0) is renamed to 'sub'.
df_title = pd.read_csv(f'{root_path}data/soc-redditHyperlinks-title.tsv', sep='\t')
df_body = pd.read_csv(f'{root_path}data/soc-redditHyperlinks-body.tsv', sep='\t')
df_embeddings = pd.read_csv(
    f'{root_path}data/web-redditEmbeddings-subreddits.csv',
    header=None,
).rename(columns={0: 'sub'})

# Parse timestamps on both tables before stacking them, so the combined frame
# carries datetime values as well.
df_title['TIMESTAMP'] = pd.to_datetime(df_title['TIMESTAMP'])
df_body['TIMESTAMP'] = pd.to_datetime(df_body['TIMESTAMP'])

# Title links followed by body links; each table keeps its original row labels.
df_all = pd.concat([df_title, df_body])

# Functions

In [4]:
def project_2d(df, perplex=25, n_iter=5000, random_state=42):
    """Project the rows of ``df`` onto two dimensions with t-SNE.

    Parameters
    ----------
    df : array-like
        Feature matrix handed unchanged to ``TSNE.fit_transform``
        (presumably one row per item and one numeric column per feature).
    perplex : float, default 25
        Forwarded as the t-SNE ``perplexity``.
    n_iter : int, default 5000
        Iteration budget for the optimiser.
    random_state : int, default 42
        Seed forwarded to t-SNE.

    Returns
    -------
    pandas.DataFrame
        Columns ``'x'`` and ``'y'``, one row per input row, with a default
        0..n-1 index (the index of ``df`` is not carried over).
    """
    import inspect

    # scikit-learn 1.5 renamed TSNE's ``n_iter`` to ``max_iter`` and 1.7
    # dropped the old name, so pick whichever keyword this install accepts.
    iter_kwarg = 'max_iter' if 'max_iter' in inspect.signature(TSNE).parameters else 'n_iter'

    tsne = TSNE(n_components=2,
                verbose=1,  # emits the "[t-SNE] ..." progress lines
                perplexity=perplex,
                n_jobs=-1,
                random_state=random_state,
                **{iter_kwarg: n_iter})
    tsne_results = tsne.fit_transform(df)
    return pd.DataFrame(tsne_results, columns=['x', 'y'])

def reduce_subs(count):
    """Return the embedding rows of the most frequently linked subreddits.

    Takes the ``count`` most frequent ``SOURCE_SUBREDDIT`` values and the
    ``count`` most frequent ``TARGET_SUBREDDIT`` values in the notebook-level
    ``df_all``, unions the two name sets (so up to ``2 * count`` names), and
    returns the rows of the notebook-level ``df_embeddings`` whose ``'sub'``
    value is in that union. Row labels of ``df_embeddings`` are preserved.
    """
    top_sources = df_all['SOURCE_SUBREDDIT'].value_counts().index[:count]
    top_targets = df_all['TARGET_SUBREDDIT'].value_counts().index[:count]
    keep_subs = set(top_sources) | set(top_targets)
    return df_embeddings[df_embeddings['sub'].isin(keep_subs)]

def plot_tsne(df):
    """Draw a static scatter plot of the ``'x'`` / ``'y'`` columns of ``df``.

    Opens a new 16x10 matplotlib figure and plots every row as a
    semi-transparent point. Nothing is returned.
    """
    scatter_kwargs = dict(x='x', y='y', data=df, legend="full", alpha=0.3)
    plt.figure(figsize=(16, 10))
    sns.scatterplot(**scatter_kwargs)

# Projections

In [18]:
# The subreddit selection does not depend on perplexity, so build it once
# rather than recomputing it on every pass of the loop.
data = reduce_subs(2500)
embeddings = data.iloc[:, 1:]        # drop the leading 'sub' column, keep the rest
sub_names = data['sub'].to_numpy()   # names in row order, detached from data's index

# One t-SNE projection per perplexity value: 10, 20, 30, 40, 50.
perplex_sets = []
for i in range(1, 6):
    tsne_result = project_2d(embeddings, i * 10, n_iter=10_000)
    # project_2d already returns an x/y frame with a fresh 0..n-1 index, so
    # the names are attached positionally instead of re-wrapping the frame.
    df_tsne = tsne_result.assign(sub=sub_names)
    perplex_sets.append(df_tsne)

[t-SNE] Computing 31 nearest neighbors...
[t-SNE] Indexed 3339 samples in 0.231s...
[t-SNE] Computed neighbors for 3339 samples in 1.533s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3339
[t-SNE] Computed conditional probabilities for sample 2000 / 3339
[t-SNE] Computed conditional probabilities for sample 3000 / 3339
[t-SNE] Computed conditional probabilities for sample 3339 / 3339
[t-SNE] Mean sigma: 0.340466
[t-SNE] KL divergence after 250 iterations with early exaggeration: 86.331619
[t-SNE] KL divergence after 10000 iterations: 1.465147
[t-SNE] Computing 61 nearest neighbors...
[t-SNE] Indexed 3339 samples in 0.234s...
[t-SNE] Computed neighbors for 3339 samples in 1.457s...
[t-SNE] Computed conditional probabilities for sample 1000 / 3339
[t-SNE] Computed conditional probabilities for sample 2000 / 3339
[t-SNE] Computed conditional probabilities for sample 3000 / 3339
[t-SNE] Computed conditional probabilities for sample 3339 / 3339
[t-SNE] Mean sigma: 0.405631

In [27]:
# Interactive scatter of the first stored projection (the perplexity-10 run);
# hovering over a point shows its 'sub' value.
fig = px.scatter(perplex_sets[0], x="x", y="y",
                hover_name="sub", hover_data=["sub"])
fig.show()

In [19]:
# Send the BEL character through the shell's printf, presumably as an audible
# signal that the long-running t-SNE cell has finished. os.system returns the
# command's exit status (0 on success).
import os
os.system("printf '\a'")

0

In [20]:
import pickle

In [21]:
# Write the list of projection frames to disk in the working directory,
# presumably so the slow t-SNE loop does not have to be re-run.
with open('tsne_coors.pkl', 'wb') as f:
    pickle.dump(perplex_sets, f)

## Filter Views

In [45]:
def display_plot(df):
    """Show an interactive plotly scatter of ``df``.

    Plots column ``'x'`` against ``'y'`` and uses column ``'sub'`` for the
    hover label. The figure is displayed immediately; nothing is returned.
    """
    scatter_opts = {"x": "x", "y": "y", "hover_name": "sub", "hover_data": ["sub"]}
    px.scatter(df, **scatter_opts).show()
    
def filter_tag(tag_set, df):
    """Return the rows of ``df`` whose ``'sub'`` value is a member of ``tag_set``.

    Row order and row labels of ``df`` are kept; ``df`` itself is not modified.
    """
    is_member = df['sub'].isin(tag_set)
    return df[is_member]

In [32]:
# Load the tag mapping (keys are tag names; values are presumably lists of
# subreddit names). NOTE(review): unlike the dataset paths, this path is not
# prefixed with root_path.
with open('data/tag_filter.json', 'r') as f:
    tag_filter = json.load(f)
    

In [42]:
# Convert each tag's members to a set, in place, for fast membership tests.
# Only existing keys are reassigned, so the dict's size never changes while
# it is being iterated.
for k, members in tag_filter.items():
    tag_filter[k] = set(members)

In [39]:
# Use the first stored projection (the perplexity-10 run) as the base for the
# filtered views below. This is a reference to the same frame, not a copy.
df_base = perplex_sets[0]

In [47]:
# Points whose subreddit is listed under the 'Art' tag.
# NOTE(review): the result is not displayed here, and the following cell
# rebinds df_subset to the 'General' tag.
df_subset = filter_tag(tag_filter['Art'], df_base)

In [54]:
# Restrict the base projection to subreddits listed under the 'General' tag
# and show them in an interactive scatter.
df_subset = filter_tag(tag_filter['General'], df_base)
display_plot(df_subset)

In [49]:
# List the available tag names (the valid keys for tag_filter[...]).
tag_filter.keys()

dict_keys(['General', 'Discussion', 'Gaming', 'Video Games', 'Counter-Strike', 'Destiny', 'Humor and Parody', 'News and Politics', 'Republican', 'Videos', 'League of Legends', 'Pictures', 'Personal Narratives', 'Lifestyle', 'Pets and Animals', 'Memes', 'Adult and NSFW', 'World News', 'Music', 'Politics', 'DoTA', 'Meta', 'Drugs', 'Marijuana', 'Informative', 'Technology', 'Hardware', 'Entertainment', 'Movies', 'Overwatch', 'Business', 'Hearthstone', 'Science', 'Sports', 'Wrestling', 'Basketball', 'Hip Hop', 'Soccer', 'FIFA', 'Mobile', 'iOS', 'WoW', 'Locations', 'World', 'India', 'RuneScape', 'Relationships', 'Pokemon', 'Hobbies and Interests', 'Writing', 'Fitness and Nutrition', 'Art', 'Food and Beverages', 'Health', 'Smoking and Tobacco', 'Football', 'PlayStation', 'Smite', 'Xbox', 'Dating', 'Personal Finance', 'Internet', 'Cryptocurrency', 'Travel', 'Shopping', 'Anime', 'Android', 'Super Smash Bros', 'Television', 'Cats', 'Depression and Anxiety', 'MMA', 'Fashion and Beauty', 'Accessor

In [83]:
def dataset_filter():
    """Placeholder, not implemented yet: takes no arguments and returns ``None``."""

def gen_link_data(df):
    """Count rows of ``df`` per (source, target, sentiment) combination.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``SOURCE_SUBREDDIT``, ``TARGET_SUBREDDIT``
        and ``LINK_SENTIMENT``.

    Returns
    -------
    pandas.Series
        Unnamed Series of row counts, indexed by a
        (SOURCE_SUBREDDIT, TARGET_SUBREDDIT, LINK_SENTIMENT) MultiIndex and
        sorted with the largest count first.
    """
    group_keys = ['SOURCE_SUBREDDIT', 'TARGET_SUBREDDIT', 'LINK_SENTIMENT']

    # Keep the keys in the index (no as_index=False): on pandas >= 1.1 that
    # flag makes .size() return a DataFrame, and sort_values() without `by`
    # then raises.
    volume = df.groupby(group_keys).size().sort_values(ascending=False)

    # TODO: derive the node (x, y) and link (source, target) tables from
    # `volume`. Until they exist, return the counts rather than undefined
    # names, which raised NameError on every call.
    return volume


def get_sent_split():
    """Placeholder, not implemented yet: takes no arguments and returns ``None``."""

In [100]:
# Link volume over the combined title + body table.
# NOTE(review): the following cell calls .to_frame() on this value, so it
# expects gen_link_data to return the grouped count Series.
result = gen_link_data(df_all)

In [101]:
# Flatten the count Series into a regular frame: the group keys become
# columns and the unnamed value column (label 0) is renamed to 'count'.
# NOTE(review): this rebinds `result`, so the cell is not idempotent;
# re-running it without first re-running the previous cell fails because a
# DataFrame has no .to_frame().
result = result.to_frame().reset_index().rename({0:'count'}, axis=1)

In [103]:
# Number of rows in `result` per sentiment value. Each row is one distinct
# (source, target, sentiment) combination, so this counts combinations, not
# individual hyperlinks (those are in the 'count' column).
result.LINK_SENTIMENT.value_counts()

 1    321539
-1     41170
Name: LINK_SENTIMENT, dtype: int64