# Imports

In [None]:
import sys
sys.path.append('../')

import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans, AgglomerativeClustering, FeatureAgglomeration
from scipy.spatial import distance
import matplotlib.pyplot as plt
from embedding_functions_hugo.embedding_functions import *

# Data Grabbing

In [None]:
# subreddit dataframes with comments (one scraped CSV per subreddit)
df_gaming = pd.read_csv('../data/scrapes/gaming.csv')
df_satis = pd.read_csv('../data/scrapes/SatisfactoryGame.csv')
df_marauders = pd.read_csv('../data/scrapes/MaraudersGame.csv')
df_tarkov = pd.read_csv('../data/scrapes/EscapefromTarkov.csv')
df_politics = pd.read_csv('../data/scrapes/politics.csv')
# NOTE: antiwork is read from a dated scrape folder, not from ../data/scrapes/
df_antiwork = pd.read_csv('../data/date_folders/april_17/scrapes/antiwork.csv')

# loading embeddings (previously saved .npy arrays; no antiwork file is loaded here)
politics_embeddings = np.load('../data/embeddings/politics_embeddings.npy')
gaming_embeddings = np.load('../data/embeddings/gaming_embeddings.npy')
marauders_embeddings = np.load('../data/embeddings/marauders_embeddings.npy')
tarkov_embeddings = np.load('../data/embeddings/tarkov_embeddings.npy')
satisfactory_embeddings = np.load('../data/embeddings/satisfactory_embeddings.npy')

# loading large embeddings (stored separately under ../data/big_embeddings/)
# NOTE(review): presumably one row per unique author rather than per comment,
# judging by how most_distant() consumes them - confirm against the export script
politics_embeddings_large = np.load('../data/big_embeddings/politics.npy')
gaming_embeddings_large = np.load('../data/big_embeddings/gaming.npy')
tarkov_embeddings_large = np.load('../data/big_embeddings/Tarkov.npy')
marauders_embeddings_large = np.load('../data/big_embeddings/Marauders.npy')
satisfactory_embeddings_large = np.load('../data/big_embeddings/Satisfactory.npy')

# Functions

In [None]:
# shortening and cleaning function
def shorten_and_clean_dataset(comment_csv, comment_column: str, desired_comment_length: int):
    '''
    Read a comment CSV and add a cleaned and a shortened version of every comment.

    inputs:
     - comment_csv: path (or anything pd.read_csv accepts) of the scraped comments
     - comment_column: name of the column holding the raw comment text
     - desired_comment_length: length limit handed to shorten_sens

    Returns the dataframe with two extra columns: 'cleaned_text' (output of
    prep_pipeline) and 'short' (output of shorten_sens on 'cleaned_text').
    '''
    comments = pd.read_csv(comment_csv)

    cleaned = prep_pipeline(comments, comment_column)
    comments['cleaned_text'] = cleaned

    shortened = shorten_sens(comments['cleaned_text'], desired_comment_length)
    comments['short'] = shortened

    return comments

# function for creating and saving embeddings
def save_embeddings_as_npy(destination_path: str, comment_csv, comment_column: str, desired_comment_length: int):
    '''
    Nlp pipeline function: reads a comment CSV, cleans and shortens the comments,
    embeds the shortened comments and saves the embeddings as a .npy file.

    inputs:
     - destination_path: file path to write to; np.save appends '.npy' when the
       path does not already end with it, so pass a file name, not a directory
     - comment_csv: path of the CSV containing the comments
     - comment_column: name of the column holding the raw comment text
     - desired_comment_length: length limit used when shortening comments

    Returns the embeddings that were written, so callers can use them directly
    (np.save itself returns None, which is what this function used to hand back).
    '''
    sentences = shorten_and_clean_dataset(comment_csv, comment_column, desired_comment_length)
    embeddings = embed_comments(sentences['short'])
    np.save(destination_path, embeddings)
    return embeddings

def pair_users_embeddings(dataframe, embeddings, average_out_comments=False):
    '''
    Group comment embeddings by the author who wrote them.

    inputs:
     - dataframe: must provide a 'comment_author' column, walked in lockstep with embeddings
     - embeddings: one embedding per entry of 'comment_author' (extra items on either side are ignored by zip)
     - average_out_comments: when True, every author's embeddings are averaged into a single value

    Returns a dict keyed by author, in order of first appearance. Values are lists of
    embeddings, or the per-author mean when average_out_comments is True.
    '''
    embeddings_by_author = {}
    for author, vector in zip(dataframe['comment_author'], embeddings):
        embeddings_by_author.setdefault(author, []).append(vector)

    if not average_out_comments:
        return embeddings_by_author

    return {
        author: sum(vectors) / len(vectors)
        for author, vectors in embeddings_by_author.items()
    }

# Other prep

## Adding Cleaned Comments

In [None]:
# Give every subreddit dataframe a cleaned ('cleaned_text') and a shortened
# ('short', limit 50) version of its comments. The frames are processed in the
# same order as before and are modified in place.
for subreddit_df in (df_politics, df_gaming, df_tarkov, df_marauders, df_satis, df_antiwork):
    subreddit_df['cleaned_text'] = prep_pipeline(subreddit_df, 'comment_text', loud=False)
    subreddit_df['short'] = shorten_sens(subreddit_df['cleaned_text'], 50)

# Specific inspection

### Antiwork example where user posts something with broad negativity

In [None]:
# Write the antiwork embeddings to a real file (the old destination was a bare
# directory, which made np.save create a file literally called '.npy') and read
# them back from disk instead of relying on the function's return value.
antiwork_embeddings_path = '../data/embeddings/antiwork_embeddings.npy'
save_embeddings_as_npy(antiwork_embeddings_path,
                       '../data/date_folders/april_17/scrapes/antiwork.csv',
                       'comment_text',
                       50)
antiwork_embeddings = np.load(antiwork_embeddings_path)

# one averaged embedding per antiwork author
antiwork_user_embeddings = pair_users_embeddings(df_antiwork, antiwork_embeddings, True)

In [None]:
# show which columns the antiwork dataframe currently has
print(df_antiwork.columns)

In [None]:
# distinct post titles in the antiwork scrape (used to pick a post to inspect)
print(df_antiwork['post_title'].unique())

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Build the analyzer once; it used to be re-created (together with the import
# and the function definition) for every matching row.
vader_analyzer = SentimentIntensityAnalyzer()


def sentiment_vader(sentence):
    '''Return the 'compound' entry of VADER's polarity scores for `sentence`.'''
    return vader_analyzer.polarity_scores(sentence)['compound']


target_title = 'Republicans are not pro life. And they need to be stopped immediately. They’re anti humanity'

# NOTE(review): this reads 'text_clean', while the cleaning cell in this notebook
# writes 'cleaned_text' - confirm the scraped CSV really ships a 'text_clean' column.
for idx, row in df_antiwork.loc[df_antiwork['post_title'] == target_title].iterrows():
    print(row['post_url'])
    print('========================================================')
    print(row['comment_author'])
    print(row['text_clean'])
    print(sentiment_vader(row['text_clean']))

### Strong Antiwork Post
"Republicans are not pro life. And they need to be stopped immediately. They’re anti humanity"

### Strong comment left on post (by LadyMageCOH, sentiment -0.9921):

republicans never have been prolife  one of the carolinas is trying to pass or may have already passed a bill that classes abortion as felony murder  felony murder in the same state is potentially punishable by the death penalty  

so if a woman is raped ends up pregnant and aborts the pregnancy could end up getting more time than her rapist and possibly get executed  

even more appalling  many of these abortion bans that have come into effect since the dobbs decision are worded so poorly that they could get a woman arrested for a spontaneous abortion aka miscarriage

### Shorter strong comment left on post (by Southern_Nature_5416, sentiment -0.7783):

texas wants to be like australia but instead of animals trying to kill you it is republicans

# Comment Inspection

## Quick Prep (r/politics)

In [None]:
# One averaged embedding per r/politics author (dict: author -> mean embedding)
politics_user_embeddings = pair_users_embeddings(df_politics, politics_embeddings, True)

# Set PCA to desired number of dimensions
pca = PCA(n_components=2)

# Row i of pca_embeddings corresponds to the i-th key of politics_user_embeddings,
# because the PCA input is built from that dict's values in order.
pca_embeddings = pca.fit_transform(list(politics_user_embeddings.values()))

# Two clusters, fixed seed so the labels are reproducible between runs
kmeans = KMeans(n_clusters=2, random_state=0)

# Cluster label per user, computed on the 2-D PCA coordinates
classes = kmeans.fit_predict(pca_embeddings)

## Horizontally distant

In [None]:
# Users at the two extremes of the first PCA component (x axis).
# NOTE: assumes pca_embeddings still holds the r/politics projection, i.e. that
# row i belongs to the i-th key of politics_user_embeddings.
politics_usernames = list(politics_user_embeddings.keys())
x_vals = [point[0] for point in pca_embeddings]

# user with the smallest x coordinate
least_x = min(x_vals)
least_x_index = np.argmin(x_vals)
least_x_username = politics_usernames[least_x_index]
least_x_comments = df_politics.loc[df_politics['comment_author'] == least_x_username]

# user with the largest x coordinate
max_x = max(x_vals)
max_x_index = np.argmax(x_vals)
max_x_username = politics_usernames[max_x_index]
max_x_comments = df_politics.loc[df_politics['comment_author'] == max_x_username]

# only the first comment of each user is shown
print('===== Lowest x coord comment =====')
print(least_x_comments['comment_text'].values[0])
print('\n===== Highest x coord comment =====')
print(max_x_comments['comment_text'].values[0])

## Vertically distant

In [None]:
# Users at the two extremes of the second PCA component (y axis).
# NOTE: assumes pca_embeddings still holds the r/politics projection, i.e. that
# row i belongs to the i-th key of politics_user_embeddings.
politics_usernames = list(politics_user_embeddings.keys())
y_vals = [point[1] for point in pca_embeddings]

# user with the smallest y coordinate
least_y = min(y_vals)
least_y_index = np.argmin(y_vals)
least_y_username = politics_usernames[least_y_index]
least_y_comments = df_politics.loc[df_politics['comment_author'] == least_y_username]

# user with the largest y coordinate
max_y = max(y_vals)
max_y_index = np.argmax(y_vals)
max_y_username = politics_usernames[max_y_index]
max_y_comments = df_politics.loc[df_politics['comment_author'] == max_y_username]

# only the first comment of each user is shown
print('===== Lowest y coord comment =====')
print(least_y_comments['comment_text'].values[0])
print()
print('===== Highest y coord comment =====')
print(max_y_comments['comment_text'].values[0])

## Most distant (in progress)

In [None]:
def _farthest_neighbours(points):
    '''
    For every row of `points`, find the row that lies farthest away (euclidean).

    Returns (max_distances, partner_indexes), both indexed like `points`.
    Ties go to the lowest index; if every distance is 0 the partner index is 0.
    '''
    points = np.asarray(points, dtype=float)
    max_distances = np.zeros(len(points))
    partner_indexes = np.zeros(len(points), dtype=int)
    for idx, point in enumerate(points):
        # one vectorised pass per row keeps memory at O(n) instead of an n x n matrix
        dists = np.sqrt(((points - point) ** 2).sum(axis=1))
        partner = np.argmax(dists)
        partner_indexes[idx] = partner
        max_distances[idx] = dists[partner]
    return max_distances, partner_indexes


def most_distant(df, embeddings, n=3, large_embeds=True):
    '''
    inputs:
     - df: df to work with (needs 'comment_author' and 'short' columns)
     - embeddings: embeddings to work with
     - n: number of most distant user-pairs to print (n <= 0 prints nothing)
     - large_embeds: True when `embeddings` holds one row per unique author (in order
       of first appearance in df); False when it holds one row per comment. In the
       latter case only the last embedding of each author is kept.

    Function reduces the embeddings to 2 PCA dimensions, plots them coloured by a
    2-cluster KMeans, and finds for every user the user that is farthest away.
    The n users with the largest such distance are printed together with their
    partner (first 'short' comment of each), in ascending order of distance.
    A mutually farthest pair can therefore show up twice (a -> b and b -> a).
    '''
    if large_embeds:
        # unique authors in order of first appearance
        username_list = list(dict.fromkeys(df['comment_author']))
    else:
        username_list = list(df['comment_author'])

    # getting user pairs; duplicate authors collapse onto one entry (last embedding wins)
    pair_dict = dict(zip(username_list, embeddings))

    # PCA rows follow pair_dict order, so usernames must come from the same dict;
    # indexing username_list directly is wrong as soon as it contains duplicates
    usernames = list(pair_dict)

    # reducing embeddings to 2 dimensions
    pca = PCA(n_components=2)
    pca_embeddings = pca.fit_transform(list(pair_dict.values()))

    # classifying using kmeans and printing blob
    kmeans = KMeans(n_clusters=2, random_state=0)
    classes = kmeans.fit_predict(pca_embeddings)
    label_color_map = {0: 'r', 1: 'g'}
    label_color = [label_color_map[l] for l in classes]
    plt.scatter(pca_embeddings[:, 0], pca_embeddings[:, 1], c=label_color)

    num_rows = pca_embeddings.shape[0]
    print(f'looping through {num_rows} rows, this may take a few mins...')

    # distance_list[i] is the distance from user i to its farthest user,
    # distance_to_whom_list[i] is the index of that farthest user
    distance_list, distance_to_whom_list = _farthest_neighbours(pca_embeddings)

    # indexes of the n largest distances (ascending); guard n <= 0 because [-0:] is "everything"
    top_n_indexes = np.argsort(distance_list)[-n:] if n > 0 else []

    # getting comments of pairs and printing them
    for a_idx in top_n_indexes:
        username_a = usernames[a_idx]
        username_b = usernames[distance_to_whom_list[a_idx]]

        comments_a = df.loc[df['comment_author'] == username_a]
        comments_b = df.loc[df['comment_author'] == username_b]

        print(f'\n===== Pair between {username_a} and {username_b} =====')
        print(f' == {username_a}:\n{comments_a["short"].values[0]}')
        print(f' == {username_b}:\n{comments_b["short"].values[0]}')

    return

In [None]:
# compare one user's first raw comment with its cleaned + shortened ('short') version
print(df_satis.loc[df_satis['comment_author'] == 'Hob_O_Rarison']['comment_text'].values[0])
print(df_satis.loc[df_satis['comment_author'] == 'Hob_O_Rarison']['short'].values[0])

In [None]:
# pairing embeddings
user_embeddings = pair_users_embeddings(df_satis, satisfactory_embeddings, True)

# doing pca things
pca = PCA(n_components=2)
pca_embeddings = pca.fit_transform(list(user_embeddings.values()))
kmeans = KMeans(n_clusters=2, random_state=0)
classes = kmeans.fit_predict(pca_embeddings)

print(pca_embeddings[456])
print(pca_embeddings[1435])
print(pca_embeddings[1425])
print(pca_embeddings[2599])
print(pca_embeddings[2322])
print(pca_embeddings[2429])

print(list(user_embeddings.keys())[1435])
print(df_satis.loc[df_satis['comment_author'] == list(user_embeddings.keys())[1435]]['short'].values[0])
print(list(user_embeddings.keys())[1425])
print(df_satis.loc[df_satis['comment_author'] == list(user_embeddings.keys())[1425]]['short'].values[0])
print(list(user_embeddings.keys())[2599])
print(df_satis.loc[df_satis['comment_author'] == list(user_embeddings.keys())[2599]]['short'].values[0])
print(list(user_embeddings.keys())[2322])
print(df_satis.loc[df_satis['comment_author'] == list(user_embeddings.keys())[2322]]['short'].values[0])
print(list(user_embeddings.keys())[2429])
print(df_satis.loc[df_satis['comment_author'] == list(user_embeddings.keys())[2429]]['short'].values[0])

In [None]:
# 20 most distant user pairs in the SatisfactoryGame scrape, using the large embeddings
most_distant(df_satis, satisfactory_embeddings_large, n=20, large_embeds=True)

In [None]:
# 20 most distant user pairs in the politics scrape (large_embeds defaults to True)
most_distant(df_politics, politics_embeddings_large, n=20)

In [None]:
# 20 most distant user pairs in the gaming scrape (large_embeds defaults to True)
most_distant(df_gaming, gaming_embeddings_large, n=20)

In [None]:
# 20 most distant user pairs in the EscapefromTarkov scrape (large_embeds defaults to True)
most_distant(df_tarkov, tarkov_embeddings_large, n=20)

In [None]:
# 20 most distant user pairs in the MaraudersGame scrape (large_embeds defaults to True)
most_distant(df_marauders, marauders_embeddings_large, n=20)

## Similar Comments

In [None]:
def find_similar(df, embeddings, n=5, middle_limit=0.1):
    '''
    inputs:
     - df: df to work with (needs 'comment_author' and 'comment_text' columns);
       its 'cleaned_text' and 'short' columns are (re)computed in place
     - embeddings: embeddings to work with, paired with df['comment_author'] row by row
     - n: number of users to print for each of LEFT, RIGHT, TOP and BOTTOM (non-negative)
     - middle_limit: half-width of the square around the origin that counts as MIDDLE

    function finds similar groups of users in different areas of the embeddings space and prints one of their comments:
     - MIDDLE: All users around 0 (+/- middle_limit on both axes) are grouped, the amount of users here can vary
     - LEFT, RIGHT, TOP, BOTTOM: For each of these sides the n users lying farthest in that direction are grouped

    The 2-D PCA projection is also drawn as a scatter plot coloured by a 2-cluster KMeans.

    NOTE(review): the notebook calls this with the *_large embeddings, which
    most_distant() treats as one row per unique author; pairing those with the
    per-comment 'comment_author' column may misalign users and embeddings - confirm.
    '''

    # pairing embeddings (one averaged embedding per user)
    user_embeddings = pair_users_embeddings(df, embeddings, True)
    usernames = list(user_embeddings.keys())

    # doing pca things; row i of pca_embeddings belongs to usernames[i]
    pca = PCA(n_components=2)
    pca_embeddings = pca.fit_transform(list(user_embeddings.values()))
    kmeans = KMeans(n_clusters=2, random_state=0)
    classes = kmeans.fit_predict(pca_embeddings)

    # print blob
    label_color_map = {0: 'r', 1: 'g'}
    label_color = [label_color_map[l] for l in classes]
    plt.scatter(pca_embeddings[:, 0], pca_embeddings[:, 1], c=label_color)

    # cleaning comments to get relevant ones in embedding space; done once up
    # front instead of once per region, since it does not depend on the region
    df['cleaned_text'] = prep_pipeline(df, 'comment_text', loud=False)
    df['short'] = shorten_sens(df['cleaned_text'], 50)

    x_coords = pca_embeddings[:, 0]
    y_coords = pca_embeddings[:, 1]

    # argsort gives row indexes in ascending coordinate order
    x_order = np.argsort(x_coords)
    y_order = np.argsort(y_coords)

    # strictly inside the square (-middle_limit, middle_limit) on both axes
    in_middle = (np.abs(x_coords) < middle_limit) & (np.abs(y_coords) < middle_limit)

    regions = {
        'MIDDLE': np.flatnonzero(in_middle),
        'LEFT': x_order[:n],          # smallest x first
        'RIGHT': x_order[::-1][:n],   # largest x first
        'TOP': y_order[::-1][:n],     # largest y first
        'BOTTOM': y_order[:n],        # smallest y first
    }

    for region, similar_indexes in regions.items():
        print(f'========== {region} ==========')

        # using the similar indexes, matches with users and prints their first shortened comment
        for index in similar_indexes:
            username = usernames[index]
            comments = df.loc[df['comment_author'] == username]
            print(f'{username}:')
            print(comments['short'].values[0], '\n')

In [None]:
# region-by-region sample comments for the politics scrape
find_similar(df_politics, politics_embeddings_large)

In [None]:
# region-by-region sample comments for the gaming scrape
find_similar(df_gaming, gaming_embeddings_large)

In [None]:
# region-by-region sample comments for the MaraudersGame scrape
find_similar(df_marauders, marauders_embeddings_large)

In [None]:
# region-by-region sample comments for the EscapefromTarkov scrape
find_similar(df_tarkov, tarkov_embeddings_large)

In [None]:
# region-by-region sample comments for the SatisfactoryGame scrape
find_similar(df_satis, satisfactory_embeddings_large)