In [None]:
# imports for using the notebook

import pandas as pd
import sys
sys.path.append('../')
from embedding_functions_hugo.embedding_functions import *
from sklearn.preprocessing import StandardScaler
import numpy as np
from numpy import genfromtxt
import nltk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans, AgglomerativeClustering, FeatureAgglomeration
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# Specifying datasets from different reddit pages

# One DataFrame per scraped subreddit; the order of the paths below matches the
# order of the names on the left-hand side.
df_gaming, df_satis, df_marauders, df_tarkov, df_politics = map(
    pd.read_csv,
    (
        '../data/scrapes/gaming.csv',
        '../data/scrapes/SatisfactoryGame.csv',
        '../data/scrapes/MaraudersGame.csv',
        '../data/scrapes/EscapefromTarkov.csv',
        '../data/scrapes/politics.csv',
    ),
)


# Datasets post cleaning the text
# df_politics['cleaned_text'] = prep_pipeline(df_politics, 'comment_text')
# df_politics['short'] = shorten_sens(df_politics['cleaned_text'], 50)

# Function to speed up the process: 

def shorten_and_clean_dataset(comment_csv, comment_column: str, desired_comment_length: int):
    '''
    Load a comment scrape from CSV and attach cleaned and shortened text columns.

    comment_csv is passed straight to pd.read_csv. The column named by
    comment_column is run through prep_pipeline and stored as 'cleaned_text';
    that column is then passed to shorten_sens together with
    desired_comment_length and the result is stored as 'short'.

    Returns the loaded DataFrame with the two extra columns.
    '''
    comments = pd.read_csv(comment_csv)
    cleaned = prep_pipeline(comments, comment_column)
    comments['cleaned_text'] = cleaned
    comments['short'] = shorten_sens(comments['cleaned_text'], desired_comment_length)
    return comments

In [None]:
# sen_leng = []
# for i in df_politics['short']:
#     sen_leng.append(len(i.split()))


# print(np.percentile(sen_leng, 25))
# print(np.percentile(sen_leng, 50))
# print(np.percentile(sen_leng, 75))
# print(np.percentile(sen_leng, 99))
# print(np.mean(sen_leng))
# print(np.median(sen_leng))

In [None]:
# authors = df_politics.values[:,-2]

In [None]:
def save_embeddings_as_npy(destination_path : str, comment_csv, comment_column : str, desired_comment_length : int):
    '''
    Embed the comments of one scrape and store the embeddings as a NumPy .npy file.

    The CSV at comment_csv is loaded, cleaned and shortened by
    shorten_and_clean_dataset; the resulting 'short' column is embedded with
    embed_comments (presumably sentence-transformer embeddings - confirm in
    embedding_functions) and the result is written with np.save.

    Parameters:
        destination_path: where to write the embeddings. np.save appends '.npy'
            if the path does not already end with it.
        comment_csv: path (or anything pd.read_csv accepts) of the comment scrape.
        comment_column: name of the column holding the raw comment text.
        desired_comment_length: length limit forwarded to shorten_sens.

    Returns:
        None. The only result is the file written to destination_path.
    '''
    sentences = shorten_and_clean_dataset(comment_csv, comment_column, desired_comment_length)
    embeddings = embed_comments(sentences['short'])
    np.save(destination_path, embeddings)
    return None

### UNCOMMENT BELOW TO DO EMBEDDINGS AND SAVE THEM

# save_embeddings_as_npy('../data/embeddings/politics_embeddings.npy', '../data/scrapes/politics.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/gaming_embeddings.npy', '../data/scrapes/gaming.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/marauders_embeddings.npy', '../data/scrapes/MaraudersGame.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/tarkov_embeddings.npy', '../data/scrapes/EscapefromTarkov.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/satisfactory_embeddings.npy', '../data/scrapes/SatisfactoryGame.csv', 'comment_text', 50)

In [None]:
def pair_users_embeddings(dataframe, embeddings, average_out_comments=False):
    '''
    Group comment embeddings by the author of the comment.

    Rows are paired purely by position: the i-th value of
    dataframe['comment_author'] is matched with the i-th item of embeddings.
    The dataframe must therefore be in the same row order as the data the
    embeddings were computed from. If the two inputs differ in length, the
    surplus items of the longer one are silently ignored (zip semantics).

    Parameters:
        dataframe: DataFrame with a 'comment_author' column.
        embeddings: iterable of embedding vectors, one per dataframe row.
        average_out_comments: when True, each author's embeddings are reduced
            to their element-wise mean.

    Returns:
        dict keyed by author, in order of first appearance. Values are lists of
        embeddings, or a single averaged embedding when average_out_comments
        is True.
    '''
    user_dictionary = {}
    for author, embedded_comment in zip(dataframe['comment_author'], embeddings):
        user_dictionary.setdefault(author, []).append(embedded_comment)

    if average_out_comments:
        # Every list holds at least one embedding, so the division is safe.
        return {
            user: sum(user_comments) / len(user_comments)
            for user, user_comments in user_dictionary.items()
        }
    return user_dictionary

In [None]:
# LOADING EMBEDDINGS FROM FILES

# Precomputed comment embeddings, one array per subreddit; the order of the
# paths below matches the order of the names on the left-hand side.
politics_embeddings, gaming_embeddings, marauders_embeddings, tarkov_embeddings = map(
    np.load,
    (
        '../data/embeddings/politics_embeddings.npy',
        '../data/embeddings/gaming_embeddings.npy',
        '../data/embeddings/marauders_embeddings.npy',
        '../data/embeddings/tarkov_embeddings.npy',
    ),
)

# include below when the satisfactory embeddings are done
# satisfactory_embeddings = np.load('../data/embeddings/satisfactory_embeddings.npy')

In [None]:
# One averaged embedding per r/politics commenter.
politics_user_embeddings = pair_users_embeddings(
    df_politics,
    politics_embeddings,
    average_out_comments=True,
)

In [None]:
# pca = PCA(n_components=2)
# two_dimensional_embeddings = pca.fit_transform(list(politics_user_embeddings.values()))

# kmeans = KMeans(n_clusters=2)

# classes = kmeans.fit_predict(two_dimensional_embeddings)

In [None]:
def reduce_dims_and_kmeans(user_embedding_pairs, num_of_dimensions):
    '''
    Reduce per-user embeddings with PCA, split them into two KMeans clusters and scatter-plot them.

    Clustering uses all num_of_dimensions PCA components; the plot shows only
    the first two components (x and y), coloured red/green by cluster.
    The colour map is hard-coded for exactly two clusters.

    Parameters:
        user_embedding_pairs: dict whose values are one embedding vector per
            user (e.g. the output of pair_users_embeddings with averaging on).
        num_of_dimensions: number of PCA components to keep; must be >= 2
            because two components are needed for the scatter plot.

    Returns:
        None. The scatter plot is drawn on the current matplotlib axes.

    Raises:
        ValueError: if num_of_dimensions is smaller than 2.
    '''
    # Fail before the PCA/KMeans fits instead of with an IndexError when plotting.
    if num_of_dimensions < 2:
        raise ValueError(
            'num_of_dimensions must be at least 2 to draw the scatter plot, got '
            + str(num_of_dimensions)
        )

    pca = PCA(n_components=num_of_dimensions)
    pca_embeddings = pca.fit_transform(list(user_embedding_pairs.values()))

    # Fixed random_state keeps the cluster assignment reproducible between runs.
    kmeans = KMeans(n_clusters=2, random_state=0)
    classes = kmeans.fit_predict(pca_embeddings)

    label_color_map = {0 : 'r', 1 : 'g'}
    label_color = [label_color_map[label] for label in classes]
    plt.scatter(pca_embeddings[:,0], pca_embeddings[:,1], c=label_color)

In [None]:
# Scatter plot of the r/politics commenters in 2-D PCA space, coloured by cluster.
reduce_dims_and_kmeans(politics_user_embeddings, num_of_dimensions=2)

In [None]:
def reduce_to_one_dimension_kmeans(user_embedding_pairs):
    '''
    Project every user's embedding onto its first principal component.

    Despite the name, no clustering is performed here: the function returns
    only the user keys and their 1-D PCA projection, so no KMeans model is
    fitted.

    Parameters:
        user_embedding_pairs: dict whose values are one embedding vector per user.

    Returns:
        tuple (users, pca_embeddings) where users is the dict's keys view and
        pca_embeddings is the PCA output with a single component per user,
        in the same order as the keys.
    '''
    pca = PCA(n_components=1)
    pca_embeddings = pca.fit_transform(list(user_embedding_pairs.values()))
    return (user_embedding_pairs.keys(), pca_embeddings)

def sortie(beb):
    # Sort key: the first item of a [username, projection] pair, i.e. the username.
    return beb[0]

# Averaged embedding per user, then its 1-D PCA projection.
politics_user_embeddings = pair_users_embeddings(
    df_politics,
    politics_embeddings,
    average_out_comments=True,
)
Squeem = reduce_to_one_dimension_kmeans(politics_user_embeddings)

# [username, projection] pairs ordered alphabetically by the username string.
x_axis = sorted(
    ([str(name), emb] for name, emb in zip(*Squeem)),
    key=sortie,
)
len(x_axis)

In [None]:
# Test for making new dataframe with columns: 
# should be exportable to networkx - preferably edgelist
# Author (node_id),pca_x-axis , post_id

# Empty containers for the three planned columns (author, PCA x-axis value, post id).
authors, x_axis, posts = [], [], []

# df_politics = df_politics.drop('post_text', axis=1).dropna()
df_politics

In [None]:
# Build parallel lists of comment authors and ids, ordered by author name.
#
# The sorted frame gets its own name on purpose: rebinding `df_politics` to the
# sorted copy changes its row order, after which its rows no longer line up
# with the rows of `politics_embeddings`. Every later
# `pair_users_embeddings(df_politics, politics_embeddings, ...)` call
# (including the one inside `find_similar`) pairs rows by position and would
# then attach embeddings to the wrong authors.
df_politics_sorted = df_politics.sort_values(by='comment_author')

# Columns are selected by position with .iloc, matching the positions the loop
# version read from each row (integer keys on a labelled Series are deprecated).
# NOTE(review): positions 2 and 6 are presumably the post id and the comment
# author - confirm against df_politics.columns.
# Assigning (rather than appending) keeps the cell idempotent on re-run.
posts = df_politics_sorted.iloc[:, 2].tolist()
authors = df_politics_sorted.iloc[:, 6].tolist()
print(authors, posts)

### Comment inspection with regard to distance from each other

#### quick prep

In [None]:
# Project the per-user embeddings onto their first two principal components.
# Row i of pca_embeddings corresponds to the i-th key of politics_user_embeddings,
# because the values are passed in dict order.
pca = PCA(n_components=2)


pca_embeddings = pca.fit_transform(list(politics_user_embeddings.values()))

# Split the projected users into two clusters; the fixed random_state keeps the
# assignment reproducible between runs.
kmeans = KMeans(n_clusters=2, random_state=0)

# classes[i] is the cluster label assigned to row i of pca_embeddings.
classes = kmeans.fit_predict(pca_embeddings)

#### horizontally distant

In [None]:
#print(pca_embeddings.shape)
#print(len(politics_user_embeddings.keys()), len(politics_user_embeddings.values()))

# finding indexes of rows with least and max x values

# First PCA coordinate (horizontal position in the scatter plot) of every user.
x_vals = [point[0] for point in pca_embeddings]

# User with the smallest x value. Row i of pca_embeddings belongs to the i-th
# key of politics_user_embeddings, so the argmin position indexes the key list.
least_x_index = np.argmin(x_vals)
least_x = min(x_vals)
least_x_username = [*politics_user_embeddings][least_x_index]
least_x_comments = df_politics[df_politics['comment_author'] == least_x_username]

# User with the largest x value.
max_x_index = np.argmax(x_vals)
max_x = max(x_vals)
max_x_username = [*politics_user_embeddings][max_x_index]
max_x_comments = df_politics[df_politics['comment_author'] == max_x_username]

# Show the first stored comment of each of the two users, separated by a blank line.
print(
    least_x_comments['comment_text'].values[0],
    '',
    max_x_comments['comment_text'].values[0],
    sep='\n',
)

#### vertically distant

In [None]:
#print(pca_embeddings.shape)
#print(len(politics_user_embeddings.keys()), len(politics_user_embeddings.values()))

# finding indexes of rows with least and max y values

# Second PCA coordinate (vertical position in the scatter plot) of every user.
y_vals = [point[1] for point in pca_embeddings]

# User with the smallest y value. Row i of pca_embeddings belongs to the i-th
# key of politics_user_embeddings, so the argmin position indexes the key list.
least_y_index = np.argmin(y_vals)
least_y = min(y_vals)
least_y_username = [*politics_user_embeddings][least_y_index]
least_y_comments = df_politics[df_politics['comment_author'] == least_y_username]

# User with the largest y value.
max_y_index = np.argmax(y_vals)
max_y = max(y_vals)
max_y_username = [*politics_user_embeddings][max_y_index]
max_y_comments = df_politics[df_politics['comment_author'] == max_y_username]

# Show the first stored comment of each of the two users, separated by a blank line.
print(
    least_y_comments['comment_text'].values[0],
    '',
    max_y_comments['comment_text'].values[0],
    sep='\n',
)

#### most distant (in progress; likely very expensive to compute if all pairwise distances are checked)

#### similar comments (in progress)

In [None]:
def find_similar(df, embeddings, n_extreme=5, middle_half_width=0.1, comment_length=50):
    '''
    Plot users in 2-D PCA space and print sample comments from five regions of the plot.

    Each user's comment embeddings are averaged, projected onto two principal
    components and clustered into two KMeans groups (used only for colouring
    the scatter plot). For each of the regions MIDDLE, LEFT, RIGHT, TOP and
    BOTTOM the function then prints a header followed by the first shortened
    comment of every user selected for that region.

    Parameters:
        df: DataFrame with 'comment_author' and 'comment_text' columns. Its rows
            must be in the same order as the rows of `embeddings`, because
            authors and embeddings are paired by position. The frame is
            modified in place: 'cleaned_text' and 'short' columns are added
            (or overwritten).
        embeddings: one embedding per row of df.
        n_extreme: how many users to take from each edge of the plot
            (LEFT/RIGHT/TOP/BOTTOM).
        middle_half_width: a user counts as MIDDLE when both PCA coordinates
            lie strictly between -middle_half_width and +middle_half_width.
        comment_length: length limit forwarded to shorten_sens.

    Returns:
        None. Output is the scatter plot on the current matplotlib axes and
        the printed comments.
    '''
    # One averaged embedding per user. Dict order defines the row order of the
    # PCA output, so position i in pca_embeddings belongs to usernames[i].
    user_embeddings = pair_users_embeddings(df, embeddings, True)
    usernames = list(user_embeddings.keys())

    pca = PCA(n_components=2)
    pca_embeddings = pca.fit_transform(list(user_embeddings.values()))
    kmeans = KMeans(n_clusters=2, random_state=0)
    classes = kmeans.fit_predict(pca_embeddings)

    # Scatter plot coloured by cluster (colour map covers exactly two clusters).
    label_color_map = {0 : 'r', 1 : 'g'}
    label_color = [label_color_map[label] for label in classes]
    plt.scatter(pca_embeddings[:,0], pca_embeddings[:,1], c=label_color)

    # Clean and shorten the comments once; this does not depend on the region,
    # so it is done before the region loop rather than once per region.
    df['cleaned_text'] = prep_pipeline(df, 'comment_text')
    df['short'] = shorten_sens(df['cleaned_text'], comment_length)

    x_coords = pca_embeddings[:, 0]
    y_coords = pca_embeddings[:, 1]

    # argsort yields row positions ordered by ascending coordinate value.
    x_order = np.argsort(x_coords)
    y_order = np.argsort(y_coords)

    in_middle = (np.abs(x_coords) < middle_half_width) & (np.abs(y_coords) < middle_half_width)

    # Row positions of the selected users per region, in print order.
    regions = {
        'MIDDLE': np.flatnonzero(in_middle),    # everyone inside the central square
        'LEFT': x_order[:n_extreme],            # smallest x first
        'RIGHT': x_order[::-1][:n_extreme],     # largest x first
        'TOP': y_order[::-1][:n_extreme],       # largest y first
        'BOTTOM': y_order[:n_extreme],          # smallest y first
    }

    for region, similar_indexes in regions.items():
        print(f'========== {region} ==========')
        for index in similar_indexes:
            comments = df.loc[df['comment_author'] == usernames[index]]
            # Only the first comment of the user is shown.
            print(comments['short'].values[0], '\n')

In [None]:
# Display the column labels of the politics DataFrame.
df_politics.columns

In [None]:
# Region-by-region sample comments for the politics scrape.
find_similar(df=df_politics, embeddings=politics_embeddings)

In [None]:
# Region-by-region sample comments for the gaming scrape.
find_similar(df=df_gaming, embeddings=gaming_embeddings)

In [None]:
# Region-by-region sample comments for the MaraudersGame scrape.
find_similar(df=df_marauders, embeddings=marauders_embeddings)