In [None]:
# imports for using the notebook

import pandas as pd
import sys
sys.path.append('../')
from embedding_functions_hugo.embedding_functions import *
from sklearn.preprocessing import StandardScaler
import numpy as np
from numpy import genfromtxt
import nltk
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import seaborn as sns

In [None]:
# Specifying datasets from different reddit pages
# Only the politics scrape is loaded here; the other scrapes are kept as
# commented-out alternatives.

# df_gaming = pd.read_csv('../data/scrapes/gaming.csv')
# df_satis = pd.read_csv('../data/scrapes/SatisfactoryGame.csv')
# df_marauders = pd.read_csv('../data/scrapes/MaraudersGame.csv')
# df_tarkov = pd.read_csv('../data/scrapes/EscapefromTarkov.csv')
df_politics = pd.read_csv('../data/scrapes/politics.csv')


# Datasets post cleaning the text
# (disabled: df_politics therefore keeps only the columns present in the CSV)
# df_politics['cleaned_text'] = prep_pipeline(df_politics, 'comment_text')
# df_politics['short'] = shorten_sens(df_politics['cleaned_text'], 50)

# Function to speed up the process: 

def shorten_and_clean_dataset (comment_csv, comment_column : str, desired_comment_length : int):
    '''
    Read a comment CSV and attach a cleaned and a shortened text column.

    comment_csv is handed to ``pd.read_csv``. The result of
    ``prep_pipeline(frame, comment_column)`` is stored as ``'cleaned_text'``
    and the result of ``shorten_sens`` on that column is stored as ``'short'``.
    Both helpers come from the project's embedding_functions module;
    presumably ``desired_comment_length`` is a maximum length per comment --
    TODO confirm the unit against ``shorten_sens``.

    Returns the dataframe with the two added columns.
    '''
    comments = pd.read_csv(comment_csv)

    # Cleaning runs on the frame as read from disk, before any column is added.
    cleaned_column = prep_pipeline(comments, comment_column)
    comments['cleaned_text'] = cleaned_column

    # Shortening works on the stored column, not on the raw helper output.
    comments['short'] = shorten_sens(comments['cleaned_text'], desired_comment_length)

    return comments

In [None]:
# sen_leng = []
# for i in df_politics['short']:
#     sen_leng.append(len(i.split()))


# print(np.percentile(sen_leng, 25))
# print(np.percentile(sen_leng, 50))
# print(np.percentile(sen_leng, 75))
# print(np.percentile(sen_leng, 99))
# print(np.mean(sen_leng))
# print(np.median(sen_leng))

In [None]:
# authors = df_politics.values[:,-2]

In [None]:
def save_embeddings_as_npy(destination_path : str, comment_csv, comment_column : str, desired_comment_length : int):
    '''
    Clean, shorten and embed the comments of a scraped CSV, then save the embeddings as a .npy file.

    Parameters
    ----------
    destination_path : str
        File the embeddings are written to with ``np.save`` (numpy appends
        ``.npy`` when the name does not already end with it).
    comment_csv
        CSV location, forwarded to ``shorten_and_clean_dataset``.
    comment_column : str
        Name of the column that holds the raw comment text.
    desired_comment_length : int
        Length limit forwarded to ``shorten_and_clean_dataset``.

    Returns
    -------
    None
        The result is the file written to ``destination_path``.
    '''
    sentences = shorten_and_clean_dataset(comment_csv, comment_column, desired_comment_length)
    # Only the shortened text is embedded; embed_comments comes from the
    # project's embedding_functions module.
    embeddings = embed_comments(sentences['short'])
    return np.save(destination_path, embeddings)

### UNCOMMENT BELOW TO DO EMBEDDINGS AND SAVE THEM
# NOTE(review): the MaraudersGame line is currently active, so running this
# cell recomputes those embeddings and writes the .npy file again every time.
# Comment it out once the file exists if that is not intended.

# save_embeddings_as_npy('../data/embeddings/politics_embeddings.npy', '../data/scrapes/politics.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/gaming_embeddings.npy', '../data/scrapes/gaming.csv', 'comment_text', 50)
save_embeddings_as_npy('../data/embeddings/marauders_embeddings.npy', '../data/scrapes/MaraudersGame.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/tarkov_embeddings.npy', '../data/scrapes/EscapefromTarkov.csv', 'comment_text', 50)
# save_embeddings_as_npy('../data/embeddings/satisfactory_embeddings.npy', '../data/scrapes/SatisfactoryGame.csv', 'comment_text', 50)

In [None]:
def pair_users_embeddings(dataframe, embeddings, average_out_comments = False):
    '''
    Group comment embeddings by the author of each comment.

    Rows of ``dataframe['comment_author']`` and items of ``embeddings`` are
    paired positionally with ``zip``, so both must be in the same order; if
    their lengths differ, the surplus items of the longer one are ignored.

    Parameters
    ----------
    dataframe
        Object whose ``'comment_author'`` entry yields one author per comment.
    embeddings
        Iterable with one embedding per comment.
    average_out_comments : bool
        When False (default) each author maps to the list of their embeddings.
        When True each author maps to the element-wise mean of their
        embeddings, computed as ``sum(...) / count``.

    Returns
    -------
    dict
        Author -> list of embeddings, or author -> mean embedding.
    '''
    embeddings_by_author = {}
    for writer, vector in zip(dataframe['comment_author'], embeddings):
        # setdefault creates the list on an author's first comment only.
        embeddings_by_author.setdefault(writer, []).append(vector)

    if not average_out_comments:
        return embeddings_by_author

    return {
        writer: sum(vectors) / len(vectors)
        for writer, vectors in embeddings_by_author.items()
    }

In [None]:
# LOADING EMBEDDINGS FROM FILES
# Each file must already exist on disk (see save_embeddings_as_npy);
# np.load fails otherwise.

politics_embeddings = np.load('../data/embeddings/politics_embeddings.npy')
gaming_embeddings = np.load('../data/embeddings/gaming_embeddings.npy')
marauders_embeddings = np.load('../data/embeddings/marauders_embeddings.npy')
tarkov_embeddings = np.load('../data/embeddings/tarkov_embeddings.npy')

# include below when the satisfactory embeddings are done
# satisfactory_embeddings = np.load('../data/embeddings/satisfactory_embeddings.npy')

In [None]:
# One averaged embedding per politics comment author.
politics_user_embeddings = pair_users_embeddings(
    df_politics,
    politics_embeddings,
    average_out_comments=True,
)

In [None]:
# Project the per-user embeddings onto two principal components.
pca = PCA(n_components=2)
two_dimensional_embeddings = pca.fit_transform(list(politics_user_embeddings.values()))

# Cluster the projected users into two groups; random_state is fixed so
# re-runs give the same assignment.
kmeans = KMeans(n_clusters=2, random_state=0)

# classes holds one cluster label per user, in the dictionary's iteration order.
classes = kmeans.fit_predict(two_dimensional_embeddings)

In [None]:
def reduce_dims_and_kmeans(user_embedding_pairs, num_of_dimensions, num_of_clusters = 2):
    '''
    Reduce user embeddings with PCA, cluster them with KMeans and scatter-plot the result.

    Parameters
    ----------
    user_embedding_pairs : dict
        Mapping whose values are one embedding per user; only the values are used.
    num_of_dimensions
        Passed to ``PCA(n_components=...)``. The plot uses the first two
        components, so the reduction must yield at least two of them.
    num_of_clusters : int
        Number of KMeans clusters (default 2, the previously hard-coded value).

    Returns
    -------
    None
        The scatter plot is drawn on the current matplotlib axes.

    Raises
    ------
    IndexError
        If the PCA output has fewer than two components to plot.
    '''
    # Set PCA to desired number of dimensions
    pca = PCA(n_components=num_of_dimensions)
    pca_embeddings = pca.fit_transform(list(user_embedding_pairs.values()))

    # Fixed random_state keeps the cluster assignment reproducible across runs.
    kmeans = KMeans(n_clusters=num_of_clusters, random_state=0)
    classes = kmeans.fit_predict(pca_embeddings)

    if num_of_clusters <= 2:
        # Keep the original red/green colouring for the one- and two-cluster case.
        label_color_map = {0 : 'r', 1 : 'g'}
        label_color = [label_color_map[label] for label in classes]
        plt.scatter(pca_embeddings[:,0], pca_embeddings[:,1], c=label_color)
    else:
        # More clusters than named colours: let a qualitative colormap pick
        # a colour per cluster label.
        plt.scatter(pca_embeddings[:,0], pca_embeddings[:,1], c=classes, cmap='tab10')

In [None]:
# Plot the politics users in two PCA dimensions, coloured by cluster.
reduce_dims_and_kmeans(politics_user_embeddings, num_of_dimensions=2)