### Grab a chunk

In [4]:
import pandas as pd

# Load the existing DataFrame
df = pd.read_csv('rpg/rpg_small_preprocessed.csv')

# Define the number of rows you want to sample
X = 1000  # Replace 10 with the number of rows you want to sample

# Sample X number of rows from the DataFrame
df_sample = df.sample(n=X)

# Save the sampled DataFrame to a new Excel file in the same directory
df_sample.to_csv('rpg/analysis/rpg_small_mini.csv', index=False)


  df = pd.read_csv('rpg/rpg_small_preprocessed.csv')


### Split in two

In [1]:
import pandas as pd

# Define file paths
data_path = 'rpg/rpg_small_processed.csv'

# Read the CSV file
data = pd.read_csv(data_path)

# Get the total number of rows
total_rows = len(data)

# Calculate the split point
split_point = total_rows // 2

# Split the data into two parts
data_part1 = data.iloc[:split_point]
data_part2 = data.iloc[split_point:]

# Define output file paths
output_path1 = 'rpg/rpg_small_processed_part1.csv'
output_path2 = 'rpg/rpg_small_processed_part2.csv'

# Save the split data to new CSV files
data_part1.to_csv(output_path1, index=False)
data_part2.to_csv(output_path2, index=False)

  data = pd.read_csv(data_path)


## Make a model

In [3]:
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
from sentence_transformers import SentenceTransformer
import collections
from tqdm import tqdm
from scipy.cluster import hierarchy as sch
from nltk.tokenize import sent_tokenize
import numpy as np


# Define file paths
data_path = 'attachment/attach_short.csv'
embeddings_path = 'attachment/short/models/attach_short_embeddings.npy'
model_save_path = 'attachment/short/models/attach_short_model_dir/'

# Load the data
df = pd.read_csv(data_path, usecols=['text'], low_memory=False)

# Specify what the 'docs' are
docs = df['text'].tolist()

# Load the embeddings
embedding_model = SentenceTransformer('thenlper/gte-large')
embeddings = np.load(embeddings_path)

###### Extract vocab to be used in BERTopic
vocab = collections.Counter()
tokenizer = CountVectorizer(ngram_range=(1, 2)).build_tokenizer()
for doc in tqdm(docs):
    vocab.update(tokenizer(doc))
vocab = [word for word, frequency in vocab.items() if frequency >= 30]; len(vocab)


umap_model = UMAP(
        n_components=3,  # has a wild impact hard to predict
        n_neighbors=20,  # Higher is a more gloabl strcture
        min_dist=0.01,   # Lower value means more dense packing
        random_state=42, # Reproducability
        metric="cosine", # have to pick something
        n_jobs=-1        # speed
        )

hdbscan_model = HDBSCAN(
            min_cluster_size=15,           # smallest size group considered
            min_samples=10,               # larger is more conservative - more noise
            # leaf_size=40,                   # number of points per leaf node in the tree - default 40
            gen_min_span_tree=True,        # True creates minimum spanning trees - increasing RAM
            prediction_data=True,           # generates extra cached data of prediction labels for new data or reuse
            cluster_selection_method='eom', # eom is normal - leaf might get more homogeneous clusters
            # cluster_selection_epsilon=0.0,  # default - merges clusters below threshold
            core_dist_n_jobs=-1,            # For speed
            )

topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    verbose=True)
topics, probs = topic_model.fit_transform(docs, embeddings)

topic_model.save(model_save_path, serialization="safetensors", save_ctfidf=True, save_embedding_model=embedding_model)



100%|██████████| 1992/1992 [00:00<00:00, 1992619.50it/s]
2024-05-07 08:56:53,615 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-07 08:57:02,542 - BERTopic - Dimensionality - Completed ✓
2024-05-07 08:57:02,542 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-07 08:57:02,582 - BERTopic - Cluster - Completed ✓
2024-05-07 08:57:02,589 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-07 08:57:02,616 - BERTopic - Representation - Completed ✓


In [None]:
topic_model.get_topic_info()

# Random - ish

### Go through a folder system and pull out '.xyz' file types

In [2]:
import os
import shutil

# Define the directory path you want to search
directory_path = 'C:/Users/snake/OneDrive/grad_school'

# Define the file extensions to include
include_extensions = ('.txt', '.doc', '.docx')

# Define the destination directory
destination_directory = "C:/Users/snake/Downloads"

# Function to copy files to the destination directory
def copy_files_to_directory(file_list, destination):
    for file in file_list:
        shutil.copy(file, destination)

# Walk through the directory
for subdir, dirs, files in os.walk(directory_path):
    # Filter files with the desired extensions
    text_files = [os.path.join(subdir, file) for file in files if file.endswith(include_extensions)]
    
    # Copy the filtered files to the destination directory
    copy_files_to_directory(text_files, destination_directory)
    print(f"Copied {len(text_files)} files to {destination_directory}")

print("File copying process completed.")


Copied 2 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 2 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 7 files to C:/Users/snake/Downloads
Copied 2 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 5 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 4 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 3 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 1 files to C:/Users/snake/Downloads
Copied 0 files to C:/Users/snake/Downloads
Copied 2 files to C:/Users/snake/Downloads
Copied 0 fi

### Removes duplicates in a directory

In [3]:
import os

def find_and_remove_duplicates(directory_path):
    # Create a dictionary to store filenames and their sizes
    file_dict = {}

    # Walk through the directory
    for subdir, _, files in os.walk(directory_path):
        for file in files:
            file_path = os.path.join(subdir, file)
            file_size = os.path.getsize(file_path)
            if file in file_dict:
                # Compare sizes and retain the largest file
                if file_size > file_dict[file][1]:
                    os.remove(file_dict[file][0])  # Delete the smaller file
                    file_dict[file] = (file_path, file_size)
                else:
                    os.remove(file_path)  # Delete the current file
            else:
                file_dict[file] = (file_path, file_size)

if __name__ == "__main__":
    target_directory = "C:/Users/snake/Downloads"
    find_and_remove_duplicates(target_directory)
    print("Duplicate files removed successfully.")


Duplicate files removed successfully.


### Converts .txt .doc and .docx files to .md

In [3]:
import pypandoc
from pathlib import Path

# Define the directory path containing the files
directory_path = 'C:/Users/snake/Downloads'

# Function to convert .docx to .md
def convert_docx_to_md(docx_path, md_path):
    pypandoc.convert_file(docx_path, 'md', outputfile=md_path)
    
# Function to convert .txt to .md
def convert_txt_to_md(txt_path, md_path):
    os.rename(txt_path, md_path)


# Walk through the directory and convert files
for subdir, dirs, files in os.walk(directory_path):
    for file in files:
        try:
            file_path = os.path.join(subdir, file)
            md_path = os.path.splitext(file_path)[0] + '.md'
            if file.lower().endswith('.docx'):
                convert_docx_to_md(file_path, md_path)
            elif file.lower().endswith('.txt'):
                convert_txt_to_md(file_path, md_path)
        except Exception as e:
            print(f'Failed to convert {file}: {e}')


Failed to convert Transcription.docx: Pandoc died with exitcode "63" during conversion: couldn't unpack docx container: Did not find end of central directory signature



## Combine LIWC Variables

In [2]:
import pickle
import pandas as pd
from bertopic import BERTopic

liwc_df = pd.read_csv('attachment/attach_processed_length10_LIWC.csv')

data_save_path = "attachment/doc/models/attach_doc1_data.pkl"

with open(data_save_path, "rb") as file:
    df = pickle.load(file)
    
combined_df = pd.concat([df, liwc_df], axis=1)

with open(data_save_path, "wb") as file:
    pickle.dump(combined_df, file)

In [3]:
import pandas as pd
import pickle


LIWC_save_path = "attachment/doc/models/attach_doc1_data.pkl"

with open(LIWC_save_path, "rb") as file:
    df = pickle.load(file)
    
print(df.columns.tolist())

['id', 'link_id', 'author', 'created_utc', 'subreddit', 'body', 'score', 'all_awardings', 'gildings', 'total_awards_received', 'author_flair_text', 'author_flair_richtext', 'text', 'permalink', 'title', 'selftext', 'treatment_tags', 'link_flair_text', 'link_flair_richtext', 'topics', 'Segment', 'WC', 'Analytic', 'Clout', 'Authentic', 'Tone', 'WPS', 'BigWords', 'Dic', 'Linguistic', 'function', 'pronoun', 'ppron', 'i', 'we', 'you', 'shehe', 'they', 'ipron', 'det', 'article', 'number', 'prep', 'auxverb', 'adverb', 'conj', 'negate', 'verb', 'adj', 'quantity', 'Drives', 'affiliation', 'achieve', 'power', 'Cognition', 'allnone', 'cogproc', 'insight', 'cause', 'discrep', 'tentat', 'certitude', 'differ', 'memory', 'Affect', 'tone_pos', 'tone_neg', 'emotion', 'emo_pos', 'emo_neg', 'emo_anx', 'emo_anger', 'emo_sad', 'swear', 'Social', 'socbehav', 'prosocial', 'polite', 'conflict', 'moral', 'comm', 'socrefs', 'family', 'friend', 'female', 'male', 'Culture', 'politic', 'ethnicity', 'tech', 'Lifest