In [1]:
import copy
import math
import matplotlib.pyplot as plt
import os
import re
import subprocess
import sys
import pandas as pd
import pickle
import time
from wordcloud import WordCloud

from ast import literal_eval
from io import StringIO
from IPython.display import display, clear_output
from matplotlib.ticker import FuncFormatter
import matplotlib.ticker as ticker

# to do
# 1. (done: run_gufi_select and run_gufi_aggregate now pass -d ' ') add -d ' ' to gufi_query so it uses space as delimiter
# 2. don't convert ranges to human-friendly until plot time; then remove the stupid code we added to revert them back
# 3. instead of just showing some mean values as a function of depth, try to show distributions at each depth
#    this might be something like one graph per file-system with a different line per depth or something like that
# 4. make the cumulative graphs for the depths

In [2]:
# Candidate GUFI index roots; one of them is selected below as the tree to query.
indexroots = ["/mnt/nvme3n1/jbent/scr4/", 
              "/mnt/nvme3n1/jbent/yellprojs/", 
              "/mnt/nvme3n1/jbent/ttscratch/", 
              "/mnt/nvme3n1/jbent/yellusers", 
              "/mnt/nvme3n1/jbent/anony"]
# Index root handed to gufi_query by the query helpers in this notebook.
testir = indexroots[0]  
# Alternative index root, kept for switching by hand.
#testir = '/mnt/nvme3n1/jbent/jbent_home/'
# Value passed to gufi_query's -n option.
nthreads = 224

In [3]:
def run_gufi_command(command,Verbose):
    """Run a command and return its standard output as text.

    Parameters
    ----------
    command : list of str
        argv-style command line; it is executed directly, without a shell.
    Verbose : bool
        When true, print the command before running it and the elapsed
        wall-clock time after a successful run.

    Returns
    -------
    str or None
        The decoded stdout with every U+001A character replaced by a space,
        or None when the command exits with a non-zero status (its stderr is
        printed in that case).
    """
    if Verbose:
        print(' '.join(command))

    # perf_counter is monotonic, so the measurement is immune to wall-clock adjustments
    start_time = time.perf_counter()
    completed_process = subprocess.run(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    elapsed = time.perf_counter() - start_time

    if completed_process.returncode != 0:
        print(f"An error occurred:\n{completed_process.stderr.decode('utf-8', errors='replace')}")
        return None

    # The output may contain bytes that are not valid UTF-8 (e.g. from file names);
    # substitute U+FFFD for them instead of raising UnicodeDecodeError.
    output = completed_process.stdout.decode('utf-8', errors='replace')
    output = output.replace('\u001A', ' ') # some weird character in the output for some reason

    if Verbose:
        print(f"Elapsed time: {elapsed/60:.2f} minutes")
    return output

def run_gufi_select(select, indexroot, nthreads, Verbose=False):
    """Invoke gufi_query with a single SQL statement and return its text output.

    `select` is passed as the -E argument, a single space as the -d (delimiter)
    argument, `nthreads` as the -n argument, and `indexroot` as the final
    positional argument. The result is whatever run_gufi_command returns:
    the command's output, or None if it failed.
    """
    argv = ['gufi_query', '-E', select, '-d', ' ', '-n', str(nthreads), indexroot]
    return run_gufi_command(argv, Verbose)

def run_gufi_aggregate(indexroot, nthreads, create_int, insert_int, create_agg, insert_agg, select_agg, Verbose=False):
    """Invoke gufi_query with the five SQL statements of an aggregate query.

    The statements are passed, in this order, as the -I, -S, -K, -J and -G
    arguments, followed by -n `nthreads`, -d ' ' (space delimiter) and the
    index root. The result is whatever run_gufi_command returns: the
    command's output, or None if it failed.
    """
    sql_options = (
        ("-I", create_int),
        ("-S", insert_int),
        ("-K", create_agg),
        ("-J", insert_agg),
        ("-G", select_agg),
    )
    argv = ['gufi_query']
    for flag, statement in sql_options:
        argv += [flag, statement]
    argv += ["-n", str(nthreads), '-d', ' ', indexroot]
    return run_gufi_command(argv, Verbose)

In [4]:
# returns a dataframe
def get_popular_extensions_by_pinode(Verbose):
    """Return the most common file extensions per parent inode.

    Runs a gufi_query SELECT over `vrpentries` under the notebook-level
    `testir` index root using `nthreads` threads. The query emits
    "<pinode> <extension> <count>" lines, where the extension is the text
    after the last '.' in the name, or the literal 'Null' when the name
    contains no '.'.

    Parameters
    ----------
    Verbose : bool
        Forwarded to run_gufi_select; when true, the first ten output lines
        are also printed for debugging.

    Returns
    -------
    pandas.DataFrame
        Indexed by `pinode` (sorted), with columns ext_mode_1, ext_count_1,
        ..., ext_mode_4, ext_count_4. Slot i holds the i-th row reported for
        that pinode, in output order; unused slots are missing values. Rows
        beyond the fourth for a pinode are not kept.

    Raises
    ------
    RuntimeError
        If gufi_query failed (run_gufi_select returned None).
    """
    select = """
        SELECT pinode, 
               CASE 
                    WHEN name NOT LIKE '%.%' THEN 'Null' 
                    ELSE REPLACE(name, RTRIM(name, REPLACE(name, '.', '')), '') 
                END AS extension, 
                COUNT(*) AS count 
                FROM vrpentries 
                GROUP BY pinode,extension
                ORDER BY count DESC LIMIT 4;
        """
    output = run_gufi_select(select,testir,nthreads,Verbose)
    if output is None:
        # fail with a clear message instead of an obscure parsing error further down
        raise RuntimeError("gufi_query failed; no extension data to parse")

    if Verbose:
        # print 10 lines just for debugging
        print(*output.strip().split('\n')[:10])

    if output.strip():
        # dtype=str keeps numeric-looking extensions ("1", "001") as text, and
        # keep_default_na=False stops extensions such as "NA", "null" or "nan"
        # from being silently converted to missing values.
        long_df = pd.read_csv(
            StringIO(output), sep=' ', header=None, names=['pinode', 'ext', 'count'],
            dtype={'ext': str}, keep_default_na=False,
        )
    else:
        # nothing was reported: build an empty frame rather than letting read_csv raise
        long_df = pd.DataFrame({
            'pinode': pd.Series(dtype='int64'),
            'ext':    pd.Series(dtype=object),
            'count':  pd.Series(dtype='int64'),
        })

    # 1-based position of each row within its pinode, in output order
    rank = long_df.groupby('pinode').cumcount() + 1

    pinodes = pd.Index(long_df['pinode'].unique(), name='pinode').sort_values()
    result_df = pd.DataFrame(index=pinodes)
    for i in range(1, 5):
        # rank is unique within a pinode, so this slice has a unique pinode index
        # and aligns cleanly onto result_df; absent pinodes become missing values
        ranked = long_df[rank == i].set_index('pinode')
        result_df[f'ext_mode_{i}']  = ranked['ext']
        result_df[f'ext_count_{i}'] = ranked['count'].astype('Int64')

    return result_df

# Build the per-pinode extension table; Verbose=True also prints the gufi_query
# command line, the elapsed time and the first few output lines.
pin_ext_df = get_popular_extensions_by_pinode(True)

gufi_query -E 
        SELECT pinode, 
               CASE 
                    WHEN name NOT LIKE '%.%' THEN 'Null' 
                    ELSE REPLACE(name, RTRIM(name, REPLACE(name, '.', '')), '') 
                END AS extension, 
                COUNT(*) AS count 
                FROM vrpentries 
                GROUP BY pinode,extension
                ORDER BY count DESC LIMIT 4;
         -d   -n 224 /mnt/nvme3n1/jbent/jbent_home/
Elapsed time: 0.00 minutes
450835 Null 4 450835 json 1 450835 rst 1 450835 txt 1 442845 pyc 3 450898 pyi 2 450900 pyi 3 475171 js 1 467335 css 1 467335 js 1


In [5]:
# Make the frame easier to inspect: do not wrap wide frames across several
# lines, and cap the number of rows shown at 20.
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_rows', 20)
pin_ext_df

Unnamed: 0_level_0,ext_mode_1,ext_count_1,ext_mode_2,ext_count_2,ext_mode_3,ext_count_3,ext_mode_4,ext_count_4
pinode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
396893,bash_history,1,bash_logout,1,bash_profile,1,bashrc,1
401763,Null,3,pub,1,,,,
401764,txt,2,gitignore,1,md,1,,
401765,Null,8,,,,,,
401766,Null,1,,,,,,
...,...,...,...,...,...,...,...,...
491644,Null,5,,,,,,
491645,Null,5,,,,,,
491646,Null,5,,,,,,
491647,Null,5,,,,,,


In [6]:
# just for fun, turn it into a word map
# first get a global count of each extension

def df_to_global_count(df,num_cols):
    """Total the per-row extension counts into one {extension: count} dict.

    `df` must have the column pairs ext_mode_1/ext_count_1 through
    ext_mode_<num_cols>/ext_count_<num_cols>. Every pair in which both the
    extension and the count are present adds its count to that extension's
    running total. Rows are visited in order, and pairs left to right.
    """
    column_pairs = [(f'ext_mode_{k}', f'ext_count_{k}') for k in range(1, num_cols+1)]

    totals = {}
    for _, record in df.iterrows():
        for name_col, count_col in column_pairs:
            extension = record[name_col]
            count = record[count_col]
            # an unused slot has a missing name and/or a missing count
            if pd.isna(extension) or pd.isna(count):
                continue
            totals[extension] = totals.get(extension, 0) + count

    return totals

# Collapse the per-pinode table into global totals, print the number of distinct
# extensions, and pickle the totals to /tmp/ext_counts.pkl.
# The query keeps at most four rows per gufi_query result set (LIMIT 4), so
# these totals only cover the extensions that made it into pin_ext_df.
sums = df_to_global_count(pin_ext_df,4)
print(len(sums))
with open('/tmp/ext_counts.pkl', 'wb') as file:
    pickle.dump(sums, file)

def to_wordcloud(word_counts, font_path='/usr/share/fonts/dejavu/DejaVuSans.ttf', width=800, height=400):
    """Render a word cloud from a {word: frequency} mapping and display it.

    Parameters
    ----------
    word_counts : dict
        Mapping of word to frequency, passed to
        WordCloud.generate_from_frequencies.
    font_path : str or None, optional
        TrueType font used to draw the words. Defaults to the DejaVu Sans
        Regular location used by this notebook. None, or a path that does not
        exist on this host, selects WordCloud's own default font.
    width, height : int, optional
        Size of the generated image in pixels.
    """
    # The DejaVu path is host specific; presumably a missing file makes WordCloud
    # fail while loading the font, so fall back to its default font instead.
    if font_path is not None and not os.path.exists(font_path):
        print(f"Font file {font_path} not found; using WordCloud's default font")
        font_path = None

    wordcloud = WordCloud(width=width, height=height, background_color='white', font_path=font_path).generate_from_frequencies(word_counts)

    # Display the word cloud using matplotlib
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.show()

# Rendering is disabled on this host (see the message printed below); the counts
# were pickled to /tmp/ext_counts.pkl so the cloud can be drawn elsewhere.
#to_wordcloud(sums)
print("For some reason, wordcloud doesn't work on hpe1. Just pickle it and make the wordcount elsewhere. Ugh.")

144
For some reason, wordcloud doesn't work on hpe1. Just pickle it and make the wordcount elsewhere. Ugh.


In [7]:
# now get the name and inode for each directory
def get_pinode_name_map(Verbose):
    """Return a dict mapping each inode in `vrsummary` to its name.

    Runs a gufi_query aggregate over the notebook-level `testir` index root
    using `nthreads` threads: the (inode, name) rows of `vrsummary` are
    copied into an intermediate table, then into an aggregate table, and
    finally selected as "<inode> <name>" lines.

    Parameters
    ----------
    Verbose : bool
        Forwarded to run_gufi_aggregate.

    Returns
    -------
    dict
        Keys are inodes as strings, exactly as printed by gufi_query; values
        are the corresponding names. A later line with the same inode
        overwrites an earlier one.

    Raises
    ------
    RuntimeError
        If gufi_query failed (run_gufi_aggregate returned None).
    """
    itab='intermediate' # intermediate table
    atab='aggregate' # aggregate table
    stab='vrsummary' # source table
    cstr='(inode INT64, name TEXT)' # create string
    sstr='inode,name' # select string
    output = run_gufi_aggregate(
            indexroot  = testir, 
            nthreads   = f"{nthreads}",
            create_int = f"CREATE TABLE {itab} {cstr}",
            insert_int = f"INSERT INTO {itab} SELECT {sstr} FROM {stab}",
            create_agg = f"CREATE TABLE {atab} {cstr}",
            insert_agg = f"INSERT INTO {atab} SELECT {sstr} FROM {itab}",
            select_agg = f"SELECT {sstr} FROM {atab}",
            Verbose    = Verbose
    )
    if output is None:
        # fail with a clear message instead of an AttributeError on None
        raise RuntimeError("gufi_query failed; no inode/name data to parse")

    # output is like "37 foo\n45 bar\nETC"; convert into dict and return that.
    # Split only at the first delimiter so names containing spaces stay whole,
    # and skip blank lines so empty output yields an empty dict.
    result_dict = {}
    for line in output.split('\n'):
        if not line:
            continue
        inode, _, name = line.partition(' ')
        result_dict[inode] = name
    return result_dict

# pin_name_map = get_pinode_name_map(False)

In [8]:
# Sanity check: every parent inode in the extension table should also appear as
# an inode in the inode -> name map.
# Disabled by default; enabling it runs another gufi_query to build the name map.
check_sanity = False
if check_sanity:
    pin_name_map = get_pinode_name_map(False)
    # pin_ext_df is indexed by pinode (parsed as numbers), while pin_name_map is
    # keyed by the inode's text form, so compare both as strings
    sanity = set(map(str, pin_ext_df.index)) - set(pin_name_map.keys())
    if len(sanity) != 0:
        print("Some files have nonexistant parents")
        print(sanity)