In [None]:
from IPython.display import HTML

# Render a button that shows or hides every input cell of the rendered notebook.
# The script registers code_toggle to run once when the document is ready; since
# code_show starts as true, that first call hides the inputs.
# NOTE(review): relies on a global jQuery `$` and on input cells carrying the
# `div.input` class - confirm the target front end provides both.
HTML('''<script>
code_show=true; 
function code_toggle() {
 if (code_show){
 $('div.input').hide();
 } else {
 $('div.input').show();
 }
 code_show = !code_show
} 
$( document ).ready(code_toggle);
</script>
<form action="javascript:code_toggle()"><input type="submit" value="Click here to toggle on/off the raw code."></form>''')

In [None]:
# This is a template notebook for extracting the project disk summary. To run this, do the following.

# 1. Set an environment variable called PROJECT and set it to the project name that you want to study.
# 2. Go to the terminal and run `jupyter-nbconvert --ExecutePreprocessor.timeout=-1 --execute ProjectDiskSummary-Template.ipynb`

# Please note that extracting the disk info with the 'find' command works only on Linux. If you are on Windows, you'll have to
# generate that file outside of this notebook.

In [None]:
import os
import time
import pandas as pd
import subprocess
import matplotlib.pyplot as plt
from IPython.core.display import display, HTML

In [None]:
# Fallback project name. A PROJECT environment variable that is already set
# (as the run instructions ask for) takes precedence; the previous plain
# assignment silently overwrote it with this hard-coded value.
os.environ.setdefault('PROJECT', 'PW1000G_IMC_VAI')

In [None]:
# Name of the project to analyse, read from the PROJECT environment variable.
project = os.environ['PROJECT']

In [None]:
# Root directory of the project under /project.
folder_path = '/project/{}'.format(project)

In [None]:
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [None]:
## Some useful functions

In [None]:
# Show a message as a styled HTML heading in the cell output
def shout(message, size='h1', color='black', css=""):
    """Display ``message`` wrapped in an HTML ``size`` tag.

    Parameters
    ----------
    message : str
        Text placed between the opening and closing tag.
    size : str
        Tag name used for the element (default ``'h1'``).
    color : str
        Value of the inline CSS ``color`` property.
    css : str
        Extra inline CSS appended after the colour declaration.

    Returns
    -------
    Whatever ``display`` returns for the rendered HTML object.
    """
    markup = '<{0} style="color:{2};{3}">{1}</{0}>'.format(size, message, color, css)
    heading = HTML(markup)
    return display(heading)

# Function to return the user's name from a numeric user id
import pwd
def finger(id):
    """Return the GECOS field of the account that owns user id ``id``.

    Parameters
    ----------
    id : int or str
        Numeric user id; it is converted with ``int`` before the lookup.

    Returns
    -------
    str
        ``pw_gecos`` of the matching password-database entry, or the
        placeholder ``'yy' + str(id)`` when the id cannot be converted to an
        integer or has no entry.
    """
    try:
        return pwd.getpwuid(int(id)).pw_gecos
    except (KeyError, ValueError, TypeError, OverflowError):
        # KeyError: no such uid; ValueError/TypeError: id is not numeric;
        # OverflowError: id outside the platform's uid range. Anything else
        # (e.g. KeyboardInterrupt) is no longer swallowed by a bare except.
        return 'yy' + str(id)
    
# Show full cell contents (long folder paths) instead of truncating them.
# None is the supported "no limit" value; -1 was deprecated in pandas 1.0 and is
# rejected by newer releases, so it is kept only as a fallback for old installs
# that do not accept None.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)

In [None]:
# Report title naming the analysed folder
report_title = "Project disk summary for {}".format(folder_path)
shout(report_title, size="h1", color="grey")

Any questions, contact [Najeem Muhammed](mailto:najeem.muhammed@gknaerospace.com)

In [None]:
## Generating the file list

In [None]:
# Path of the per-project listing file that the `find` step writes and the
# loading step reads.
# NOTE(review): the directory is a hard-coded absolute path - confirm it exists
# and is writable wherever this notebook runs.
out_file = os.path.abspath('/project/nobackup/Shared_VAI/DiskSpaceSummary/files_list_{}.out'.format(project))

In [None]:
# Progress banner shown before the (potentially slow) listing step
progress_note = "Generating the file data for analysis for all files in folder {}........".format(folder_path)
shout(progress_note, size="h3", color="grey")

In [None]:
%%time
# The following command works only in linux
!cd $folder_path; find -printf '%h|%f|%s|%U|%a|%t|%d|%S|%y\n' >$out_file 2>/dev/null

In [None]:
## Reading the Data

In [None]:
# Load the '|'-separated listing; the file has no header row
files = pd.read_csv(
    out_file,
    header=None,
    sep='|',
    low_memory=False,
)

In [None]:
# Discard every row that has a missing value in any column
files = files.dropna(axis=0, how='any')

In [None]:
## Setting the dataframe columns

In [None]:
# Give the nine positional columns descriptive names
files.columns = [
    'Folder',
    'File',
    'Size',
    'User',
    'DateAccessed',
    'DateModified',
    'Depth',
    'Sparseness',
    'ItemType',
]

In [None]:
## Converting the date from string to date objects

In [None]:
# Parse both timestamp columns from text into datetime values
for date_col in ('DateAccessed', 'DateModified'):
    files[date_col] = pd.to_datetime(files[date_col])

In [None]:
## Converting the size from bytes to Gbs

In [None]:
# Convert bytes to Gb (2**30 bytes) with one vectorised division.
# The value is deliberately not rounded per file: rounding every entry to two
# decimals turned each file smaller than roughly 5 MB into 0.0, so the totals
# and per-user / per-type sums ignored all small files. Round only when a
# number is formatted for display.
files.Size = files.Size / 2**30

In [None]:
## Finding the username from user ID

In [None]:
# Resolve each distinct user id once and map the result back onto the rows;
# calling finger() per row repeated the same password-database lookup for
# every single file owned by a user.
user_names = {uid: finger(uid) for uid in files.User.unique()}
files["UserName"] = files.User.map(user_names)

In [None]:
## Finding the project folder and file extension

In [None]:
# Extension of every entry (with its leading dot); empty when there is none
files["Type"] = files.File.map(lambda name: os.path.splitext(name)[1])

In [None]:
## New field to show the access/modified months only

In [None]:
# Flag entries that are probably disposable. Every rule inspects the extension
# stored in `Type`, except the abort-database rule, which needs the whole file
# name: `Type` only holds the last extension (".db"), so the previous test of
# `Type` against "ansabort.db" could never match.
ext = files.Type
ext_lower = ext.str.lower()
ext_len = ext.str.len()

junk_rules = [
    # File extensions which are usually junk files
    ext_lower.isin([".esav", ".osav", ".emat", ".page", ".dbb", ".full", '.dab', '.cmf']),
    # Extensions that are 6 characters long (7 with the dot) and contain an upper-case letter
    ext.str.contains("[A-Z]", regex=True) & (ext_len == 7),
    # Backup files (extension contains '~'); matched literally
    ext.str.contains("~", regex=False),
    # Abort databases, matched literally against the file name
    files.File.str.contains("ansabort.db", regex=False),
    # Extensions that contain the word "delete"; matched literally
    ext.str.contains("delete", regex=False),
    # File extensions that are too long
    ext_len > 10,
    # LN files
    ext_lower.str.contains('ln[0-9]+', regex=True),
    # Extensions containing "pc" followed by digits
    ext_lower.str.contains('pc[0-9]+', regex=True),
    # Nonlinear diagnostics files
    ext_lower.str.contains('nr[0-9]+', regex=True),
]

files["Junk"] = False
for rule in junk_rules:
    files.loc[rule, "Junk"] = True

In [None]:
# Headline numbers: how many entries were listed and their combined size
fcount = files.Size.count()
fsize = files.Size.sum()
headline = "{} files with a total size of {} Gb".format(fcount, round(fsize, 1))
shout(headline)

In [None]:
shout("10 Largest files")
# Ten biggest entries, reduced to the columns worth showing
largest_files = files.nlargest(10, columns=["Size"])
largest_files[["Size", "File", "Folder"]]

In [None]:
shout("Largest Folders (Size)")
# Total size of the entries directly inside each folder, top 20 as a table
size_per_folder = files.groupby('Folder').Size.sum()
size_per_folder.nlargest(20).to_frame()

In [None]:
# Bar chart of the 20 folders with the largest total size
big_folders = files.groupby('Folder').Size.sum().nlargest(20).index.tolist()
in_big_folder = files.Folder.isin(big_folders)
files[in_big_folder].groupby("Folder").Size.sum().plot(kind='bar', figsize=(16, 5))

In [None]:
shout("Disk Usage by User")
size_per_user = files.groupby('UserName').Size.sum()
# Figure height grows with the number of distinct user ids
user_count = len(files.User.unique())
ax = size_per_user.plot(kind='barh', grid=True, figsize=(16, user_count/3))
ax.set_xlabel("File size (Gb)");

In [None]:
# Table of the largest entries flagged as junk, preceded by their combined size
topjunkcount = 50
shout("{} Largest junk files".format(topjunkcount))
flagged = files[files.Junk]
topjunk = flagged.nlargest(topjunkcount, "Size")
topjunk_total = round(topjunk.Size.sum(), 1)
shout("Total size : {}".format(topjunk_total))
topjunk

In [None]:
# Per-user total size of the entries flagged as junk.
junks = files[files.Junk]
if len(junks):
    shout("Possible Junk Files, per user (Size)")
    # Keep the Axes returned by this plot. Without the assignment the label
    # call below acted on the stale `ax` left over from an earlier cell, so
    # this chart stayed unlabelled.
    ax = junks.groupby("UserName").Size.sum().plot(kind='barh', grid=True, figsize=(16,len(junks.User.unique())))
    ax.set_xlabel("File size (Gb)");
else:
    shout("Hurray! No Junk Files!!")

In [None]:
shout("Disk Usage by major file type (Size)")
# Ten extensions with the largest combined size
size_per_type = files.groupby('Type').Size.sum()
ax = size_per_type.nlargest(10).plot(kind='barh', grid=True, figsize=(16,10))
ax.set_xlabel("File size (Gb)");

In [None]:
shout("Files Last Accessed Date (Size)")
# Select the Size column before aggregating so that only it is summed per
# calendar month. Summing the whole frame also tried to add up the text and
# datetime columns, which is slow and can raise TypeError on recent pandas.
monthly_access_size = files.set_index("DateAccessed").Size.resample('M').sum()
ax = monthly_access_size.plot(drawstyle='steps', figsize=(16,5))
ax.set_xlabel("Date");

In [None]:
shout("Files Last Accessed time by type")
# Stacked monthly bars for the ten extensions with the largest combined size
large_types = list(files.groupby("Type").Size.sum().nlargest(10).index)
access_month = files.DateAccessed.dt.to_period('M')
of_large_type = files[files.Type.isin(large_types)]
size_by_month_and_type = of_large_type.groupby(by=[access_month, 'Type']).Size.sum().unstack(level=1)
size_by_month_and_type.plot(kind='bar', figsize=(16,5), stacked=True);

In [None]:
shout("Files last access times, grouped by Owner (Size)")
# One column per owner, one row per access month; gaps are filled with 0
access_month = files.DateAccessed.dt.to_period('M')
df = files.groupby(by=[access_month, 'UserName']).Size.sum().unstack(level=1).fillna(0)
# One subplot per owner, each with its own x axis
owner_axes = df.plot(drawstyle="steps", figsize=(16,len(df.columns)*3), subplots=True, sharex=False, )
for owner_ax in owner_axes:
    owner_ax.set_title("")
plt.tight_layout()

In [None]:
shout("Files Modified Date (Size)")
# Total size grouped by the month of last modification
modified_month = files.DateModified.dt.to_period('M')
size_per_modified_month = files.groupby(modified_month).Size.sum()
ax = size_per_modified_month.plot(drawstyle='steps', figsize=(16,5))
ax.set_xlabel("Date (Year-Month)");

In [None]:
shout("Files Modified time by type (Size)")
# Stacked monthly areas for the ten extensions with the largest combined size
large_types = list(files.groupby("Type").Size.sum().nlargest(10).index)
modified_month = files.DateModified.dt.to_period('M')
of_large_type = files[files.Type.isin(large_types)]
size_by_month_and_type = of_large_type.groupby(by=[modified_month, 'Type']).Size.sum().unstack(level=1)
size_by_month_and_type.plot(kind='area', figsize=(16,5), stacked=True);

In [None]:
shout("Files Modified grouped by Owner")
# One column per owner, one row per modification month; gaps are filled with 0
modified_month = files.DateModified.dt.to_period('M')
df = files.groupby(by=[modified_month, 'UserName']).Size.sum().unstack(level=1).fillna(0)
# One subplot per owner, each with its own x axis
owner_axes = df.plot(drawstyle="steps", figsize=(16,len(df.columns)*3), subplots=True, sharex=False, )
for owner_ax in owner_axes:
    owner_ax.set_title("")
plt.tight_layout()

In [None]:
shout("Files Modified time during a day")
# Number of entries per hour of last modification, with all 24 hours present
modified_hour_counts = files.DateModified.dt.hour.value_counts().sort_index().reindex(range(24))
ax = modified_hour_counts.plot(drawstyle="steps", figsize=(16,5))
ax.set_xlabel("Modified hour");

In [None]:
shout("Who fires run when? (RST file creation time in a day)")
# Count of .rst entries per modification hour, stacked by owner
rst_entries = files[files.Type == '.rst']
modified_hour = files.DateModified.dt.hour
rst_count_by_hour = rst_entries.groupby(by=[modified_hour, 'UserName']).Size.count().unstack(level=1)
ax = rst_count_by_hour.plot(kind='bar', stacked=True, figsize=(16, 5))
ax.set_xlabel("Modified hour");
ax.legend(loc=9, bbox_to_anchor=(0.5, -0.15), ncol=6);

In [None]:
shout("File access time during the day (count)")
# Count of .rst entries per access hour, stacked by owner
# NOTE(review): only '.rst' entries are counted although the heading speaks of
# file access in general - confirm the filter is intended.
rst_entries = files[files.Type == '.rst']
accessed_hour = files.DateAccessed.dt.hour
rst_count_by_hour = rst_entries.groupby(by=[accessed_hour, 'UserName']).Size.count().unstack(level=1)
ax = rst_count_by_hour.plot(kind='bar', stacked=True, figsize=(16, 5))
ax.set_xlabel("Accessed hour");
ax.legend(loc=9, bbox_to_anchor=(0.5, -0.15), ncol=6);

In [None]:
shout("File types and Users (Size)")
# Only entries whose Size value exceeds 1 take part in this chart
above_one = files[files.Size > 1]
size_by_type_and_user = above_one.groupby(by=['Type', 'UserName']).Size.sum().unstack(level=1)
ax = size_by_type_and_user.plot(kind='bar', stacked=True, figsize=(16, 5))
ax.set_xlabel("Type");
ax.legend(loc=9, bbox_to_anchor=(0.5, -0.15), ncol=6);

In [None]:
# Heading that introduces the run-time analysis of the *.rst files.
shout("Run time of each *.rst file")

In [None]:
import numpy as np

In [None]:
def runtime(outfile):
    """Return the elapsed time, in seconds, reported inside an output file.

    The file is read as bytes and scanned for lines containing
    ``Elapsed Time (sec)``. On such a line the first whitespace-separated
    token that parses as a float is taken as the value. When several lines
    yield a number, the last one wins.

    Parameters
    ----------
    outfile : str
        Path of the file to scan.

    Returns
    -------
    float
        The elapsed time, or ``numpy.nan`` when no value was found.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    elapsed = np.nan
    # The context manager closes the handle even if reading fails; the plain
    # open() used before never closed it.
    with open(outfile, "rb") as handle:
        for line in handle:
            if b"Elapsed Time (sec)" not in line:
                continue
            for token in line.split():
                try:
                    elapsed = float(token)
                except ValueError:
                    continue
                # First numeric token on the line is the value; stop here.
                break
    return elapsed

In [None]:
# For every .rst entry, look for a sibling .out file (to read its run time) and
# a sibling .ans file (to record whether it exists).
# Both columns are created up front: previously they only came into existence
# when a matching file was found, so a project without .rst/.out files made the
# later `files.RunTime` / "AnsFile" selection fail with AttributeError/KeyError.
files["RunTime"] = np.nan
files["AnsFile"] = False
for index, row in files[files.Type == ".rst"].iterrows():
    run_dir = os.path.join(folder_path, row["Folder"].strip())
    stem = os.path.splitext(row["File"].strip())[0]
    out_path = os.path.join(run_dir, stem + ".out")
    ans_path = os.path.join(run_dir, stem + ".ans")
    if os.path.isfile(out_path):
        files.loc[index, "RunTime"] = runtime(out_path)
    files.loc[index, "AnsFile"] = os.path.isfile(ans_path)

In [None]:
shout("Files which will take less than an hour to run")
shout("*AnsFile status will tell if the corresponding ansys run script is available or not", size="h3")
# Entries with a recorded run time below 3600 seconds, 50 largest by Size
quick_runs = files.loc[files.RunTime < 3600, ["Size", "RunTime", "AnsFile", "Folder", "File"]]
quick_runs.nlargest(50, "Size")