# PST File to Dataframe to JSON

This short script reads an Outlook PST file, extracts its messages into a processable format (a pandas DataFrame), and saves the result as JSON.

In [None]:
#!pip install libpff-python

In [None]:
import pypff
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Location of the Outlook PST file to convert (adjust to your own file)
filename = "mailbox.pst"

# Create the pypff file object and open the PST through it
pst = pypff.file()
pst.open(filename)

In [None]:
# Root folder of the opened PST file; folder traversal starts from here
root = pst.get_root_folder()

In [None]:
def parse_folder(base):
    """Collect the messages of every folder nested below *base*.

    Each sub-folder of *base* is visited; when it has sub-folders of its
    own, those are processed recursively first, so their messages come
    before the sub-folder's own messages in the result. The name of each
    visited folder is printed as it is processed. Messages stored directly
    in *base* itself are not read.

    Returns a list with one dict per message, holding the keys
    "subject", "message", "sender" and "datetime".
    """
    collected = []
    for child in base.sub_folders:
        # Descend before reading the child's own messages
        if child.number_of_sub_folders:
            collected.extend(parse_folder(child))
        print(child.name)
        collected.extend(
            {
                "subject": mail.subject,
                "message": mail.plain_text_body,
                "sender": mail.sender_name,
                "datetime": mail.client_submit_time,
            }
            for mail in child.sub_messages
        )
    return collected

# Walk the folder tree below the root and gather every message as a dict
messages = parse_folder(root)

In [None]:
# Build a table with one row per message; columns come from the dict keys
df = pd.DataFrame(messages)
# Show the first rows as a quick sanity check
df.head()

In [None]:
# (number of rows, number of columns) of the message table
df.shape

In [None]:
# if message is of type bytes we have to decode it
for index, row in df.iterrows():
    if row['message'] is not None:
        row['message'] = str(row['message'].decode("utf-8"))

In [None]:
# example mail
df.iloc[8]['message'].decode()

## Save Dataframe as JSON

In [None]:
# Export the DataFrame as JSON; the output name is the PST path plus a suffix
df.to_json(path_or_buf=f"{filename}_converted.json")

## Visualize mailbox mails over time

In [None]:
# Mark the timestamps as UTC, then express them in Paris local time
df['datetime'] = (
    df['datetime']
    .dt.tz_localize(tz='UTC')
    .dt.tz_convert(tz='Europe/Paris')
)

In [None]:
# Time of day as a fractional hour, e.g. 13.5 for 13:30
df['hour'] = df['datetime'].dt.hour + df['datetime'].dt.minute / 60
# Position in time as a fractional year. dayofyear starts at 1 and a leap
# year has 366 days, so shift by one day and divide by the real year length;
# this keeps every value inside [year, year + 1) instead of spilling
# Dec 31 into the following year.
df['date'] = df['datetime'].dt.year + (df['datetime'].dt.dayofyear - 1) / (
    365 + df['datetime'].dt.is_leap_year.astype(int)
)
# Calendar year, used for the axis limits of the plot
df['year'] = df['datetime'].dt.year

In [None]:
# Scatter plot: one dot per mail, x = fractional year, y = time of day
plt.clf()
ax = sns.scatterplot(x="date", y="hour", alpha=.4, marker=".", data=df)
# 'date' is year + fraction, so mails of the latest year lie between
# max(year) and max(year) + 1; the upper x limit must include that span,
# otherwise the whole last year is cut off.
ax.set(xlim=(df['year'].min(), df['year'].max() + 1), ylim=(0, 25))
ax.set_xlabel("year")
ax.set_ylabel("time of the day")
# Put early hours at the top of the plot
ax.invert_yaxis()
sns.despine()
ax.get_figure().savefig("plot.png", dpi=400)