In [None]:
import json
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import altair as alt
from altair import datum, expr

# Lift Altair's row-count guard so charts can be built from the whole message
# DataFrame instead of failing once the frame exceeds the default cap.
alt.data_transformers.disable_max_rows()

In [None]:
def build_dataframe(path='messages'):
    """Load text messages from a message export into a single DataFrame.

    Every ``inbox/<chat>/message_*.json`` file under ``path`` is parsed. Each
    entry of its ``messages`` list whose ``type`` is ``'Generic'`` and whose
    ``content`` is not None becomes one row.

    Parameters
    ----------
    path : str or Path, default 'messages'
        Root directory of the export (the folder that contains ``inbox/``).

    Returns
    -------
    pandas.DataFrame
        Columns ``chat`` (name of the conversation folder), ``sender``
        (``sender_name``), ``time`` (``timestamp_ms`` converted to a datetime)
        and ``content``. The four columns exist even when nothing matches, so
        attribute access such as ``df.chat`` keeps working on an empty export.
        Rows follow the sorted order of the file paths.

    Raises
    ------
    KeyError
        If a file has no ``messages`` key, an entry has no ``type`` key, or a
        kept entry lacks ``sender_name`` or ``timestamp_ms``.
    """
    columns = ['chat', 'sender', 'time', 'content']
    records = []
    # glob() yields paths in filesystem order; sorting makes the row order
    # reproducible from one run (or machine) to the next.
    for filename in sorted(Path(path).glob('inbox/*/message_*.json')):
        chat = filename.parent.name
        # JSON text is UTF-8; do not depend on the platform's default encoding.
        # NOTE(review): some exports are reported to contain double-encoded
        # non-ASCII text -- verify `content` on real data before relying on it.
        with open(filename, 'r', encoding='utf-8') as f:
            obj = json.load(f)
        for entry in obj['messages']:
            if entry['type'] != 'Generic' or entry.get('content') is None:
                continue
            records.append({
                'chat': chat,
                'sender': entry['sender_name'],
                'time': entry['timestamp_ms'],
                'content': entry['content'],
            })
    # Passing `columns` keeps the schema stable when `records` is empty.
    frame = pd.DataFrame(records, columns=columns)
    # Convert the whole column at once instead of one Timestamp per message.
    frame['time'] = pd.to_datetime(frame['time'], unit='ms')
    return frame

# Build the message table once from the default 'messages' directory;
# the cells below all read this `df`.
df = build_dataframe()

In [None]:
# Show the assembled message table as this cell's output.
df

In [None]:
# Number of messages per conversation, largest first.
df['chat'].value_counts()

In [None]:
# Line chart of the total number of messages per calendar month, all chats combined.
alt.Chart(df).mark_line().encode(
    alt.X('yearmonth(time):T'),
    alt.Y('count()'),
)

In [None]:
# Monthly message counts as stacked bars, one colour per conversation.
alt.Chart(df).mark_bar().encode(
    alt.X('yearmonth(time):T'),
    alt.Y('count()'),
    alt.Color('chat:N'),
    tooltip=[alt.Tooltip('chat'), alt.Tooltip('count()')],
).properties(title='Number of Facebook Messages')

In [None]:
# Conversation folder name (value of the `chat` column) to break down by sender.
chat_id = 'hogdinstagrampromo_nn=eizkwpq'

# Select the chat with a boolean mask instead of interpolating `chat_id` into a
# query string: a name containing a quote character would otherwise produce a
# malformed (or different) query expression.
alt.Chart(df.loc[df['chat'] == chat_id]).mark_bar().encode(
    color='sender:N',
    x='yearmonth(time):T',
    y='count()',
    tooltip=['sender', 'count()'],
).properties(
    title='Facebook Messages in Group Chat',
)

In [None]:
# One horizontal bar per chat, normalized to 100 %, split by each sender's
# share of the messages.
alt.Chart(df).mark_bar().encode(
    x=alt.X('count()', stack='normalize', title='frequency'),
    y=alt.Y('chat'),
    color=alt.Color('sender'),
    tooltip=[
        alt.Tooltip('sender'),
        alt.Tooltip('count()', title='messages'),
    ],
).properties(title='Who Dominates the Conversation?')

In [None]:
# Person whose activity is charted; the scatter-plot cell further down reads
# this same variable.
sender = 'Huiwen Chen'

# Heatmap with one cell per (month, day-of-month): colour is the number of
# messages sent by `sender`; the tooltip adds a space-separated word count.
alt.Chart(df).transform_filter(
    alt.datum.sender == sender,
).transform_calculate(
    # Words are approximated by splitting the message text on single spaces.
    words=alt.expr.length(alt.expr.split(alt.datum.content, ' ')),
).mark_rect().encode(
    x=alt.X('date(time):O', title='day'),
    y=alt.Y('yearmonth(time):O', title='month'),
    color=alt.Color('count()', scale=alt.Scale(type='linear')),
    tooltip=[
        alt.Tooltip('count()', title='Messages'),
        alt.Tooltip('sum(words):Q', title='Words'),
    ],
).properties(title='Number of Messages Sent by Day')

In [None]:
# Sorted bar graph: one bar per month, ordered by message count (busiest first).
# The month axis has to be discrete (:O). Vega-Lite applies sort-by-encoding
# ('-x') only to discrete fields; on a temporal (:T) axis the bars would stay
# in chronological order and the chart would not match its title.
alt.Chart(df).mark_bar().encode(
    x='count()',
    y=alt.Y('yearmonth(time):O', sort='-x', title='month'),
).properties(
    title='Message Count Sorted From Greatest to Least',
)

In [None]:
# Scatter plot: one circle per (month, day-of-month) on which `sender` (set in
# the heatmap cell above) wrote at least one message. The tooltip reports the
# message count and a space-separated word count for that day.
alt.Chart(df).transform_filter(
    alt.datum.sender == sender,
).transform_calculate(
    words=alt.expr.length(alt.expr.split(alt.datum.content, ' ')),
).mark_circle(size=80).encode(
    x=alt.X('date(time):O', title='day'),
    y=alt.Y('yearmonth(time):O', title='month'),
    tooltip=[
        alt.Tooltip('count()', title='Messages'),
        alt.Tooltip('sum(words):Q', title='Words'),
    ],
).interactive()