# Facebook data 📈

First of all, we need to fix the encoding of the message.json file. To do this, we will encode each string as **latin-1** and decode it as **utf8**.

In [None]:
import json

import re
import os


from PyQt6 import QtWidgets

import json
import os
from pathlib import Path 

def open_files():
    """Ask the user to pick the message files and return them in numeric order.

    Opens a Qt multi-file dialog rooted at the current directory. The chosen
    paths are sorted by the integer that follows the last underscore in the
    part of the file name before its first dot (``message_2.json`` sorts
    before ``message_10.json``).

    Returns:
        list[str]: Selected paths in ascending numeric order; empty when the
        dialog is cancelled.

    Raises:
        ValueError: If a selected file name does not end in ``_<integer>``
            before its first dot.
    """
    # Qt allows a single QApplication per process, so reuse the existing one
    # when this cell is re-run. argv must be a list of strings; an empty list
    # is enough here. The local reference keeps the application alive while
    # the dialog is shown.
    app = QtWidgets.QApplication.instance() or QtWidgets.QApplication([])
    fnames = QtWidgets.QFileDialog.getOpenFileNames(None, "Select a file...", '.', filter="All files (*)")

    def numeric_suffix(name):
        # Look only at the file name so dots or underscores in parent
        # directories cannot corrupt the parsed index.
        return int(Path(name).name.split(".")[0].split("_")[-1])

    return sorted(fnames[0], key=numeric_suffix)


# Let the user choose the export files; the merged output is written next to
# the first selected file.
file_names = open_files()
if not file_names:
    # A cancelled dialog yields an empty list; stop with a clear message
    # instead of an IndexError on file_names[0].
    raise SystemExit('No files selected.')

result_name = '{}_merged.json'.format(os.path.splitext(file_names[0])[0])

print('Working ...')

result_json = None

def processValue(value):
    """Repair latin-1/UTF-8 mojibake inside a decoded JSON value.

    Dicts and lists are repaired in place (recursively); strings are
    re-encoded as latin-1 and decoded as UTF-8. Any other value is returned
    unchanged.

    Args:
        value: A value produced by ``json.load`` (dict, list, str, number,
            bool or None).

    Returns:
        The repaired value. For dicts and lists this is the same object that
        was passed in.
    """
    if isinstance(value, dict):
        processDict(value)
    elif isinstance(value, list):
        for index, item in enumerate(value):
            value[index] = processValue(item)
    elif isinstance(value, str):
        try:
            return value.encode('latin-1').decode('utf8')
        except (UnicodeEncodeError, UnicodeDecodeError):
            # The string is not latin-1 mojibake (it holds characters outside
            # latin-1, or its bytes are not valid UTF-8), so leave it as is
            # rather than aborting the whole merge.
            return value

    return value

def processDict(dictionary):
    """Run ``processValue`` over every value of *dictionary*, storing the result under the same key."""
    # Only existing keys are reassigned, so mutating while iterating is safe.
    for name in dictionary:
        dictionary[name] = processValue(dictionary[name])
    
# Merge all selected export files into one dict: every non-"messages" key must
# be identical across files, while the "messages" lists are concatenated in
# file order.
datas = None
for file_name in file_names:
    with open(file_name, "rb") as file:
        data_dict = json.load(file)

    # Repair the text encoding in place before comparing and merging.
    processDict(data_dict)

    if datas is None:
        # The first file becomes the base that later files are merged into.
        datas = data_dict
        continue

    for key, val in data_dict.items():
        if key == "messages":
            continue
        # A key missing from the first file counts as a metadata mismatch
        # instead of surfacing as a KeyError.
        if key not in datas or datas[key] != val:
            raise Exception("Message files contain different metadata")

    datas["messages"].extend(data_dict["messages"])

with open(result_name, 'w', encoding='utf8') as fout:
    json.dump(datas, fout, indent=2, ensure_ascii=False)

print('File created at {}'.format(result_name))

Create SQLITE database

Now we can start making some cool stats 💻: the top **n messages** from the conversation 📖

In [None]:
# Characters that separate words: space, comma, period, question mark,
# exclamation mark and newline.
word_split_pattern = r'[ ,.?!\n]'

def splitMessageToWords(msg):
    """Split *msg* on the separator characters and return the non-empty pieces as a list."""
    return [piece for piece in re.split(word_split_pattern, msg) if piece]

In [None]:
import re
from itertools import islice

# Load the merged conversation; the context manager closes the file handle
# as soon as the JSON has been parsed.
with open(result_name, 'r', encoding='utf8') as file:
    msg_json = json.load(file)

# Count every word

messagesAll = {}
photos = []
videos = []

for msg in msg_json['messages']:
    if "content" in msg:
        # re.findall(r"[\w']+", msg['content']) would be an alternative tokenizer 😶
        for word in splitMessageToWords(msg['content'].lower()):
            messagesAll[word] = messagesAll.get(word, 0) + 1
    elif "photos" in msg:
        # Messages without text are kept aside for the media statistics.
        photos.append(msg)
    elif "videos" in msg:
        videos.append(msg)

# (word, count) pairs, most frequent first.
sortedAll = sorted(messagesAll.items(), key=lambda kv: kv[1], reverse=True)

def first_n_from_iterable(n, iterable):
    """Collect at most the first *n* items of *iterable* into a new list."""
    head = islice(iterable, n)
    return [item for item in head]

def get_top_messages(dict_msgs, n=10):
    """Print a header followed by the first *n* entries of *dict_msgs*, one per line."""
    header = '\nTop {} :\n'.format(n)
    print(header)

    leading_entries = first_n_from_iterable(n, dict_msgs)
    for entry in leading_entries:
        print(entry)

get_top_messages(sortedAll, 50)

#TODO Top messages per month / year 📆 and create dictionary with month and total messages.

In [None]:
import datetime

# 'Year-MonthName' -> number of messages sent in that month.
messages_per_month_dict = {}
# sender name -> {'Year-MonthName' -> number of messages by that sender}.
user_messages_per_month = {}

def get_messages_per_month():
    """Fill the two module-level tallies with message counts per month, overall and per sender."""
    for message in msg_json['messages']:
        sent_at = datetime.datetime.fromtimestamp(float(message['timestamp_ms']) / 1000.0)
        month_key = str(sent_at.strftime('%Y-%B'))
        sender = message['sender_name']

        messages_per_month_dict[month_key] = messages_per_month_dict.get(month_key, 0) + 1

        # Create the sender's own tally on first sight, then bump this month.
        sender_tally = user_messages_per_month.setdefault(sender, {})
        sender_tally[month_key] = sender_tally.get(month_key, 0) + 1


get_messages_per_month()

Top 5 per sender #TODO group chat 💥 

In [None]:
# sender name -> {word -> occurrences in that sender's messages}.
user_word_counts = {}
# sender name -> list of (word, count) pairs, most frequent first.
sorted_user_word_counts = {}

for msg in msg_json['messages']:
    content = msg.get('content')
    sender = msg.get('sender_name')
    # Skip messages that have no text or no sender explicitly, instead of
    # hiding every possible error behind a blanket try/except.
    if not isinstance(content, str) or sender is None:
        continue

    sender_counts = user_word_counts.setdefault(sender, {})
    # re.findall(r"[\w']+", content) would be an alternative tokenizer.
    for word in splitMessageToWords(content.lower()):
        sender_counts[word] = sender_counts.get(word, 0) + 1

for user in user_word_counts.keys():
    sorted_user_word_counts[user] = sorted(user_word_counts[user].items(), key=lambda kv: kv[1], reverse=True)
    
def get_top_messages_per_sender(dicts, n=10):
    """Print, for every sender in *dicts*, a header and the first *n* entries of that sender's list."""
    for sender, ranked_entries in dicts.items():
        print('\nTop {} from : {}'.format(n, sender))
        for entry in first_n_from_iterable(n, ranked_entries):
            print(entry)

get_top_messages_per_sender(sorted_user_word_counts, 5)

If your conversation contains some YouTube videos (of course it does 😉), you can search for them and their titles! ▶

In [None]:
from urllib.request import urlopen
from lxml import etree

def get_youtube_titles_from_conversation():
    """Print a ``url -> title`` line for every YouTube link in the conversation.

    Every whitespace-separated word containing ``youtube.com/watch`` is
    fetched over the network and its title is looked up in the returned
    HTML. A failed fetch or parse prints the error and reports ``'Error'``
    as the title.

    Returns:
        int: Number of YouTube links encountered.
    """
    videoCount = 0
    for i in msg_json['messages']:
        content = i.get('content')
        if not isinstance(content, str):
            # Messages without text have nothing to scan.
            continue

        for word in content.split():
            if 'youtube.com/watch' not in word:
                continue

            try:
                # NOTE(review): the URL is taken verbatim from message text;
                # consider restricting it to http(s) before fetching.
                with urlopen(word) as response:
                    youtube = etree.HTML(response.read())
                video_title = youtube.xpath(
                    "//span[@id='eow-title']/@title")
            except Exception as e:
                # Best effort: one bad link must not stop the scan.
                print(e)
                video_title = 'Error'

            print('{} -> {}'.format(word, video_title))
            videoCount += 1

    return videoCount

# Left disabled because fetching every link is slow; uncomment to run.
#get_youtube_titles_from_conversation()

Let's make some graphs (finally)! 🎉

In [None]:
import plotly
import plotly.graph_objs as go

import pandas as pd

def key_fnc(elem):
    """Parse a ``'%Y-%B'`` month label (e.g. ``'2020-November'``) into a datetime usable as a sort key."""
    month_label_format = '%Y-%B'
    return datetime.datetime.strptime(elem, month_label_format)

# Month labels in chronological order on X, message totals on Y.
x_axe = list(messages_per_month_dict)
x_axe_sorted = sorted(x_axe, key=key_fnc)

y_axe = [messages_per_month_dict[month_key] for month_key in x_axe_sorted]


plotly.offline.plot({
    "data": [go.Scatter(x=x_axe_sorted, y=y_axe)],
    "layout": go.Layout(title="Month graph 🎉")
}, auto_open=True)

Per sender message graph 🎈

In [None]:
import plotly
import plotly.graph_objs as go

import pandas as pd

import random

def key_fnc(elem):
    """Turn a ``'%Y-%B'`` month label into a datetime so labels sort chronologically."""
    parsed = datetime.datetime.strptime(elem, '%Y-%B')
    return parsed

x_axe = list(messages_per_month_dict)
x_axe_sorted = sorted(x_axe, key=key_fnc)

# One line per sender, each in a random colour.
data = []

for sender, monthly_counts in user_messages_per_month.items():
    # Months in which this sender wrote nothing are plotted as 0.
    y_sender = [monthly_counts.get(month_key, 0) for month_key in x_axe_sorted]

    graph_color = '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))

    data.append(go.Scatter(
        x=x_axe_sorted,
        y=y_sender,
        name=sender,
        line=dict(color=graph_color),
        opacity=0.8))

plotly.offline.plot({
    "data": data,
    "layout": go.Layout(title="Month graph 🎉")
}, auto_open=True)

In [None]:
def wordPatternCount(pattern):
    """Return ``(count, variants)`` for the counted words that *pattern* matches from their start.

    ``variants`` lists the matching words and ``count`` is the sum of their
    occurrence counts in ``messagesAll``.
    """
    variants = [word for word in messagesAll if re.match(pattern, word)]
    count = sum(messagesAll[word] for word in variants)
    return count, variants

be_pattern = r"\b(be)+$"
wordPatternCount(be_pattern)


In [None]:
# Words starting with one or more "fu…j" groups (any number of "u"s) that are
# followed by a word boundary.
fuj_pattern = r"\b(fu+j)+\b"
count, variants = wordPatternCount(fuj_pattern)

# Total occurrences, then the number of distinct matching words.
print(count, len(variants))
variants

In [None]:
# Words starting with one or more "fe…j" groups (any number of "e"s) that are
# followed by a word boundary.
fej_pattern = r"\b(fe+j)+\b"
count, variants = wordPatternCount(fej_pattern)

# Total occurrences, then the number of distinct matching words.
print(count, len(variants))
variants

In [None]:
# Words starting with "otras" or "otraz" followed by a word boundary.
otrasz_pattern = r"\b(otra(s|z))\b"
count, variants = wordPatternCount(otrasz_pattern)

# Total occurrences, then the number of distinct matching words.
print(count, len(variants))
variants

In [None]:
from collections import defaultdict
from datetime import datetime
import plotly
import plotly.graph_objs as go

import pandas as pd

import random

# Character class covering the Unicode ranges this notebook counts as emoji.
emoji_pattern = re.compile(r"[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F\U0001F780-\U0001F7FF\U0001F800-\U0001F8FF\U0001F900-\U0001F9FF\U0001FA00-\U0001FA6F\U0001FA70-\U0001FAFF\U00002600-\U000026FF\U00002700-\U000027BF\U00002300-\U000023FF\U00002B50]")

# emoji character -> number of occurrences across all text messages.
emojis = defaultdict(int)

for msg in msg_json["messages"]:
    if "content" not in msg:
        continue
    for match in emoji_pattern.findall(msg['content']):
        emojis[match] += 1

print(sorted(emojis.items(), key=lambda kv : kv[1], reverse=True))

print(emojis.keys())
print(sum(emojis.values()))
print(len(emojis))

In [None]:
chronological_messages = sorted(msg_json["messages"], key= lambda x : x["timestamp_ms"])

start_date = datetime.fromtimestamp(float(chronological_messages[0]["timestamp_ms"])/ 1000.0)
end_date = datetime.fromtimestamp(float(chronological_messages[-1]["timestamp_ms"])/ 1000.0)

month = start_date.month
# Calendar-month distance between the first and the last message
# (0 when both fall into the same month).
num_months = (12 - month) + (end_date.year - start_date.year - 1) * 12 + end_date.month

# 'Year-MonthName' -> {emoji -> occurrences in that month}.
emojis_per_month_dict = {}

# Pre-create one bucket for every month from the first to the last message
# (inclusive), so months without emojis still appear. ``m`` is the zero-based
# month offset counted from January of the start year.
for m in range(month - 1, month + num_months):
    mnth = m % 12 + 1
    yr =  start_date.year + m // 12

    emojis_per_month_dict[datetime(yr, mnth, 1, 0, 0, 0, 0).strftime('%Y-%B')] = defaultdict(int)


def get_emojis_per_month():
    """Add every emoji found in a text message to the tally of the month it was sent in."""
    for msg in chronological_messages:
        if "content" not in msg:
            continue

        date = str(datetime.fromtimestamp(float(msg['timestamp_ms'])/ 1000.0).strftime('%Y-%B'))
        emoji_dict = emojis_per_month_dict[date]

        for match in re.findall(emoji_pattern, msg['content']):
            emoji_dict[match] += 1


get_emojis_per_month()



In [None]:
# Display the per-month emoji tallies for inspection.
emojis_per_month_dict

In [None]:
def key_fnc(elem):
    """Parse a ``'%Y-%B'`` month label into a datetime so labels can be ordered chronologically."""
    label_format = '%Y-%B'
    return datetime.strptime(elem, label_format)

x_axe = list(emojis_per_month_dict.keys())
x_axe_sorted = sorted(x_axe, key=key_fnc)

# One line per emoji, each in a random colour.
data = []

for emojo in emojis.keys():
    # Walk the months in the same sorted order used for the X axis so every
    # Y value lines up with its label; months without this emoji count as 0.
    y_emojo = []
    for date in x_axe_sorted:
        if emojo not in emojis_per_month_dict[date]:
            y_emojo.append(0)
        else:
            y_emojo.append(emojis_per_month_dict[date][emojo])

    r = lambda: random.randint(0,255)
    graph_color = '#%02X%02X%02X' % (r(),r(),r())

    emoji_trace = go.Scatter(
                x=x_axe_sorted,
                y=y_emojo,
                name = emojo,
                line = dict(color = graph_color),
                opacity = 0.8)
    data.append(emoji_trace)

plotly.offline.plot({
    "data": data,
    "layout": go.Layout(title="EMOJOOOO graph 🎉")
}, auto_open=True)

# "layout": go.Layout(title="EMOJOOOO graph 🎉",yaxis=dict(
#         title="Emoji",
#         type="log",
#     ))

In [None]:
# Scratch check that a '%Y-%B' label parses into a datetime.
# NOTE(review): `time` does not appear to be used by any other visible cell.
time = datetime.strptime("2020-November",'%Y-%B')

In [None]:
# sender name -> total number of photos sent (one message may carry several).
photos_by_sender = defaultdict(int)

for photo_msg in photos:
    sender_name = photo_msg["sender_name"]
    photos_by_sender[sender_name] = photos_by_sender[sender_name] + len(photo_msg["photos"])

photos_by_sender


In [None]:
# sender name -> total number of videos sent (one message may carry several).
videos_by_sender = defaultdict(int)

for video_msg in videos:
    sender_name = video_msg["sender_name"]
    videos_by_sender[sender_name] = videos_by_sender[sender_name] + len(video_msg["videos"])

videos_by_sender

In [None]:
# Dump the raw timestamp of every message, in the order stored in the file.
for message in msg_json["messages"]:
    raw_timestamp = message["timestamp_ms"]
    print(raw_timestamp)

In [None]:
# All messages ordered by ascending "timestamp_ms".
chronological_messages = sorted(msg_json["messages"], key= lambda x : x["timestamp_ms"])


In [None]:
# Conversations: runs of messages in which no gap between two consecutive
# messages reaches the threshold below.
convos = []

#8h             s    m    h    
treshold_ms = float(1000 * 60 * 60 * 8)


def _close_convo(convo, end_index):
    """Fill in the closing fields of *convo*, which ends at *end_index*, and append it to ``convos``."""
    convo["last_message"] = chronological_messages[end_index]
    convo["end_index"] = end_index
    convo["timespan_ms"] = float(convo["last_message"]['timestamp_ms']) - float(convo["first_message"]['timestamp_ms'])
    convo["num_messages"] = convo['end_index'] - convo['start_index'] + 1
    convos.append(convo)


# Guard against an empty export, where there is no first message to start from.
if chronological_messages:
    last_message_timestamp = float(chronological_messages[0]['timestamp_ms'])
    convo = {"first_message" : chronological_messages[0], "start_index" : 0}
    for index, message in enumerate(chronological_messages):
        timestamp = float(message['timestamp_ms'])

        if (timestamp - last_message_timestamp) >= treshold_ms:
            # The gap is long enough: the previous message ended a conversation
            # and this message starts the next one.
            _close_convo(convo, index - 1)
            convo = {"first_message" : message, "start_index" : index}

        last_message_timestamp = timestamp

    # The loop only closes a conversation when the following one starts, so
    # the final conversation has to be closed explicitly.
    _close_convo(convo, len(chronological_messages) - 1)



In [None]:
print(len(convos))

if convos:
    # The conversation with the largest time between its first and last message.
    longest_convo = max(convos, key=lambda cnv : cnv['timespan_ms'])
    # Show its duration in hours instead of computing and discarding it.
    print(longest_convo["timespan_ms"] / (1000.0 * 60 * 60))
    start = longest_convo["start_index"]
    end = longest_convo["end_index"]

    for i in range(start, end + 1):
        print(chronological_messages[i])
else:
    # max() would raise ValueError on an empty list.
    print('No conversations found.')

In [None]:
import shutil
import csv

# Copy every locally stored photo into FDS_media/<Year-MonthName>/ next to the
# merged file and write a data.csv index (file, sender, sent) for each month.
ordered_photos = sorted(photos, key= lambda photo : photo['timestamp_ms'])

photos_per_month_dict = defaultdict(list)

for photo in ordered_photos:
    date = str(datetime.fromtimestamp(float(photo['timestamp_ms'])/ 1000.0).strftime('%Y-%B'))

    photos_per_month_dict[date].append(photo)

conversation_dir = os.path.split(result_name)[0]

for month, messages in photos_per_month_dict.items():
    month_directory = os.path.join(conversation_dir,"FDS_media", month)
    os.makedirs(month_directory, exist_ok=True)

    # newline="" is what the csv module expects for files it writes to;
    # utf8 matches the merged JSON and keeps non-ASCII sender names intact.
    # The context manager closes the file even if a copy fails unexpectedly.
    with open(os.path.join(month_directory, "data.csv"), "w", newline="", encoding="utf8") as csv_file:
        writer = csv.writer(csv_file, delimiter=",")
        writer.writerow(['file', 'sender', 'sent'])

        for msg in messages:
            date = str(datetime.fromtimestamp(float(msg['timestamp_ms'])/ 1000.0).strftime("%Y-%m-%d %H:%M:%S"))

            for pht in msg["photos"]:
                uri = str(pht['uri'])
                # Remote photos are not part of the export, so there is nothing to copy.
                if uri.startswith(("http://", "https://")):
                    continue

                file_name = os.path.split(pht['uri'])[1]

                print(pht)
                photo_url = os.path.join(conversation_dir, "photos", file_name)
                try:
                    shutil.copy2(photo_url, os.path.join(month_directory, file_name))
                except FileNotFoundError:
                    # Keep a row for the missing file so the index stays complete.
                    file_name = "[NOT FOUND] " + file_name

                writer.writerow((file_name, msg['sender_name'], date))

    directory_path = None

In [None]:
import shutil
import csv

# Copy every locally stored photo and video into FDS_media/<Year-MonthName>/
# next to the merged file and write a data.csv index (file, sender, sent) for
# each month.
media = []

for msg in msg_json['messages']:
    if "photos" in msg or "videos" in msg:
        media.append(msg)

ordered_media = sorted(media, key= lambda med : med['timestamp_ms'])

media_per_month_dict = defaultdict(list)

# Group the chronologically ordered list, so each month's rows are written
# oldest first.
for medium in ordered_media:
    date = str(datetime.fromtimestamp(float(medium['timestamp_ms'])/ 1000.0).strftime('%Y-%B'))

    media_per_month_dict[date].append(medium)

conversation_dir = os.path.split(result_name)[0]

for month, messages in media_per_month_dict.items():
    month_directory = os.path.join(conversation_dir,"FDS_media", month)
    os.makedirs(month_directory, exist_ok=True)

    # newline="" is what the csv module expects for files it writes to;
    # utf8 matches the merged JSON and keeps non-ASCII sender names intact.
    # The context manager closes the file even if a copy fails unexpectedly.
    with open(os.path.join(month_directory, "data.csv"), "w", newline="", encoding="utf8") as csv_file:
        writer = csv.writer(csv_file, delimiter=",")
        writer.writerow(['file', 'sender', 'sent'])

        for msg in messages:
            # Every message in `media` has "photos" or "videos"; photos take
            # precedence when both are present.
            is_photo_message = 'photos' in msg
            meds = msg["photos"] if is_photo_message else msg['videos']
            source_subdir = "photos" if is_photo_message else "videos"

            date = str(datetime.fromtimestamp(float(msg['timestamp_ms'])/ 1000.0).strftime("%Y-%m-%d %H:%M:%S"))

            for med in meds:
                uri = str(med['uri'])
                # Remote media are not part of the export, so there is nothing to copy.
                if uri.startswith(("http://", "https://")):
                    continue

                file_name = os.path.split(med['uri'])[1]

                print(med)
                photo_url = os.path.join(conversation_dir, source_subdir, file_name)
                try:
                    shutil.copy2(photo_url, os.path.join(month_directory, file_name))
                except FileNotFoundError:
                    # Keep a row for the missing file so the index stays complete.
                    file_name = "[NOT FOUND] " + file_name

                writer.writerow((file_name, msg['sender_name'], date))

    directory_path = None