# Facebook data 📈

First of all, we need to fix the encoding of the message.json file. To do this, we encode each message as **latin-1** and decode it as **utf8**.

In [None]:
from PyQt5 import QtGui
from PyQt5 import QtGui, QtWidgets

import json
import os
from pathlib import Path 

def open_file():
    """Show a file-open dialog and return the path the user picked.

    Returns:
        The first element of the tuple returned by
        ``QFileDialog.getOpenFileName`` (the selected path). Qt documents
        that this is an empty string when the dialog is cancelled.
    """
    # Qt allows a single QApplication per process, so reuse the existing one
    # when this cell is run again. The constructor expects a list of strings
    # (argv); an empty list is enough here. The reference is kept in a local
    # so the application object stays alive while the dialog is open.
    app = QtWidgets.QApplication.instance() or QtWidgets.QApplication([])
    fname = QtWidgets.QFileDialog.getOpenFileName(None, "Select a file...", '.', filter="All files (*)")
    return fname[0]
    
fname = open_file()
if not fname:
    # An empty path means nothing was selected; fail with a clear message
    # instead of trying to open ''.
    raise FileNotFoundError('No file selected')

filename = '{}_fixed.json'.format(os.path.splitext(fname)[0])


def _fix_mojibake(text):
    """Re-encode text as latin-1 and decode it as utf8.

    Returns the text unchanged when it cannot be converted this way
    (for example because it is already correctly decoded).
    """
    try:
        return text.encode('latin-1').decode('utf8')
    except UnicodeError:
        return text


print('Working ...')

# The context manager closes the input file as soon as it has been parsed.
with open(fname, 'r', encoding='utf8') as file:  # or change fname to 'message.json'
    jsonToFix = json.load(file)

for message in jsonToFix['messages']:
    # Each field is fixed independently, so a message that lacks one of them
    # (or has a field that cannot be converted) still gets the other repaired.
    for field in ('sender_name', 'content'):
        value = message.get(field)
        if isinstance(value, str):
            message[field] = _fix_mojibake(value)

jsonToFix['title'] = _fix_mojibake(jsonToFix['title'])

with open(filename, 'w', encoding='utf8') as fout:
    json.dump(jsonToFix, fout, indent=2, ensure_ascii=False)

print('File created at {}'.format(filename))

Now we can start making some cool stats 💻: the top **n words** used in the conversation 📖

In [None]:
import re
from itertools import islice


from collections import Counter

# The context manager closes the file once the conversation is loaded.
with open(filename, 'r', encoding='utf8') as file:
    msg_json = json.load(file)

# Count every word

messagesAll = Counter()

for message in msg_json['messages']:
    content = message.get('content')
    # Skip messages that carry no text instead of swallowing every exception.
    if not isinstance(content, str):
        continue
    # Words are split on whitespace only, so punctuation stays attached;
    # re.findall(r"[\w']+", content) would be the punctuation-stripping
    # alternative.
    messagesAll.update(content.split())

# List of (word, count) pairs, most frequent first. The sort is stable, so
# words with equal counts keep their first-seen order.
sortedAll = sorted(messagesAll.items(), key=lambda kv: kv[1], reverse=True)

def first_n_from_iterable(n, iterable):
    """Collect at most the first n items of iterable into a new list."""
    head = islice(iterable, n)
    return [*head]

def get_top_messages(dict_msgs, n=10):
    """Print a 'Top n' heading followed by the first n entries of dict_msgs.

    Nothing is returned; each entry is printed on its own line.
    """
    heading = '\nTop {} :\n'.format(n)
    print(heading)

    leading_entries = first_n_from_iterable(n, dict_msgs)
    for entry in leading_entries:
        print(entry)
        
# Print the first five (word, count) pairs of the count-sorted word list.
get_top_messages(sortedAll, 5)

#TODO Top messages per month / year 📆 and create dictionary with month and total messages.

In [None]:
import datetime

# Month key ('%Y-%B' formatted string) -> total number of messages in that month.
messages_per_month_dict = {}
# Sender name -> {month key -> number of messages by that sender in that month}.
user_messages_per_month = {}

def get_messages_per_month():
    """Tally messages per month, overall and per sender.

    Reads every entry of ``msg_json['messages']`` and updates the module-level
    ``messages_per_month_dict`` and ``user_messages_per_month`` in place.
    Months are keyed by the message timestamp formatted as '%Y-%B'.
    """
    for message in msg_json['messages']:
        moment = datetime.datetime.fromtimestamp(float(message['timestamp']))
        month_key = str(moment.strftime('%Y-%B'))
        author = message['sender_name']

        # Overall tally for the month.
        messages_per_month_dict[month_key] = messages_per_month_dict.get(month_key, 0) + 1

        # Tally for this sender; the inner dict is created on first sight.
        author_months = user_messages_per_month.setdefault(author, {})
        author_months[month_key] = author_months.get(month_key, 0) + 1
        
        
# Fill the two month dictionaries from the loaded conversation.
get_messages_per_month()

Top 5 per sender #TODO group chat 💥 

In [None]:
# Sender name -> {word -> number of times that sender used it}.
user_word_counts = {}

for message in msg_json['messages']:
    content = message.get('content')
    sender = message.get('sender_name')
    # Skip messages without text or without a sender instead of swallowing
    # every exception.
    if not isinstance(content, str) or sender is None:
        continue

    sender_counts = user_word_counts.setdefault(sender, {})
    # Words are split on whitespace only; re.findall(r"[\w']+", content)
    # would be the punctuation-stripping alternative.
    for word in content.split():
        sender_counts[word] = sender_counts.get(word, 0) + 1

# Sender name -> list of (word, count) pairs, most frequent first.
sorted_user_word_counts = {
    user: sorted(counts.items(), key=lambda kv: kv[1], reverse=True)
    for user, counts in user_word_counts.items()
}
    
def get_top_messages_per_sender(dicts, n=10):
    """Print, for every sender in dicts, a heading and that sender's first n entries.

    Nothing is returned; each entry is printed on its own line.
    """
    for sender, ranked_entries in dicts.items():
        heading = '\nTop {} from : {}'.format(n, sender)
        print(heading)
        for entry in first_n_from_iterable(n, ranked_entries):
            print(entry)

# Print the first five (word, count) pairs for each sender.
get_top_messages_per_sender(sorted_user_word_counts, 5)

If your conversation contains some YouTube videos (of course it does 😉), you can search for them and their titles! ▶

In [None]:
from urllib.request import urlopen
from lxml import etree

def get_youtube_titles_from_conversation():
    """Print the page title of every YouTube link found in the conversation.

    Every whitespace-separated token of a message's ``content`` that contains
    'youtube.com/watch' is fetched over HTTP and printed as 'link -> title'.
    This issues one request per link, so it is slow on long conversations.

    Returns:
        The number of YouTube links that were processed (including those
        whose title lookup failed).
    """
    videoCount = 0
    for message in msg_json['messages']:
        content = message.get('content')
        # Skip messages without text instead of swallowing every exception.
        if not isinstance(content, str):
            continue

        for word in content.split():
            if 'youtube.com/watch' not in word:
                continue

            try:
                # Close the response promptly, and use a timeout so a dead
                # link cannot block the whole run.
                with urlopen(word, timeout=10) as response:
                    youtube = etree.HTML(response.read())
                # NOTE(review): depends on the page exposing a span with
                # id 'eow-title' -- confirm this still matches YouTube's markup.
                video_title = youtube.xpath(
                    "//span[@id='eow-title']/@title")
            except Exception as e:
                # Best effort: report the failure for this link and carry on.
                print(e)
                video_title = 'Error'

            print('{} -> {}'.format(word, video_title))
            videoCount += 1

    return videoCount
        
#get_youtube_titles_from_conversation()

Let's make some graphs (finally)! 🎉 

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

import pandas as pd

def key_fnc(elem):
    """Parse a '%Y-%B' month key into a datetime, for use as a sort key."""
    month_format = '%Y-%B'
    parsed = datetime.datetime.strptime(elem, month_format)
    return parsed

def add_month(date):
    """Return the first day of the month following date.

    Any time-of-day fields on date are carried over unchanged.
    """
    month_start = date.replace(day=1)
    # 32 days past the 1st always lands inside the next month, whatever the
    # current month's length; snapping back to day 1 gives its first day.
    inside_next_month = month_start + datetime.timedelta(days=32)
    return inside_next_month.replace(day=1)

x_axe = list(messages_per_month_dict.keys())
months_with_messages = sorted(x_axe, key=key_fnc)

# Build a gap-free month axis from the earliest to the latest month that has
# messages, so months without any message show up as zero.
x_axe_sorted = []
if months_with_messages:  # nothing to walk when there are no messages
    date = key_fnc(months_with_messages[0])
    end = key_fnc(months_with_messages[-1])

    # '<=' (not '!=') so the final month is part of the axis; with '!=' the
    # last month was dropped and a single-month conversation plotted nothing.
    while date <= end:
        x_axe_sorted.append(date.strftime('%Y-%B'))
        date = add_month(date)

# Message total for every month on the axis, 0 for months without messages.
y_axe = [messages_per_month_dict.get(key, 0) for key in x_axe_sorted]

plotly.offline.plot({
    "data": [go.Scatter(x=x_axe_sorted, y=y_axe)],
    "layout": go.Layout(title="Month graph 🎉")
}, auto_open=True)

Per sender message graph 🎈

In [None]:
import plotly
import plotly.plotly as py
import plotly.graph_objs as go

import pandas as pd

import random

def key_fnc(elem):
    """Parse a '%Y-%B' month key into a datetime, for use as a sort key."""
    month_format = '%Y-%B'
    parsed = datetime.datetime.strptime(elem, month_format)
    return parsed

x_axe = list(messages_per_month_dict.keys())
months_with_messages = sorted(x_axe, key=key_fnc)

# Build a gap-free month axis from the earliest to the latest month that has
# messages, so months without any message show up as zero.
x_axe_sorted = []
if months_with_messages:  # nothing to walk when there are no messages
    date = key_fnc(months_with_messages[0])
    end = key_fnc(months_with_messages[-1])

    # '<=' (not '!=') so the final month is part of the axis; with '!=' the
    # last month was dropped and a single-month conversation plotted nothing.
    while date <= end:
        x_axe_sorted.append(date.strftime('%Y-%B'))
        date = add_month(date)

data = []

# One line per sender, each drawn in its own random colour.
for sender, monthly_counts in user_messages_per_month.items():
    # Message count for every month on the axis, 0 where the sender was silent.
    y_sender = [monthly_counts.get(month, 0) for month in x_axe_sorted]

    # Random '#RRGGBB' colour built from three 0-255 channel values.
    graph_color = '#%02X%02X%02X' % tuple(random.randint(0, 255) for _ in range(3))

    sender_msg = go.Scatter(
                x=x_axe_sorted,
                y=y_sender,
                name = sender,
                line = dict(color = graph_color),
                opacity = 0.8)
    data.append(sender_msg)

plotly.offline.plot({
    "data": data,
    "layout": go.Layout(title="Month graph 🎉")
}, auto_open=True)