# Facebook Message Analyzer

<b> Current Features For a Given Chat: </b>
<ul> 
    <li> Number of Messages Sent </li> 
    <li> Messages Sent Over Time </li> 
    <li> Average Word Count </li>
</ul>

In [11]:
import os
import json
import numpy as np
import pylab as pl
import datetime

CURRENT_DIRECTORY = os.getcwd()
NUMBER_TO_ANALYZE = 5000
MESSAGE_THRESHOLD = 100

In [47]:
def get_json_data(chat):
    try:
        json_location = CURRENT_DIRECTORY + "/messages/" + chat + "/message.json"
        with open(json_location) as json_file:
            json_data = json.load(json_file)
            return json_data
    except IOError:
        pass # some things the directory aren't messages (DS_Store, stickers_used, etc.)

In [48]:
chats = os.listdir(CURRENT_DIRECTORY + "/messages/")[:NUMBER_TO_ANALYZE]
sorted_chats = []
final_data_messages = {}
final_data_times = {}
final_data_words = {}
invalid_message_count = 0

In [49]:
print('Analyzing ' + str(min(NUMBER_TO_ANALYZE, len(chats))) + ' chats...')

for chat in chats:
    url = chat + '/message.json'
    json_data = get_json_data(chat)
    print(chat)
    if json_data != None:
        messages = json_data["messages"]
        if len(messages) >= MESSAGE_THRESHOLD:
            sorted_chats.append((len(messages), chat, messages))

sorted_chats.sort(reverse=True)

print('Finished processing chats...')

Analyzing 370 chats...
melodywang_79ffcdb74d
sydneypenny_b360a07eba
larrydang_0374805458
willboffa_30e49500df
jackrogers_49d1ae6e6b
mocktrial11_fe6e16e30d
faithonyechere_c9abc12cee
minaoates_dce209e5eb
ryanturner_db575761da
FrancescaLuluandOwen_09f281b2da
briancowhey_3613fcdb5b
tramnguyen_7bf4f7df7a
MockTrialquad_51801f8c0b
jackswanson_0415bb4f2b
isabelleemerson_1d329234f2
lucyrickerich_0e319405d4
josiecarter_8ff14484ea
justintseng_5464e1294e
maxwellnoahwebsterand6others_0510b65b1b
chloelevine_a5ca070142
philipbrittan_c65347722d
ruthjaensubhakij_c36d01b9ce
macrinawang_9f287df295
hannahjackson_e80ba4a339
nourkhachemoune_7429df1651
willaking_77f2d00eff
zehanzhou_ca48672541
alyssakim_c3059f6476
joshuabeirich_10b1d2f9b8
MatthewYuan_2a37f15efe
MabrySansbury_7233e445ba
juliuscaesardebate_86c3bad97f
maddiemaier_68a10adbc3
johnetchart_6de6af0d86
rachellindsay_51844d9de0
clarakim_e81c03c326
JustinWilcox_80056d243b
harrisonabram_c4f6a42e98
victoriaroseknight_16851711a8
emmasegel_1d017f38be
riamo

In [51]:
for i, (messages, chat, messages) in enumerate(sorted_chats):
    number_messages = {}
    person_to_times = {}
    number_words = {}

    print(str(i) + " - " + str(len(messages)) + " messages - " + str(chat))
    
    print(messages)
    
    for message in messages:
        try:
            name = message["sender_name"]
            time = message["timestamp_ms"]
            message_content = message["content"]

            number_messages[name] = number_messages.get(name, 0)
            number_messages[name] += 1

            person_to_times[name] = person_to_times.get(name, [])
            person_to_times[name].append(datetime.datetime.fromtimestamp(time/1000.0))

            number_words[name] = number_words.get(name, [])
            number_words[name].append(len(message_content.split()))
        except KeyError:
            # happens for special cases like users who deactivated, unfriended, blocked
            invalid_message_count += 1

    final_data_messages[i] = number_messages
    final_data_times[i] = person_to_times
    final_data_words[i] = number_words

print('Found ' + str(invalid_message_count) + ' invalid messages...')
print('Found ' + str(len(sorted_chats)) + ' chats with ' + str(MESSAGE_THRESHOLD) + ' messages or more')

0 - 1803 messages - PhilipLee_67ba44e6a3
1 - 1656 messages - JaccsBigMistake_2f59b58406
[{'sender_name': 'Ife Omidiran', 'timestamp_ms': 1531090134240, 'content': 'Ife Omidiran left the group.', 'type': 'Unsubscribe', 'users': [{'name': 'Ife Omidiran'}]}, {'sender_name': 'Riley Hoveland', 'timestamp_ms': 1530933881481, 'content': 'Same can someone add me???', 'type': 'Generic'}, {'sender_name': 'Fahima Begum', 'timestamp_ms': 1530933870589, 'content': "Oh oops I'm not in the chat", 'type': 'Generic'}, {'sender_name': 'Kendrick Foster', 'timestamp_ms': 1530933646842, 'content': 'We havenâ\x80\x99t video chatted in at least a week though', 'type': 'Generic'}, {'sender_name': 'Fahima Begum', 'timestamp_ms': 1530932931783, 'content': 'I think the video chat is more active', 'type': 'Generic'}, {'sender_name': 'Mohammed Mutaher', 'timestamp_ms': 1530932811126, 'content': 'Is this chat dead?', 'reactions': [{'reaction': 'ð\x9f\x98¢', 'actor': 'Rachel Reynolds'}], 'type': 'Generic'}, {'sender

In [None]:
def plot_num_messages(chat_number):
    plotted_data = final_data_messages[chat_number]
    X = np.arange(len(plotted_data))
    pl.bar(X, list(plotted_data.values()), align='center', width=0.5, color = 'r', bottom = 0.3)
    pl.xticks(X, plotted_data.keys(), rotation = 90)
    pl.title('Number of Messages Sent')
    pl.tight_layout()
    pl.show()
    
def plot_histogram_time(chat_number):
    person_to_times = final_data_times[chat_number]
    pl.xlabel('Time')
    pl.ylabel('Number of Messages')
    pl.title('# of Messages Over Time')
    colors = ['b', 'r', 'c', 'm', 'y', 'k', 'w', 'g']
    for i , person in enumerate(person_to_times):
        plotted_data = person_to_times[person]
        pl.hist(plotted_data, 100, alpha=0.3, label=person, facecolor=colors[i % len(colors)])
    pl.legend()
    pl.xticks(rotation=90)
    pl.tight_layout()
    pl.show()

def plot_histogram_words(chat_number):
    temp = {}
    for person in final_data_words[chat_number]:
        temp[person] = np.average(final_data_words[chat_number][person])
    plotted_data = temp
    X = np.arange(len(plotted_data))
    pl.bar(X, list(plotted_data.values()), align='center', width=0.5, color = 'r', bottom = 0.3)
    pl.xticks(X, plotted_data.keys(), rotation = 90)
    pl.title('Average Word Count')
    pl.tight_layout()
    pl.show()
    
def plot(chat_number):
    plot_num_messages(chat_number)
    plot_histogram_time(chat_number)
    plot_histogram_words(chat_number)

In [None]:
plot(2)