In [None]:
import copy
import datetime
import json
import os
import unicodedata

import emoji
import matplotlib.pyplot as plt
import numpy as np
from pylab import rcParams

In [None]:
# Notebook-wide matplotlib defaults: figure size and base font size for every plot below.
rcParams['figure.figsize'] = 16,10
rcParams['font.size'] = 20

In [None]:
class Reaction:
    """One reaction attached to a message.

    Attributes are copied from the ``"actor"`` and ``"reaction"`` entries of
    the dictionary passed to the constructor.
    """

    def __init__ (self, reaction_dict):
        # Both keys are required; a missing one raises KeyError.
        self.actor, self.reaction = reaction_dict["actor"], reaction_dict["reaction"]

In [None]:
class Message:
    """A single message parsed from one entry of a thread's ``"messages"`` list.

    Attributes:
        text: the ``"content"`` value when present; otherwise the placeholder
            ``"photo"`` or ``"audio"`` when the entry has a ``"photos"`` or
            ``"audio_files"`` key; otherwise an empty string.
        senderName: value of ``"sender_name"``.
        time: ``datetime`` built from ``"timestamp_ms"`` (whole seconds only,
            converted with ``datetime.fromtimestamp``).
        reactions: list of ``Reaction`` objects, empty when the entry has no
            ``"reactions"`` key.
    """

    def __init__ (self, message_dict):
        # Real text wins; attachments only supply a placeholder when there is no text.
        body = ""
        if "content" in message_dict:
            body = message_dict["content"]
        else:
            for field, placeholder in (("photos", "photo"), ("audio_files", "audio")):
                if field in message_dict:
                    body = placeholder
                    break
        self.text = body

        self.senderName = message_dict["sender_name"]

        # Millisecond precision is dropped on purpose: integer-divide down to seconds.
        seconds = message_dict["timestamp_ms"] // 1000
        self.time = datetime.datetime.fromtimestamp(seconds)

        self.reactions = [Reaction(entry) for entry in message_dict.get("reactions", [])]

In [None]:
class Thread:
    """A conversation: its title, participant names and time-ordered messages.

    Attributes:
        title: value of ``allData["title"]``.
        participants: the ``"name"`` of every entry in ``allData["participants"]``.
        messages: ``Message`` objects sorted by ascending ``time``.
        messageCount: number of messages in the thread.
    """

    def __init__ (self, allData):
        self.title = allData["title"]
        self.participants = [entry["name"] for entry in allData["participants"]]

        parsed = [Message(raw) for raw in allData["messages"]]
        self.messageCount = len(parsed)
        # Stable sort, so messages with equal timestamps keep their input order.
        parsed.sort(key=lambda message: message.time)
        self.messages = parsed

    def messagesUntil (self, date):
        """Return how many messages were sent strictly before ``date``."""
        return sum(1 for message in self.messages if message.time < date)

In [None]:
def readExportJson(path):
    """Read one ``message_N.json`` export file and return the parsed object.

    The file text is passed through the same re-decoding chain the notebook has
    always used (``raw_unicode_escape`` -> ``latin1`` -> ``utf8``), which turns
    ``\\u00XX`` escape pairs holding UTF-8 bytes back into real characters.
    ``strict=False`` is required because that step can leave raw control
    characters (e.g. newlines) inside JSON strings.

    The file is closed before parsing starts, even when reading fails.
    """
    with open(path, 'r', encoding='utf8') as exportFile:
        rawText = exportFile.read()
    fixedText = rawText.encode().decode('raw_unicode_escape').encode('latin1').decode('utf8')
    return json.loads(fixedText, strict=False)


def loadThread(threadDir):
    """Build a ``Thread`` from ``message_1.json`` plus any ``message_2.json``, ``message_3.json``, ...

    Parts are read in increasing order until the next numbered file is missing;
    their ``"messages"`` lists are appended to the first part's list.
    """
    data = readExportJson(os.path.join(threadDir, "message_1.json"))
    part = 2
    while True:
        partPath = os.path.join(threadDir, "message_%d.json" % part)
        if not os.path.isfile(partPath):
            break
        data["messages"].extend(readExportJson(partPath)["messages"])
        part += 1
    return Thread(data)


base = os.path.join(os.getcwd(), "messages", "inbox")
# os.walk yields nothing for a missing directory; fall back to "no sub-folders"
# instead of letting next() raise StopIteration.
dirs = next(os.walk(base), (base, [], []))[1]
if not dirs:
    print("No conversation folders found under %s" % base)

threads = []
for folderName in dirs:
    try:
        threads.append(loadThread(os.path.join(base, folderName)))
    except Exception as error:
        # Best effort, as before: a broken or incomplete folder is skipped,
        # but the reason is now reported instead of being swallowed.
        print("Skipping %s: %s: %s" % (folderName, type(error).__name__, error))

# Largest conversations first; later cells rely on this ordering (threads[0] is the biggest).
threads = sorted(threads, key=lambda thread: thread.messageCount, reverse=True)

In [None]:
def printFirsts (n):
    """Print the ``n`` largest threads and plot how messages are spread over threads.

    Relies on the module-level ``threads`` list being sorted by descending
    ``messageCount``.

    Output:
        * a ranked list of at most ``n`` threads (fewer when fewer are loaded),
        * a log-scale plot of message counts for the leading threads that have
          at least 20 messages,
        * a plot of the cumulative share of all messages covered by the first
          50 threads.
    """
    if not threads:
        print("No threads loaded")
        return

    # Slice instead of indexing so asking for more threads than exist cannot fail,
    # and print each thread's own count so short threads are listed too.
    for index, thread in enumerate(threads[:n]):
        print("%d. %s: %d messages" % (index+1, thread.title, thread.messageCount))

    # Threads are sorted by size, so stop at the first one below the 20-message cut-off.
    numbers=[]
    for thread in threads:
        if thread.messageCount<20:
            break
        numbers.append(thread.messageCount)

    fig, ax = plt.subplots()
    ax.set_yscale("log")
    ax.grid(True)
    ax.plot(range(1,len(numbers)+1), numbers, "r")
    ax.set_xlabel("Index of thread")
    ax.set_ylabel("Number of messages")
    ax.set_title("Number of messages in each thread")
    plt.show()

    # Running total of messages over the size-ordered threads.
    cumulative = np.cumsum([thread.messageCount for thread in threads])
    total = cumulative[-1]
    leading = cumulative[:50]
    # Guard against threads that are all empty (total of zero).
    shares = leading / total if total else np.zeros(len(leading))

    fig, ax = plt.subplots()
    ax.set_yscale("linear")
    ax.grid(True)
    ax.plot(range(1,len(shares)+1), shares, ".-r")
    ax.set_xlabel("Index of thread")
    ax.set_ylabel("Portion of messages up to that index")
    ax.set_title("Portion of total messages represented up to each index")
    plt.show()
    

In [None]:
def findExtremeDates(threadList, startDate=datetime.datetime(1,1,1,0,0,0)):
    """Return ``(earliest, latest)`` message times over all threads in ``threadList``.

    Each thread's first and last message are used, so the threads' message
    lists are expected to be time-ordered. The earliest time is clamped so it
    is never before ``startDate``.

    Raises IndexError when ``threadList`` is empty or a thread has no messages.
    """
    head = threadList[0]
    earliest, latest = head.messages[0].time, head.messages[-1].time

    for thread in threadList[1:]:
        opening, closing = thread.messages[0].time, thread.messages[-1].time
        if opening < earliest:
            earliest = opening
        if closing > latest:
            latest = closing

    if startDate > earliest:
        earliest = startDate
    return earliest, latest

In [None]:
def messagesTime (threadList, startDate=datetime.datetime(1,1,1,0,0,0)):
    """Plot the cumulative number of messages over time, one line per thread.

    The time span between the first and last message (see ``findExtremeDates``)
    is divided into ``N`` buckets; every message is counted in its bucket and
    the counts are accumulated. Messages sent before ``startDate`` are ignored.

    Args:
        threadList: threads to plot; each contributes one labelled line.
        startDate: lower bound of the plotted interval.
    """
    N = 1000
    scale = N - 1   # index of the last bucket; also used to place the date ticks
    firstDate, lastDate = findExtremeDates(threadList, startDate=startDate)
    totalDelta = lastDate - firstDate
    if totalDelta <= datetime.timedelta(0):
        # All messages share one timestamp (or startDate is past the last message):
        # use a token span so the divisions below cannot hit zero.
        totalDelta = datetime.timedelta(seconds=1)
    fig, ax = plt.subplots()

    for thread in threadList:
        buckets = np.zeros(N)
        for message in thread.messages:
            if message.time<firstDate:
                continue
            buckets[int((message.time - firstDate) / totalDelta * scale)] += 1
        buckets = np.cumsum(buckets)
        ax.plot(buckets, label=thread.title)

    ax.legend()

    # Tick spacing in days, coarser for longer spans.
    spanDays = totalDelta.days
    if(lastDate.year-firstDate.year>4):
        delta=365
    elif(spanDays>30*30):
        delta=182
    elif(spanDays>18*30):
        delta=92
    elif(spanDays>10*30):
        delta=61
    else:
        delta=30
    # At most 100 ticks; stop once a tick would lie more than half a step past the end.
    dateNames=[firstDate+datetime.timedelta(days=n*delta) for n in range(100) if firstDate+datetime.timedelta(days=(n-0.5)*delta)<lastDate]
    # Same scale as the bucket indices above, so ticks line up with the curves.
    ax.set_xticks([(date-firstDate)/totalDelta*scale for date in dateNames],[str(date.year)+"-"+str(date.month).zfill(2) for date in dateNames])

    plt.xticks(fontsize=15, rotation = 90)
    plt.xlabel("Date")
    plt.ylabel("Number of messages")
    plt.title("Number of messages in time")
    plt.grid(True)

In [None]:
def firstAndLastElements(listIn, topN, bottomN):
    """Return the first ``topN`` and last ``bottomN`` elements of ``listIn``.

    Args:
        listIn: sequence to trim (anything supporting slicing and ``+``).
        topN: number of leading elements to keep, or ``None`` for none.
        bottomN: number of trailing elements to keep, or ``None`` for none.

    Returns:
        ``listIn`` itself when both limits are ``None`` or when together they
        cover the whole sequence; otherwise the leading part followed by the
        trailing part.
    """
    if topN is None and bottomN is None:
        return listIn
    if topN is not None and bottomN is not None and topN+bottomN>=len(listIn):
        return listIn

    empty = listIn[:0]
    head = empty if topN is None else listIn[:topN]
    # listIn[-0:] is the whole sequence, so a tail of 0 (or None) must be handled explicitly.
    tail = listIn[-bottomN:] if bottomN else empty
    return head + tail

In [None]:
def senderStats (threadList, topN=None, bottomN=None):
    """Print per-sender statistics aggregated over every message in ``threadList``.

    For each sender the function counts messages, photo and audio placeholders
    (messages whose text is exactly ``"photo"`` / ``"audio"``), emojis, words
    and letters, then derives average word and message length. One ranked list
    is printed per statistic.

    Letters are the non-whitespace characters of a message minus its emojis, so
    an empty message contributes 0 letters and repeated whitespace is not counted.

    Args:
        threadList: threads whose messages are analysed.
        topN: keep only the first ``topN`` senders of each ranking (``None`` = no limit).
        bottomN: also keep the last ``bottomN`` senders of each ranking.
    """
    statsPerSender = {}

    for thread in threadList:
        for message in thread.messages:
            emojis = emoji.emoji_count(message.text)
            tokens = message.text.split()
            words = len(tokens)
            letters = sum(len(token) for token in tokens) - emojis

            stats = statsPerSender.setdefault(message.senderName, {
                'letter_count': 0, 'emoji_count': 0, 'word_count': 0,
                'message_count': 0, 'photo_count': 0, 'audio_count': 0})
            stats['letter_count'] += letters
            stats['emoji_count'] += emojis
            stats['word_count'] += words
            stats['message_count'] += 1
            stats['photo_count'] += int(message.text=="photo")
            stats['audio_count'] += int(message.text=="audio")

    for stats in statsPerSender.values():
        # max(..., 1) avoids dividing by zero for senders without any words.
        stats['avg_word_len'] = round(stats['letter_count'] / max(stats['word_count'],1), 2)
        stats['avg_message_len'] = round(stats['letter_count'] / max(stats['message_count'],1), 2)

    # (heading, statistic key, optional unit printed after the value)
    reports = [
        ("Number of messages", 'message_count', None),
        ("\nNumber of photos", 'photo_count', None),
        ("\nNumber of audio files", 'audio_count', None),
        ("\nNumber of characters", 'letter_count', None),
        ("\nEmojik száma", 'emoji_count', None),
        ("\nNumber of words", 'word_count', None),
        ("\nAverage length of words", 'avg_word_len', " characters"),
        ("\nAverage length of messages in characters", 'avg_message_len', None),
    ]

    for heading, field, unit in reports:
        print(heading)
        ranking = sorted(statsPerSender.keys(), key=lambda name: statsPerSender[name][field], reverse=True)
        for name in firstAndLastElements(ranking, topN, bottomN):
            if unit is None:
                print(name, ": ", statsPerSender[name][field])
            else:
                print(name, ": ", statsPerSender[name][field], unit)
    print("\n")

In [None]:
# Per-sender statistics over every loaded thread, showing only the top 3 senders of each ranking.
senderStats(threads, 3)

In [None]:
def messagesTimeOfDay(threadList, separate=False, ax=None, relative=False):
    """Plot how messages are distributed over the 24 hours of the day.

    Messages are counted in 15-minute slots, smoothed with a 5-point kernel
    that wraps around midnight, and divided by 15.

    Args:
        threadList: threads whose messages are counted.
        separate: when true, draw one labelled curve per thread on a shared
            new figure (any ``ax`` passed in is ignored) and return.
        ax: axes to draw on; a new figure is created when ``None``. When given,
            the curve is labelled with the title of the first thread.
        relative: when true, scale the curve so its trapezoid area (unit slot
            spacing) is 1, which makes threads of different size comparable.
    """
    if separate:
        fig, ax = plt.subplots()
        for thread in threadList:
            messagesTimeOfDay([thread], ax=ax, relative=relative)
        ax.legend()
        return

    slots = 24 * 4
    quarters = np.zeros(slots)
    for thread in threadList:
        for message in thread.messages:
            quarters[message.time.hour * 4 + message.time.minute // 15] += 1

    kernel = np.array([0.09672046549155401, 0.24044903390287659, 0.3256610012111388, 0.24044903390287659, 0.09672046549155401])
    r = kernel.size // 2
    # Circular convolution: np.roll(quarters, -j)[i] == quarters[(i + j) % slots].
    smoothed = sum(kernel[j + r] * np.roll(quarters, -j) for j in range(-r, r + 1)) / 15

    # Repeat the first slot at 24:00 so the curve closes at the right edge.
    smoothed = np.append(smoothed, smoothed[0])
    x = np.arange(slots + 1) / 4

    if relative:
        # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid; use whichever exists.
        integrate = getattr(np, "trapezoid", None) or np.trapz
        area = integrate(smoothed)
        if area > 0:   # no messages at all: leave the flat zero curve instead of producing NaNs
            smoothed = smoothed / area

    if ax is None:
        fig, ax = plt.subplots()
        ax.plot(x, smoothed, "r")
    else:
        label = threadList[0].title if threadList else None
        ax.plot(x, smoothed, label=label)

    # Configure the axes that were drawn on rather than whatever pyplot considers current.
    ax.set_xlabel("Hours of the day")
    if relative:
        ax.set_ylabel("Relative frequency of messages")
    else:
        ax.set_ylabel("Number of messages")
    ax.set_title("Distribution of messages within a day")
    ax.set_xticks([0,4,8,12,16,20,24])
    ax.set_xlim(0, 24.00)
    ax.grid(True)


In [None]:
def standardizeTexts(threadList):
    """Return normalised copies of every non-empty message in ``threadList``.

    Each returned object is a shallow copy of the original message whose
    ``text`` has been NFD-decomposed, stripped of all non-ASCII characters
    (which removes accents) and lower-cased. The messages stored in the
    threads are left untouched.
    """
    standardTexts = []
    for thread in threadList:
        for message in thread.messages:
            if message.text != "":
                # Copy first: assigning to the original would permanently
                # overwrite the text seen by every other analysis.
                normalized = copy.copy(message)
                normalized.text = unicodedata.normalize('NFD', message.text).encode('ascii', 'ignore').lower().decode("utf-8")
                standardTexts.append(normalized)
    return standardTexts

In [None]:
def searchCount(standardTexts, key):
    """Count occurrences of ``key`` per sender.

    ``key`` is normalised the same way as the message texts (NFD, non-ASCII
    characters dropped, lower-cased) and counted as a non-overlapping substring
    of each message's ``text``.

    Returns a dict mapping sender name to ``{'count': occurrences}``; senders
    with no hit are present with a count of 0.
    """
    needle = unicodedata.normalize('NFD', key).encode('ascii', 'ignore').lower().decode("utf-8")
    tally = {}
    for entry in standardTexts:
        perSender = tally.setdefault(entry.senderName, {'count': 0})
        perSender['count'] += entry.text.count(needle)
    return tally

In [None]:
def wordNumber (threadList, key, topN=None):
    """Print how often each sender used ``key``, most frequent sender first.

    Texts and ``key`` are compared in normalised form (see ``standardizeTexts``
    and ``searchCount``). ``topN`` limits the listing to the first ``topN``
    senders; ``None`` prints all of them.
    """
    hits = searchCount(standardizeTexts(threadList), key)

    print("Number of occurances of \"" + str(key) + "\"")
    ranking = sorted(hits.keys(), key=lambda name: hits[name]['count'], reverse=True)
    for name in firstAndLastElements(ranking, topN, None):
        print(name, ": ", hits[name]['count'])
    print("\n")

In [None]:
def getWordCounts(threadList, separate):
    """Count whitespace-separated words in the normalised message texts.

    Returns a dict of dicts: outer key is the sender name when ``separate`` is
    true, otherwise the single key ``"Sum of all users"``; inner dict maps each
    word to its number of occurrences. A sender only appears once at least one
    word has been counted for them.
    """
    tallies = {}
    for entry in standardizeTexts(threadList):
        owner = entry.senderName if separate else "Sum of all users"
        for token in entry.text.split():
            perOwner = tallies.setdefault(owner, {})
            perOwner[token] = perOwner.get(token, 0) + 1
    return tallies

In [None]:
def harmonicNumber (s, N):
    """Return the generalised harmonic number: the sum of ``1/k**s`` for k = 1..N (0 when N < 1)."""
    total = 0
    for k in range(1, N + 1):
        total += 1 / k**s
    return total

In [None]:
def mostUsedWords(threadList, separate=False, first=0, last=11, ZipfPlot=True):
    """Print the most frequent words and optionally compare them with Zipf's law.

    Args:
        threadList: threads whose (normalised) texts are analysed.
        separate: list words per sender instead of for all senders together.
        first, last: slice bounds into the frequency ranking that is printed.
        ZipfPlot: draw the rank/frequency plot against a Zipf distribution with
            s=1. Only available when ``separate`` is false.

    For the plot only words occurring at least 5 times are used. Their relative
    frequencies are normalised to sum to 1, the same normalisation as the Zipf
    reference ``1 / (k * H_N)``.
    """
    wordCounts=getWordCounts(threadList, separate)
    print("Most used words:")
    for sender, counts in wordCounts.items():
        print(sender)
        for word in sorted(counts, key=counts.get, reverse=True)[first:last]:
            print(word, ":", counts[word])
        print()

    if(not ZipfPlot):
        return

    if(separate):
        print("to recieve distribution plot, set separate=False")
        return

    # .get: the key is missing when the threads contain no words at all.
    totals = wordCounts.get("Sum of all users", {})
    numbers = np.array([count for count in sorted(totals.values(), reverse=True) if count >= 5], dtype=float)
    if numbers.size == 0:
        print("Not enough repeated words for a distribution plot")
        return
    numbers = numbers / numbers.sum()
    N=len(numbers)

    H_N=harmonicNumber(1,N)
    Zipf=[1/k/H_N for k in range(1,N+1)]

    fig, ax=plt.subplots()
    ax.set_yscale("log")
    ax.set_xscale("log")
    ax.grid(True)
    ax.plot(range(1,N+1), numbers, "r", label="distribution of words")
    ax.plot(range(1,N+1), Zipf, "k", label="Zipf distribution (with s=1)")

    ax.legend()
    ax.set_xlabel("Index of word")
    ax.set_ylabel("Relative frequency")
    ax.set_title("Zipf's law in action")

In [None]:
def timeElapsed(resp, threadList):
    """Return the response times of ``resp`` in seconds.

    A response is a message sent by ``resp`` whose immediately preceding
    message in the same thread was sent by someone else; its response time is
    the gap between those two messages. Threads without messages are skipped.

    Args:
        resp: sender name whose responses are measured.
        threadList: threads to scan; each thread's messages must be time-ordered.

    Returns:
        List of floats, in thread and message order.
    """
    responseTimes=[]
    for thread in threadList:
        # Pair every message with its predecessor; an empty or one-message
        # thread simply yields no pairs.
        for previous, current in zip(thread.messages, thread.messages[1:]):
            if previous.senderName!=resp and current.senderName==resp:
                responseTimes.append((current.time-previous.time).total_seconds())
    return responseTimes

In [None]:
def histplot(responseTimes, responder, ax):
    """Print quick-response shares for ``responder`` and draw their cumulative histogram.

    Args:
        responseTimes: response times in seconds.
        responder: name used in the printed lines and the plot label.
        ax: axes-like object; its ``hist`` method receives the data.

    Nothing is plotted when ``responseTimes`` is empty (e.g. a participant who
    never answered anyone); a short notice is printed instead.
    """
    total = len(responseTimes)
    if total == 0:
        print("%s has no recorded answers" % responder)
        return

    # Bounds are inclusive on both ends, matching a histogram over range (0, limit).
    within5 = sum(1 for seconds in responseTimes if 0 <= seconds <= 300)
    print("%.1f%% of %s's answers come within 5 minutes" % (within5*100/total, responder))

    within1 = sum(1 for seconds in responseTimes if 0 <= seconds <= 60)
    print("%.1f%% of %s's answers come within 1 minutes" % (within1*100/total, responder))

    # Everything slower than 5 minutes is parked at 320 so it lands in the last
    # bin and the cumulative curve still ends at 1.
    capped=[x if x<=300 else 320 for x in responseTimes]
    ax.hist(capped, 100, range=(0,320), density=True, histtype='step',cumulative=True, label="responses of %s" % responder)

In [None]:
def getParticipants (threadList):
    """Return the set of all participant names appearing in any thread of ``threadList``."""
    names = set()
    for thread in threadList:
        names.update(thread.participants)
    return names

In [None]:
def responseTime(threadList):
    """Plot the cumulative distribution of response times for every participant.

    For each participant of ``threadList`` the response times are collected
    with ``timeElapsed`` and handed to ``histplot``, which prints the summary
    lines and adds one curve to the shared axes.
    """
    figure, axes = plt.subplots()
    for person in getParticipants(threadList):
        histplot(timeElapsed(person, threadList), person, axes)

    axes.set_title("Cumulative distribution function of response times")
    axes.set_xlabel("Response time (sec)")
    axes.set_ylabel("Portion of responses")
    axes.set_xlim(0,300)
    axes.legend(loc="lower right")
    axes.grid(True)
    plt.show()

In [None]:
def mostReacted (threadList, n):
    """Print the ``n`` messages with the most reactions, most reacted first.

    Messages without reactions are ignored. Ties keep the order in which the
    messages are encountered. Fewer than ``n`` entries are printed when fewer
    reacted messages exist; nothing is printed for ``n <= 0``.

    Args:
        threadList: threads to search.
        n: maximum number of messages to print.
    """
    # NOTE(review): threads are ordered by their participant-name lists (descending),
    # kept as in the original; it only influences which of several equally
    # reacted messages come first - confirm this is the intended tie-break.
    threadListSorted=sorted(threadList, key=lambda thread: thread.participants, reverse=True)

    candidates = [(message, thread.title)
                  for thread in threadListSorted
                  for message in thread.messages
                  if len(message.reactions)>0]
    # Stable sort: always yields a fully ordered result, also when fewer than
    # n candidates exist.
    candidates.sort(key=lambda pair: len(pair[0].reactions), reverse=True)

    for index, (message, title) in enumerate(candidates[:max(n, 0)]):
        print("%d. %s (%d reactions):\n%s\n%s, [%s]\n" % (index+1, message.senderName, len(message.reactions), message.text, title, str(message.time)))

In [None]:
def threadByTitle(title):
    """Return the first loaded thread whose title equals ``title``.

    Prints "Title not found" and returns ``None`` when no thread matches.
    """
    match = next((candidate for candidate in threads if candidate.title == title), None)
    if match is None:
        print("Title not found")
    return match

In [None]:
def testCases():
    """Smoke-run the analysis functions once against the module-level ``threads`` list.

    Indexes ``threads[0]``, so at least one thread must be loaded. Nothing is
    asserted; the calls only produce printed output and plots.
    """
    printFirsts(5)
    messagesTime(threads[:10])
    senderStats([threads[0]], topN=1)
    messagesTimeOfDay(threads)
    wordNumber([threads[0]], "egyenlore")
    mostUsedWords(threads, separate=False)
    # separate=True cannot draw the Zipf plot, so it is switched off explicitly.
    mostUsedWords([threads[0]], separate=True, ZipfPlot=False)
    responseTime([threads[0]])

In [None]:
#mostReacted([threadByTitle("Kvótációk")],5)

In [None]:
# Run every analysis once as a quick end-to-end check.
testCases()

In [None]:
# List the first 30 threads (threads are sorted by message count) and show the distribution plots.
printFirsts(30)

In [None]:
# Cumulative message count over time for the thread with the most messages.
messagesTime([threads[0]])

In [None]:
# Per-sender statistics for the thread with the most messages, top 5 senders per ranking.
senderStats([threads[0]],topN=5)

In [None]:
# Compare the time-of-day profile of the three largest threads, one normalised curve each.
messagesTimeOfDay(threads[:3], separate=True, relative=True)