In [None]:
import pandas as pd

In [None]:
from nltk.corpus import stopwords
from nltk.corpus import names
import json
import re
import plotly.express as px
import plotly.graph_objects as go
import plotly as py
import emoji
from IPython.display import Markdown as md

In [None]:
# Register pandas' date/time converters with matplotlib for this session.
from pandas.plotting import register_matplotlib_converters
register_matplotlib_converters()

In [None]:
# Load the notebook configuration from settings.json.
# JSON is UTF-8 by specification; passing the encoding explicitly keeps names
# with umlauts readable on platforms whose default encoding is not UTF-8.
with open('settings.json', 'r', encoding='utf-8') as f:
    settings = json.load(f)

# Path of the exported chat text file (required, KeyError if missing).
chatPath = settings["chatPath"]
# Extra names to accept / names to ignore in the name-mention analysis.
# Both lists are optional and default to empty.
includeNames = settings.get("includeNames", [])
excludeNames = settings.get("excludeNames", [])

In [None]:
# Read the chat export and keep every line that starts a message, i.e. one
# shaped like "<date>, <time> - <rest>". Raw string: "\d" is not a valid
# escape in a normal string literal.
with open(chatPath, encoding="utf-8") as fp:
    split = re.findall(r'\d+.\d+.\d+, \d+:\d+ - .*', fp.read())

# Turn each matched line into [date, time, author, message].
parsedData = []
for x in split:
    # Split only on the FIRST ' - ' so a message containing ' - ' stays intact.
    timestamp, content = x.split(' - ', 1)
    datePart, timePart = timestamp.split(', ', 1)
    # Split only on the FIRST ': ' so a message containing ': ' is not cut off.
    authorPart, separator, messagePart = content.partition(': ')
    if separator:
        Author = authorPart
        Message = messagePart
    else:
        # No "author: " prefix -> line without an author (e.g. a system notice).
        Author = None
        Message = content
    parsedData.append([datePart, timePart, Author, Message])

In [None]:
# One row per parsed chat line.
columnNames = ['Date', 'Time', 'Author', 'Message']
df = pd.DataFrame(data=parsedData, columns=columnNames)

In [None]:
# Character count of every message that is present.
messageText = df['Message'].dropna()
df['Length'] = messageText.map(len)
# Authors whose entry starts at a '+' are collapsed into one bucket.
df["Author"] = df["Author"].replace(regex=True, to_replace=r'[+].*', value='Others')
# Dates are parsed as day.month.two-digit-year.
df['Date'] = pd.to_datetime(df['Date'], format="%d.%m.%y")
# Discard every row that still has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [None]:
# All messages except media placeholders, joined into one newline-separated string.
isMediaPlaceholder = df["Message"] == "<Medien ausgeschlossen>"
text = "\n".join(df.loc[~isMediaPlaceholder, "Message"])

# Word counts

In [None]:
import nltk

# Tokens that survive tokenisation but carry no meaning for the ranking.
customFilter = ["n't", "https", "..", "\"" , "''", "'s"]
# A set gives constant-time membership tests (the original scanned a list for
# every token) and the name no longer shadows the builtin `filter`.
stopWordSet = set(stopwords.words("english") + stopwords.words("german") + customFilter)

# `emoji.get_emoji_regexp()` was removed in emoji 2.0; prefer `replace_emoji`
# and fall back to the old regexp only on releases that lack it.
if hasattr(emoji, "replace_emoji"):
    def _stripEmoji(token):
        """Return `token` with all emoji removed."""
        return emoji.replace_emoji(token, replace="")
else:
    _emojiPattern = emoji.get_emoji_regexp()

    def _stripEmoji(token):
        """Return `token` with all emoji removed."""
        return _emojiPattern.sub("", token)

# Tokenise, strip emoji, lower-case, then drop one-character tokens and stop words.
words = [_stripEmoji(token).lower() for token in nltk.word_tokenize(text)]
words = [word for word in words if len(word) > 1 and word not in stopWordSet]

# Frequency distribution over the remaining tokens.
fdist = nltk.FreqDist(words)

In [None]:
import plotly.express as px

# The 30 most frequent words as a two-column table.
topWords = fdist.most_common(30)
sortedCounts = pd.DataFrame(topWords, columns=["Word", "Count"])

# Horizontal bars; the y axis is reversed so the first row is drawn at the top.
fig = px.bar(
    sortedCounts,
    x='Count',
    y='Word',
    color='Count',
    orientation="h",
    height=1000,
)
fig.update_layout(yaxis={"autorange": "reversed"})
fig.show()

# Which Names were mentioned most

In [None]:
# Lower-cased first names from the two NLTK name word lists.
maleNames = [name.lower() for name in names.words('male.txt')]
femaleNames = [name.lower() for name in names.words('female.txt')]

In [None]:
import spacy
import numpy as np

# spaCy pipeline used for named-entity recognition on the messages.
nlp = spacy.load("de_core_news_sm")

# Blank out media placeholders and drop those rows.
# `np.nan` instead of `np.NaN`: the capitalised alias was removed in NumPy 2.0.
df_NoMedia = df.replace("<Medien ausgeschlossen>", np.nan).dropna()
# Run NER only on the rows that are kept; the original ran it on all of `df`
# and threw away the media rows' results on index alignment.
# Each message becomes a list of (entity text, entity label) tuples.
df_NoMedia['Tags'] = df_NoMedia["Message"].apply(
    lambda message: [(ent.text, ent.label_) for ent in nlp(message).ents]
)

In [None]:
# Build the lookup sets once. The original re-concatenated the three name
# lists and scanned them linearly for every single token.
# Candidates are compared lower-cased, so the configured names are lower-cased
# too; otherwise a capitalised entry in the settings could never match.
knownNames = {name.lower() for name in maleNames + femaleNames + includeNames}
ignoredNames = {name.lower() for name in excludeNames}

# Collect every word of every "PER" entity that is a known, non-excluded name.
nameList = []
for entities in df_NoMedia["Tags"]:
    for entityText, entityLabel in entities:
        if entityLabel != "PER":
            continue
        for part in str(entityText).split(" "):
            candidate = part.lower()
            if candidate in knownNames and candidate not in ignoredNames:
                nameList.append(part.capitalize())

In [None]:
import collections

# Tally how often each collected name occurs.
counter = collections.Counter()
counter.update(nameList)

In [None]:
# (name, count) pairs ordered from most to least frequent.
counter_sorted = counter.most_common()
df_Names_sorted = pd.DataFrame(data=counter_sorted, columns=["Word", "Count"])

In [None]:
# Horizontal bar chart of the 30 most mentioned names; the y axis is reversed
# so the first row is drawn at the top.
topNames = df_Names_sorted.head(30)
fig = px.bar(
    topNames,
    x='Count',
    y='Word',
    color='Count',
    orientation="h",
    height=1000,
)
fig.update_layout(yaxis={"autorange": "reversed"})
fig.show()

# The most used emojis are:

In [None]:
import emoji
from collections import Counter
import re as regex
import demoji

# For every message, one list per distinct emoji found in it, holding that
# emoji repeated as often as it occurs in the message text.
test2 = []
for message in df["Message"]:
    found = demoji.findall(message)
    if not found:
        continue
    for symbol in found:
        test2.append([symbol] * message.count(symbol))

# Flatten the per-message groups into one list of emoji occurrences.
flat_list = []
for group in test2:
    flat_list.extend(group)

In [None]:
# (emoji, occurrences) pairs, most frequent first.
sortedCounts = Counter(flat_list).most_common()

# Markdown table: header, separator, then one row for each of the top ten emoji.
tableRows = ["| Emoji | Description | Amount |", "| --- | --- | --- |"]
for symbol, amount in sortedCounts[:10]:
    description = demoji.findall(symbol).get(symbol)
    tableRows.append(f"{symbol}|{description}|{amount}")
markdown = "\n".join(tableRows)
md(markdown)

# At which hour of the day were the most messages sent

In [None]:
# Hour of day of every message, as an integer so that sorting and plotting are
# numeric (string hours would sort "10" before "9").
df['Hour'] = df['Time'].str.split(':').str[0].astype(int)

# Messages per hour. The count column and the index are named explicitly:
# pandas >= 2.0 calls the value_counts() result "count" and names its index
# after the source column, which made the original rename a no-op and
# `times["Count"]` a KeyError.
hourCounts = df["Hour"].value_counts()
times = hourCounts.rename("Count").rename_axis(None).to_frame()
times["Hour"] = times.index
# Share of all messages sent in each hour, in percent.
times["Normalized"] = times["Count"] / times["Count"].sum() * 100

In [None]:
# Rows with the highest message count. If several hours tie, the first row is
# reported; the original called int() on the whole selection, which is
# deprecated for one row and fails for more than one.
peakHours = times[times["Count"] == times["Count"].max()]
peakHour = int(peakHours["Hour"].iloc[0])
peakShare = round(peakHours["Normalized"].iloc[0], 2)
peakCount = int(peakHours["Count"].iloc[0])
md(f"The most Messages were sent at **{peakHour}** o'clock with **{peakShare}** Percent and **{peakCount}** Messages")

In [None]:
# Order the rows by hour so the line is drawn left to right.
times = times.sort_values(by='Hour')
axisLabels = {'Normalized' : 'Percentage of Messages Sent', 'Hour' : 'Hour of the Day'}
fig = px.line(times, x='Hour', y='Normalized', labels=axisLabels)
fig.show()

# And how much you wrote on any given day

In [None]:
# Messages per calendar day. The count column and the index are named
# explicitly: pandas >= 2.0 calls the value_counts() result "count" and names
# its index "Date", which made the original rename a no-op and left "Date" as
# both index name and column (ambiguous for sort_values).
dateCounts = df["Date"].value_counts()
dates = dateCounts.rename("Count").rename_axis(None).to_frame()
dates["Date"] = dates.index
# Chronological order for the statistics and the line plot.
dates = dates.sort_values(by='Date')

In [None]:
# Rows with the highest daily count; the earliest such day is reported.
# `.iloc[0]` replaces `[0]`, which on a datetime-indexed Series relied on the
# deprecated positional fallback of Series.__getitem__.
peakDays = dates[dates["Count"] == dates["Count"].max()]
peakDate = peakDays["Date"].iloc[0].strftime('%d.%b.%Y')
peakDayCount = peakDays["Count"].iloc[0]
md(f"The most Messages were sent at **{peakDate}** with **{peakDayCount}** Messages" )

In [None]:
# Mean number of messages per day, rounded to a whole number.
averagePerDay = int(round(dates['Count'].mean()))
md(f"\n While your average was **{averagePerDay}** Messages per Day")

In [None]:
# Median number of messages per day, rounded to a whole number.
medianPerDay = int(round(dates['Count'].median()))
md(f"\n And your Median was **{medianPerDay}** Messages per Day")

In [None]:
# Daily message count over time.
fig = px.line(
    dates,
    x="Date",
    y="Count",
    title='Messages Sent per Date',
)
fig.show()

# Who sends the most Messages

In [None]:
# Number of messages per author as a Names/Values table.
authorcounts = (
    df["Author"]
    .value_counts()
    .rename_axis('Names')
    .reset_index(name='Values')
)
# Donut chart of each author's share.
authorPie = go.Pie(labels=authorcounts["Names"], values=authorcounts["Values"], hole=.3)
fig = go.Figure(data=[authorPie])
fig.show()

## Who sends the most media

In [None]:
# Number of media placeholder messages per author as a Names/Values table.
isMediaMessage = df["Message"] == "<Medien ausgeschlossen>"
mediaCounts = (
    df.loc[isMediaMessage, "Author"]
    .value_counts()
    .rename_axis('Names')
    .reset_index(name='Values')
)
# Donut chart of each author's share.
mediaPie = go.Pie(labels=mediaCounts["Names"], values=mediaCounts["Values"], hole=.3)
fig = go.Figure(data=[mediaPie])
fig.show()

## Who sends the longest Messages

In [None]:
# Remove media placeholder rows, then any row with a missing value.
# A boolean mask replaces `replace(..., None)`: on pandas < 1.4 an explicit
# None value was ignored and the call pad-filled from the previous row, so the
# placeholders were overwritten with the preceding message, not removed.
# `.copy()` gives later cells an independent frame to add columns to.
df = df[df["Message"] != "<Medien ausgeschlossen>"].dropna().copy()

In [None]:
# Mean (two decimals) and median message length in characters.
meanLength = round(df['Length'].mean(), 2)
medianLength = df['Length'].median()
md(f"The mean length of a message is **{meanLength}** characters while the median is **{medianLength}**")

In [None]:
# Mean message length per author, one bar each.
meanLengthByAuthor = df.groupby(["Author"])["Length"].mean().reset_index()
fig = px.bar(
    meanLengthByAuthor,
    x="Author",
    y="Length",
    color="Length",
    labels={'Length' : 'Mean Character Length'},
)
fig.show()

# And who answers whom

In [None]:
# Integer code per author, numbered in order of first appearance.
df['AuthorID'] = pd.factorize(df["Author"])[0]
# Author (and code) of the following row; the last row has no successor.
df["Answerer"] = df["Author"].shift(periods=-1)
df['AnswererID'] = df["AuthorID"].shift(periods=-1)
# How often each (author, next author) pair occurs.
fromTo = (
    df.groupby(['AuthorID', 'AnswererID'])
    .size()
    .reset_index(name='count')
)

In [None]:
# Every distinct author listed twice in a row: the full author list, followed
# by the same list again.
labels = list(df["Author"].unique()) * 2

In [None]:
import numpy as np

# The flow graph is only drawn for chats with at most nine distinct authors.
authorAmount = len(df["Author"].unique())
snakeyGraph = not (authorAmount > 9)

if not snakeyGraph:
    print("Sadly the Graph is not supported for Groups with more than 9 Members")

In [None]:
if snakeyGraph:
    # One boolean mask per possible answerer id 0..9, cut down to the number
    # of authors actually present.
    conditionsLong = [fromTo["AnswererID"] == answererId for answererId in range(10)]
    conditions = conditionsLong[:authorAmount]

    # Fixed palette, one colour per author id.
    colors = [
        'rgb(31, 119, 180)',
        'rgb(255, 127, 14)',
        'rgb(44, 160, 44)',
        'rgb(214, 39, 40)',
        'rgb(148, 103, 189)',
        'rgb(140, 86, 75)',
        'rgb(227, 119, 194)',
        'rgb(127, 127, 127)',
        'rgb(188, 189, 34)',
        'rgb(23, 190, 207)',
    ]
    choices = colors[:authorAmount]

    # Colour each pair by its answerer; rows matching no mask become 'green'.
    fromTo['color'] = np.select(conditions, choices, default='green')

In [None]:
if snakeyGraph:

    import plotly.graph_objects as go

    # `labels` lists every author twice: entries 0..authorAmount-1 are the
    # sending side, entries authorAmount..2*authorAmount-1 the answering side.
    # Targets are therefore shifted by `authorAmount`. The original pointed
    # them at the sender nodes, which left the second half of the labels
    # unused and produced self-loops and cycles in the diagram.
    answererNodes = fromTo["AnswererID"].astype(int) + authorAmount
    # One colour per node on each side, matching the doubled label list.
    nodeColors = list(choices) * 2

    fig = go.Figure(data=[go.Sankey(
        node = dict(
          pad = 15,
          thickness = 20,
          line = dict(color = "black", width = 0.5),
          label = labels,
          color = nodeColors),
        link = dict(
          source = fromTo["AuthorID"],
          target = answererNodes,
          value = fromTo["count"],
          color = fromTo["color"]
      ))])

    fig.update_layout(title_text="Message Flow", font_size=10)
    fig.show()