In [None]:
import datetime
import re
import urllib
import urllib.parse

import emoji
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import spacy
from nltk import ngrams
from wordcloud import WordCloud, STOPWORDS
%matplotlib inline

In [None]:
def load_chat(path_to_chat: str) -> list:
    '''Read an exported WhatsApp chat file and return its lines.

    The file is read as UTF-8 and split at every newline character, so a
    message containing line breaks spans several list entries.

    Lines are dropped when they
      * contain the left-to-right mark U+200E (only present in auto generated
        info messages from WhatsApp), or
      * contain the word "Sicherheitsnummer".

    Args:
        path_to_chat: path of the exported chat text file.

    Returns:
        list of str, one entry per remaining line.
    '''
    with open(path_to_chat, 'r', encoding = 'UTF-8') as infile:
        chat = infile.read().split('\n')

    # A file that ends with a newline yields one empty trailing element after
    # the split. Only remove it when it really is empty, otherwise the last
    # message of a file without a trailing newline would be lost.
    if chat and chat[-1] == '':
        chat.pop()

    return [line for line in chat
            if '\u200e' not in line and "Sicherheitsnummer" not in line]

In [None]:
class UnknownChatFormat(Exception):
    '''Error signalling that neither the Android nor the iOS export format was recognised.

    The text is available both as ``str(error)`` and as ``error.message``.
    '''
    def __init__(self, message="Unknown chat format: could not detect if android or iOS was used"):
        super().__init__(message)
        self.message = message

In [None]:
def determine_chat_format(chat: list, sample_size: int = 20) -> str:
    '''Find out which device was used to export the chat file.

    The Android format differs from iOS in its timestamp:
    iOS:     "[dd.mm.yy, HH:MM:SS] sender: message"
    Android: "dd.mm.yy, HH:MM - sender: message"

    ``sample_size`` randomly chosen lines are inspected. A line starting with
    "[" votes for iOS, a line starting with a digit votes for Android, every
    other line (e.g. an empty line or the continuation of a multi-line
    message) is ignored.

    Args:
        chat: list of chat lines.
        sample_size: number of random lines to inspect (default 20).

    Returns:
        "ios" if more than 90% of the votes are iOS, "android" if fewer than
        10% are.

    Raises:
        UnknownChatFormat: if the chat is empty, no sampled line is conclusive
            or the votes are mixed.
    '''
    # np.random.randint raises for an empty range, so handle that case up front
    if len(chat) == 0:
        raise UnknownChatFormat()

    ### ANDROID: 0; iOS: 1 ###
    votes = []
    for n in np.random.randint(0, high=len(chat), size=sample_size):
        line = str(chat[n])
        if line.startswith('['):
            votes.append(1)
        # slicing instead of indexing keeps empty lines from raising IndexError
        elif line[:1].isdigit():
            votes.append(0)

    if not votes:
        raise UnknownChatFormat()

    ios_share = sum(votes) / len(votes)
    if ios_share > 0.9:
        return "ios"
    elif ios_share < 0.1:
        return "android"
    else:
        raise UnknownChatFormat()

In [None]:
def check_message_integrity_ios(chat:list) -> list:
    '''Re-assemble iOS messages that were split over several lines.

    A valid iOS line starts with "[" (the timestamp). Every other line is the
    continuation of a message that contained a line break and is appended,
    separated by one space, to the message before it.

    Continuation lines at the very beginning of the chat have no message they
    could belong to and are dropped.

    Args:
        chat: list of chat lines; it is updated in place.

    Returns:
        the same list object, now with one entry per message.
    '''
    merged = []
    for line in chat:
        if line.startswith('['):
            merged.append(line)
        elif merged:
            merged[-1] = merged[-1] + ' ' + line
        # else: fragment before the first message -> nothing to attach it to

    # keep the in-place contract of the function
    chat[:] = merged
    return chat #type(list)

In [None]:
def check_message_integrity_android(chat:list) -> list:
    '''Re-assemble Android messages that were split over several lines.

    A valid Android line starts with a "dd.mm.yy, HH:MM" timestamp (its first
    15 characters). Every other line is the continuation of a message that
    contained a line break and is appended, separated by one space, to the
    message before it.

    Continuation lines at the very beginning of the chat have no message they
    could belong to and are dropped.

    Args:
        chat: list of chat lines; it is updated in place.

    Returns:
        the same list object, now with one entry per message.
    '''
    def starts_with_timestamp(line: str) -> bool:
        try:
            datetime.datetime.strptime(line[:15], '%d.%m.%y, %H:%M')
        except ValueError:
            return False
        return True

    merged = []
    for line in chat:
        if starts_with_timestamp(line):
            merged.append(line)
        elif merged:
            merged[-1] = merged[-1] + ' ' + line
        # else: fragment before the first message -> nothing to attach it to

    # keep the in-place contract of the function
    chat[:] = merged
    return chat #type(list)

In [None]:
def parse_date_ios(line: str) -> datetime.datetime:
    '''Parse the "[dd.mm.yy, HH:MM:SS]" timestamp at the start of an iOS chat line.'''
    # text between the first "[" and the "]" that follows it
    after_open_bracket = line.split('[', 2)[1]
    date_string = after_open_bracket.split(']', 1)[0]
    return datetime.datetime.strptime(date_string, '%d.%m.%y, %H:%M:%S')

In [None]:
def parse_date_android(line: str) -> datetime.datetime:
    '''Parse the "dd.mm.yy, HH:MM" timestamp in front of the first "-" of an Android chat line.'''
    timestamp_text, _, _ = line.partition('-')
    return datetime.datetime.strptime(timestamp_text.strip(), '%d.%m.%y, %H:%M')

In [None]:
def get_message_sender_android(line: str) -> str:
    '''Return the sender of an Android chat line ("dd.mm.yy, HH:MM - sender: message").

    The sender is the text between the first "-" (end of the timestamp) and
    the first ":" after it. Splitting only once at each separator keeps
    hyphens inside the sender name intact (e.g. "Anna-Lena").
    '''
    after_timestamp = line.split('-', 1)[1]
    return after_timestamp.split(':', 1)[0].strip()

In [None]:
def get_message_sender_ios(line: str) -> str:
    '''Return the sender of an iOS chat line ("[dd.mm.yy, HH:MM:SS] sender: message").

    The sender is the text between the first "]" (end of the timestamp) and
    the first ":" after it. Splitting only once at each separator keeps a "]"
    inside the sender name intact (e.g. "Bob [work]").
    '''
    after_timestamp = line.split(']', 1)[1]
    return after_timestamp.split(':', 1)[0].strip()

In [None]:
def chop_message_ios(line: str) -> str:
    '''Return only the message text of an iOS chat line.

    The iOS timestamp (HH:MM:SS) holds two colons and the sender tag ends with
    the third one, so everything after the third ":" is the message.
    '''
    pieces = line.split(':', 3)
    message_text = pieces[3]
    return message_text.strip() #type(str)

In [None]:
def chop_message_android(line: str) -> str:
    '''Return only the message text of an Android chat line.

    The Android timestamp (HH:MM) holds one colon and the sender tag ends with
    the second one, so everything after the second ":" is the message.
    '''
    pieces = line.split(':', 2)
    message_text = pieces[2]
    return message_text.strip() #type(str)

In [None]:
def guess_christophers_alias(chat_df: pd.core.frame.DataFrame) -> str:
    '''Return the first sender whose name contains "chris" (case-insensitive).

    Senders are checked in order of first appearance in the 'sender' column.
    None is returned when no sender matches.
    '''
    # "christopher" contains "chris", so one substring test covers both spellings
    candidates = (name for name in chat_df['sender'].unique() if "chris" in name.lower())
    return next(candidates, None)

In [None]:
def parse_emojis(message: str) -> (list, str):
    '''Split a message into its emojis and its emoji-free text.

    The message is "demojized" first, i.e. every emoji is replaced by its
    shortcode (e.g. :grinning_face:). A ":...:" candidate only counts as an
    emoji if emoji.emojize() actually turns it back into something else.
    Ordinary text that merely sits between two colons (e.g. "10:30 oder
    11:00") is therefore left untouched.

    Args:
        message: raw message text.

    Returns:
        (emojis_in_message, demojized_message):
        list of the emojis in order of appearance (duplicates kept) and the
        demojized message text with all emoji shortcodes removed.
    '''
    demojized_message = emoji.demojize(message)

    # Shortcodes never contain whitespace. The lookahead makes matches
    # overlap, so in "10:30:grinning_face:" the colon after "30" can still
    # open the emoji shortcode.
    candidates = re.findall(r'(?=(:[^:\s]+:))', demojized_message)

    emojis_in_message = []
    shortcodes = []
    for candidate in candidates:
        rendered = emoji.emojize(candidate)
        if rendered != candidate:
            emojis_in_message.append(rendered)
            shortcodes.append(candidate)

    # delete all emoji shortcodes from the message
    for shortcode in shortcodes:
        demojized_message = demojized_message.replace(shortcode, '')

    return emojis_in_message, demojized_message

In [None]:
def extract_date_from_timestamp(input_timestamp: datetime.datetime) -> datetime.date:
    '''Return the calendar date part of a timestamp.'''
    date_part = input_timestamp.date()
    return date_part

In [None]:
def extract_time_from_timestamp(input_timestamp) -> datetime.time:
    '''Return the time-of-day part of a timestamp.'''
    time_part = input_timestamp.time()
    return time_part

In [None]:
def calc_time_diff(chat_df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    '''Annotate every message with the time that passed since the message before it.

    Adds the column 'time_diff' (timestamp of the row minus timestamp of the
    previous row, by position). The first message has no predecessor and gets
    a missing value (NaT).

    Args:
        chat_df: chat DataFrame with a 'timestamp' column, ordered by time.

    Returns:
        the same DataFrame object with the added 'time_diff' column.
    '''
    # diff() works on row positions, so it does not depend on the index labels
    chat_df['time_diff'] = chat_df['timestamp'].diff()
    return chat_df

In [None]:
def annotate_message_types(chat_df: pd.core.frame.DataFrame, 
                      answer_time_threshold: datetime.timedelta=datetime.timedelta(days=2)
                     ) -> pd.core.frame.DataFrame:
    
    '''Classify every message as answer, follow up or new initiation.
    
    ################ EXAMPLE ####################
    ## [MSG_A] day1 10:00 sender_a: HI!
    ## [MSG B] day1 10:05 sender_b: Hi!
    ## --> then MSG B is an answer
    
    ## [MSG_A] day1 10:00 sender_a: bye!
    ## [MSG B] day9 12:05 sender_b: long time no see!
    ## --> then MSG B is a new initiation
    
    ## [MSG_A] day1 10:00 sender_a: hello?!?!?!
    ## [MSG B] day4 01:05 sender_a: hellooooooo?!
    ## --> then MSG B is also a new initiation
    
    ## [MSG_A] day1 10:00 sender_a: can you bring me something from the store?
    ## [MSG B] day1 12:05 sender_a: some milk and icecream!
    ## --> then MSG B is a follow up
    #############################################

    Columns written:
      * 'is_media': True where 'raw_message' is the media placeholder
        "<Medien ausgeschlossen>"; 'raw_message' and 'message' of those rows
        are set to "".
      * 'message_type': "initiation", "follow_up" or "answer". The first
        message is always an initiation.
      * 'answer_time_seconds': total seconds between an answer and the
        message before it (whole days included); missing for all other rows.

    Args:
        chat_df: chat DataFrame with the columns 'sender', 'raw_message',
            'message' and 'time_diff', ordered by time.
        answer_time_threshold: a message arriving at least this long after the
            previous one starts a new conversation.

    Returns:
        the same DataFrame object with the added / updated columns.
    '''
    
    # media placeholder messages carry no text
    is_media = chat_df['raw_message'] == "<Medien ausgeschlossen>"
    chat_df['is_media'] = is_media
    chat_df.loc[is_media, 'raw_message'] = ""
    chat_df.loc[is_media, 'message'] = ""

    # normalise to timedelta dtype; a missing diff (first row) compares as False
    time_diff = pd.to_timedelta(chat_df['time_diff'])
    within_threshold = time_diff < answer_time_threshold
    # shift() compares each row with the row before it by position
    same_sender = chat_df['sender'] == chat_df['sender'].shift()
    if len(chat_df) > 0:
        # the first message is always an initiation and has no answer time
        within_threshold.iloc[0] = False

    message_type = pd.Series("initiation", index=chat_df.index, dtype=object)
    message_type[within_threshold & same_sender] = "follow_up"
    message_type[within_threshold & ~same_sender] = "answer"
    chat_df['message_type'] = message_type

    # total_seconds() includes whole days, timedelta.seconds would drop them
    chat_df['answer_time_seconds'] = time_diff.dt.total_seconds().where(message_type == "answer")

    return chat_df

In [None]:
def get_ngrams(text, n):
    '''Return all n-grams of the token sequence ``text`` as a list of tuples.'''
    return list(ngrams(text, n))

In [None]:
def annotate_questions(spacy_doc):
    '''Return True if the text of the given doc contains a question mark.'''
    return "?" in spacy_doc.text

In [None]:
def check_for_url(msg):
    '''Return True if the message contains "www" or "http" anywhere in its text.'''
    return any(marker in msg for marker in ("www", "http"))

In [None]:
def _extract_url_domain(message: str) -> str:
    '''Return the domain of the first URL found in message, "" if there is none.'''
    url_match = re.search(r'(?:https?://|www\.)\S+', message, flags=re.IGNORECASE)
    if url_match is None:
        return ''
    url = url_match.group(0)
    if not url.lower().startswith('http'):
        # urlparse only fills netloc when the URL carries a "//" scheme separator
        url = 'http://' + url
    return urllib.parse.urlparse(url).netloc


def parse_chat(path_to_chat: str) -> pd.core.frame.DataFrame:
    '''Load an exported WhatsApp chat and turn it into an annotated DataFrame.

    Steps: load the file, detect the export format (android / ios), merge
    multi-line messages, extract sender / timestamp / text / emojis per
    message, sort by timestamp, add date / time / weekday / hour, time
    differences, message types, URL domains and the spaCy based columns
    (nouns, verbs, lemmas, question flag, bigrams, trigrams).

    The elapsed parsing time is printed.

    Args:
        path_to_chat: path of the exported chat text file.

    Returns:
        one row per message.

    Raises:
        UnknownChatFormat: if the export format cannot be determined.
    '''
    start_time = datetime.datetime.now()
    chat = load_chat(path_to_chat) #type(list)
    chat_format = determine_chat_format(chat).lower() #type(str)
    
    # pick the format specific helpers once instead of branching repeatedly
    if chat_format == "android":
        chat = check_message_integrity_android(chat) #type(list)
        get_sender, parse_date, chop_message = (get_message_sender_android,
                                                parse_date_android,
                                                chop_message_android)
    elif chat_format == "ios":
        chat = check_message_integrity_ios(chat) #type(list)
        get_sender, parse_date, chop_message = (get_message_sender_ios,
                                                parse_date_ios,
                                                chop_message_ios)
    else:
        raise UnknownChatFormat()
    
    raw_message_list = []
    messages = []
    sender_list = []
    timestamps = []
    emoji_list = []
    
    # iterate over every line (=message) and get attributes
    for line in chat:
        sender_list.append(get_sender(line))
        timestamps.append(parse_date(line))
        messages.append(chop_message(line))
        
    # iterate over message and parse / extract emojis
    for message in messages:
        emojis_, raw_message = parse_emojis(message)
        emoji_list.append(emojis_)
        raw_message_list.append(raw_message)
        
    assert len(sender_list) == len(timestamps) == len(messages) == len(emoji_list) == len(raw_message_list)
    
    # create df from the lists
    # sort by timestamp because whatsapp logs messages by received timestamp and not by send timestamp
    # a stable sort keeps the file order of messages that share a timestamp
    # (Android timestamps only have minute resolution)
    chat_df = pd.DataFrame({"sender":sender_list,
                            "timestamp": timestamps,
                            "message": messages,
                            "raw_message": raw_message_list,
                            "emojis": emoji_list}).sort_values('timestamp', kind='mergesort').reset_index(drop=True)
    
    # further timestamp extractions
    chat_df['date'] = chat_df['timestamp'].apply(extract_date_from_timestamp)
    chat_df['time'] = chat_df['timestamp'].apply(extract_time_from_timestamp)
    chat_df['weekday'] = chat_df['timestamp'].apply(lambda x: x.strftime('%A'))
    chat_df['hour'] = chat_df['time'].apply(lambda x: x.hour)
    
    # annotations for message type and answer time
    chat_df = calc_time_diff(chat_df)
    chat_df = annotate_message_types(chat_df)
    
    # parse urls and get domains (rows without url keep a missing value)
    chat_df['has_url'] = chat_df['raw_message'].apply(check_for_url)
    chat_df['url_domain'] = chat_df[chat_df['has_url'] == True]['raw_message'].apply(_extract_url_domain)
    
    # initialize NLP model
    # NOTE(review): 'morphologizer' and 'attribute_ruler' are excluded although token.pos_
    # is read below -- confirm the remaining pipeline still assigns pos_ for this model.
    nlp = spacy.load("de_core_news_sm", exclude=['senter', 'sentencizer', 'attribute_ruler', 'parser', 'morphologizer', 'ner'])
    # excluded components are not part of the pipeline; removing an absent pipe raises
    for pipe_name in ("ner", "parser"):
        if pipe_name in nlp.pipe_names:
            nlp.remove_pipe(pipe_name)
    
    # add stopwords by hand
    customize_stop_words = ["<", "Medium", "medium","is","mal", "hab","halt","Hab",">","ausschließen", "?", ",", ".", "ne",
                           "n", "Okay", "okay", "nee", "de", "Medien", "medien", "net", "Nee", "irgendwie", "au", "Is", "nix", "Nix", '„', '“',
                           "haja", "Haja", "grade", "echt", "voll", "glaub", "isses", "Isses", "warn", "grad", "Grad", "Gerade", "gerade"]

    for w in customize_stop_words:
        nlp.vocab[w].is_stop = True
    
    # TODO: increase performance by applying all processing with only one big func and iterate only once over df
    # NLP parsing
    chat_df['spacy_doc'] = chat_df['raw_message'].apply(lambda x: nlp(x.lower()))
    chat_df['nouns'] = chat_df['spacy_doc'].apply(lambda doc: [token.text for token in doc if token.pos_ == "NOUN" and token.is_stop == False])
    chat_df['verbs'] = chat_df['spacy_doc'].apply(lambda doc: [token.text for token in doc if token.pos_ == "VERB" and token.is_stop == False])
    chat_df['msg_lemmas_stop_removed'] = chat_df['spacy_doc'].apply(lambda doc: [token.lemma_ for token in doc if token.is_stop == False])
    chat_df['is_question'] = chat_df['spacy_doc'].apply(annotate_questions)
    
    # generate ngrams
    chat_df['trigrams'] = chat_df['msg_lemmas_stop_removed'].apply(get_ngrams, n=3)
    chat_df['bigrams'] = chat_df['msg_lemmas_stop_removed'].apply(get_ngrams, n=2)
    
    print(f"took {datetime.datetime.now() - start_time}")
    return chat_df

In [None]:
# Parse the exported chat file into the DataFrame every later cell works on.
chat_df = parse_chat('chat_charlotte_fabi.txt')

In [None]:
# Show the parsed chat DataFrame as the cell output.
chat_df

In [None]:
#christopher_alias = guess_christophers_alias(chat_df)

In [None]:
#christopher_only = chat_df[chat_df['sender'] == christopher_alias]

# NLP stuff

In [None]:
def get_most_used_words(chat_df):
    '''Count how often each lemma occurs in the 'msg_lemmas_stop_removed' column.

    Returns a Series indexed by lemma, most frequent first.
    '''
    all_lemmas = []
    for message_lemmas in chat_df['msg_lemmas_stop_removed']:
        all_lemmas.extend(message_lemmas)
    return pd.Series(all_lemmas).value_counts()

In [None]:
def get_most_used_nouns(chat_df):
    '''Count how often each noun occurs in the 'nouns' column.

    Returns a Series indexed by noun, most frequent first.
    '''
    all_nouns = []
    for message_nouns in chat_df['nouns']:
        all_nouns.extend(message_nouns)
    return pd.Series(all_nouns).value_counts()

In [None]:
def get_most_used_verbs(chat_df, top_n=50):
    '''Count how often each verb occurs in the 'verbs' column.

    Args:
        chat_df: chat DataFrame with a 'verbs' column (list of verbs per message).
        top_n: number of most frequent verbs to return (default 50);
            None returns all of them.

    Returns:
        Series indexed by verb, most frequent first.
    '''
    words = [token for msg in chat_df['verbs'] for token in msg]
    verb_counts = pd.Series(words).value_counts()
    if top_n is None:
        return verb_counts
    return verb_counts.head(top_n)

In [None]:
# Most frequent verbs over the whole chat.
get_most_used_verbs(chat_df)

In [None]:
# Noun frequencies restricted to the messages sent by 'Fabi'.
get_most_used_nouns(chat_df[chat_df['sender'] == 'Fabi'])

In [None]:
def generate_wordcloud(words_series: pd.core.series.Series):
    '''Draw a word cloud from word frequencies (word -> count) on a new figure without axes.'''
    cloud_options = dict(width = 2000, height = 1000, background_color='black')
    cloud = WordCloud(**cloud_options).generate_from_frequencies(words_series)

    # a fresh figure becomes the current one, so the pyplot calls below draw on it
    plt.subplots(figsize=(16,8), dpi=400)
    plt.axis("off")
    plt.imshow(cloud, interpolation="bilinear")

In [None]:
def get_most_used_bigrams(chat_df):
    '''Count how often each bigram occurs in the 'bigrams' column, most frequent first.'''
    all_bigrams = []
    for message_bigrams in chat_df['bigrams']:
        all_bigrams.extend(message_bigrams)
    return pd.Series(all_bigrams).value_counts()

In [None]:
def get_most_used_trigrams(chat_df):
    '''Count how often each trigram occurs in the 'trigrams' column, most frequent first.'''
    all_trigrams = []
    for message_trigrams in chat_df['trigrams']:
        all_trigrams.extend(message_trigrams)
    return pd.Series(all_trigrams).value_counts()

In [None]:
# Trigram frequencies restricted to the messages sent by 'Charlotte'.
get_most_used_trigrams(chat_df[chat_df['sender']=='Charlotte'])

In [None]:
def get_most_used_curse_words(chat_df, path_to_curse_words='schimpfwortliste.txt'):
    '''Return the usage counts of all curse words that occur in the chat.

    Args:
        chat_df: chat DataFrame (passed on to get_most_used_words).
        path_to_curse_words: text file with one curse word per line, read as
            UTF-8 like the chat file itself.

    Returns:
        the entries of get_most_used_words(chat_df) whose word is in the
        curse word list, in the same (most frequent first) order.
    '''
    with open(path_to_curse_words, 'r', encoding='UTF-8') as infile:
        # a set gives constant-time lookups; blank lines are skipped
        curse_words = {line.strip() for line in infile if line.strip()}

    words = get_most_used_words(chat_df)
    return words[words.index.isin(curse_words)]

In [None]:
# Word cloud of the curse words used by 'Fabi'.
generate_wordcloud(get_most_used_curse_words(chat_df[chat_df['sender'] == 'Fabi']))

# Basic statistics and plots

In [None]:
# How often each URL domain was shared in the chat.
chat_df['url_domain'].value_counts()

In [None]:
def get_mean_messages_per_day(chat_df: pd.core.frame.DataFrame):
    '''Return the mean number of messages per calendar day.

    All days from the first to the last chat date are taken into account,
    including days without any message. This equals the total number of
    messages divided by the number of days in that range, so no per-day
    filtering is needed.

    Args:
        chat_df: chat DataFrame with a 'date' column.

    Returns:
        float; NaN if the chat contains no dated message.
    '''
    dates = chat_df['date'].dropna()
    if dates.empty:
        return float('nan')

    # both boundary days belong to the range, hence the + 1
    number_of_days = (dates.max() - dates.min()).days + 1
    return len(dates) / number_of_days

In [None]:
def plot_daily_activity(chat_df: pd.core.frame.DataFrame):
    '''Plot the number of messages per day together with the daily mean.

    Every day between the first and the last chat date is shown; days without
    messages count as 0.

    Args:
        chat_df: chat DataFrame with a 'date' column.
    '''
    # get first and last chat dates and create a range of dates
    first_chat_date = chat_df['date'].min()
    last_chat_date = chat_df['date'].max()
    timeframe = pd.date_range(start=first_chat_date, end=last_chat_date).to_pydatetime()
    # -> np.array[datetime.datetime, ...], has to be converted into datetime.date for the lookup

    # count the messages per date once instead of filtering the frame for every day
    messages_per_date = chat_df['date'].value_counts()
    num_chat_per_day = [messages_per_date.get(day.date(), 0) for day in timeframe]

    # the mean is needed for the line and for its label, so compute it only once
    mean_messages = get_mean_messages_per_day(chat_df)

    fig, ax = plt.subplots(figsize=(16,8), dpi=400)
    # plot chat activity by day
    ax.plot(timeframe, num_chat_per_day, label='Anzahl Nachrichten pro Tag')
    # plot mean value
    ax.plot(timeframe,
            np.full(timeframe.shape, mean_messages),
            '--',
            linewidth=3,
            label=f'Durchschnitt: {mean_messages:.3f}')

    # add axis labels and legend
    ax.set_ylabel('Anzahl Nachrichten')
    ax.set_xlabel('Datum')
    ax.set_title('Nachrichten im Zeitverlauf')
    ax.legend(title_fontsize='large')

In [None]:
# Messages per day over the whole chat period.
plot_daily_activity(chat_df)

In [None]:
# Mean number of messages per day (days without messages included).
get_mean_messages_per_day(chat_df)

In [None]:
def plot_time_to_answer_weekdays(chat_df: pd.core.frame.DataFrame):
    '''Box plot of the answer time in minutes per sender, split by weekday (outliers hidden).'''
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    answer_time_minutes = chat_df['answer_time_seconds']/60

    figure, axis = plt.subplots(figsize=(16,8))
    # seaborn draws on the current axes, i.e. the ones created above
    axis = sns.boxplot(y=answer_time_minutes,
                       x=chat_df['sender'],
                       hue=chat_df['weekday'],
                       hue_order=weekday_order,
                       showfliers=False)
    axis.set_title('Antwortzeit in Minuten')
    axis.set_ylabel('Antwortzeit in Minuten')
    axis.set_xlabel('Sender')
    # put the legend to the right of the plot area
    axis.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)
    plt.show()

In [None]:
# Answer times per sender and weekday.
plot_time_to_answer_weekdays(chat_df)

In [None]:
def plot_time_to_answer(chat_df: pd.core.frame.DataFrame):
    '''Box plot of the answer time in minutes per sender (outliers hidden).'''
    answer_time_minutes = chat_df['answer_time_seconds']/60

    figure, axis = plt.subplots(figsize=(16,8))
    # seaborn draws on the current axes, i.e. the ones created above
    axis = sns.boxplot(y=answer_time_minutes,
                       x=chat_df['sender'],
                       showfliers=False)
    axis.set_title('Antwortzeit in Minuten')
    axis.set_ylabel('Antwortzeit in Minuten')
    axis.set_xlabel('Sender')
    plt.show()

In [None]:
# Answer times per sender.
plot_time_to_answer(chat_df)

In [None]:
def get_mean_time_to_answer(chat_df: pd.core.frame.DataFrame):
    '''Return the mean answer time in seconds.

    Reads the 'answer_time_seconds' column written by annotate_message_types;
    rows without an answer time (missing values) are ignored.
    '''
    return chat_df['answer_time_seconds'].mean()

In [None]:
def longest_time_of_no_chatting(chat_df):
    '''Return the largest gap between two consecutive messages (maximum of 'time_diff').'''
    gaps_between_messages = chat_df['time_diff']
    return gaps_between_messages.max()

In [None]:
# Longest pause between two consecutive messages.
longest_time_of_no_chatting(chat_df)

In [None]:
def get_initiator_percentage(chat_df):
    '''Return each sender's share (0..1) of all messages typed "initiation".'''
    is_initiation = chat_df['message_type'] == 'initiation'
    initiating_senders = chat_df[is_initiation]['sender']
    return initiating_senders.value_counts(normalize=True)

In [None]:
# Share of conversation initiations per sender.
get_initiator_percentage(chat_df)

In [None]:
def get_most_used_emojis(chat_df: pd.core.frame.DataFrame, top_n=5):
    '''Count how often each emoji occurs in the 'emojis' column.

    Args:
        chat_df: chat DataFrame with an 'emojis' column (list of emojis per message).
        top_n: number of most frequent emojis to return (default 5);
            None returns all of them.

    Returns:
        Series indexed by emoji, most frequent first.
    '''
    emojis = [y for msg in chat_df['emojis'] for y in msg]
    emoji_counts = pd.Series(emojis).value_counts()
    if top_n is None:
        return emoji_counts
    return emoji_counts.head(top_n)

In [None]:
def add_value_labels(ax, spacing=5):
    """Write the height of every bar as a label at the end of the bar.

    Arguments:
        ax (matplotlib.axes.Axes): The axes whose bars (``ax.patches``) are
            annotated.
        spacing (int): The distance in points between a bar and its label.
    """

    for bar in ax.patches:
        bar_height = bar.get_height()
        bar_center_x = bar.get_x() + bar.get_width() / 2

        if bar_height < 0:
            # negative bar: label goes below the bar, aligned at its top edge
            offset, vertical_alignment = -spacing, 'top'
        else:
            # positive bar: label goes above the bar, aligned at its bottom edge
            offset, vertical_alignment = spacing, 'bottom'

        # the label is the bar height rounded to a whole number
        ax.annotate(
            "{:.0f}".format(bar_height),
            (bar_center_x, bar_height),
            xytext=(0, offset),
            textcoords="offset points",
            ha='center',
            va=vertical_alignment,
            fontsize=16)

In [None]:
def plot_most_used_emojis(chat_df):
    '''Plot one bar chart per sender with that sender's most used emojis.

    Args:
        chat_df: chat DataFrame with the columns 'sender' and 'emojis'.
    '''
    senders = chat_df['sender'].dropna().unique()

    # squeeze=False always returns a 2D array of axes, so a chat with a
    # single sender works as well
    fig, axes = plt.subplots(len(senders), 1, figsize=(14,8), squeeze=False)
    fig.suptitle('Meistbenutzte Emojis', fontsize=22)
    plt.subplots_adjust(hspace = 0.7)
    for sender_ax, sender in zip(axes[:, 0], senders):
        s = get_most_used_emojis(chat_df[chat_df['sender'] == sender])
        sender_ax.bar(s.index, s.values, label=sender)
        sender_ax.set_xlabel(sender, fontsize=16)
        sender_ax.set_ylabel("Häufigkeit", fontsize=16)
        sender_ax.xaxis.set_tick_params(labelsize=22)
        # the counts are written on top of the bars, so the y axis is hidden
        sender_ax.set_yticks([])
        sender_ax.spines['left'].set_visible(False)
        sender_ax.spines['right'].set_visible(False)
        sender_ax.spines['top'].set_visible(False)
        add_value_labels(sender_ax)
    
    plt.show()

In [None]:
# Most used emojis, one chart per sender.
plot_most_used_emojis(chat_df)

In [None]:
def get_weekday_activity(chat_df: pd.core.frame.DataFrame, weekday:str):
    '''Count the messages sent in every hour of a given weekday.

    Args:
        chat_df: chat DataFrame with the columns 'weekday' and 'hour'.
        weekday: weekday name as it appears in the 'weekday' column
            (e.g. "Monday").

    Returns:
        list of 24 tuples (weekday, hour, message_count), one per hour 0..23.

    Raises:
        ValueError: if the weekday does not occur in the 'weekday' column.
    '''
    
    # check if weekday is valid input
    if weekday not in chat_df['weekday'].values:
        raise ValueError(f"weekday {weekday!r} does not occur in the 'weekday' column of the chat")
        
    # count all hours of that weekday at once instead of filtering 24 times
    hours_on_weekday = chat_df.loc[chat_df['weekday'] == weekday, 'hour']
    messages_per_hour = hours_on_weekday.value_counts()
    
    # hours without any message get a count of 0
    return [(weekday, hour, int(messages_per_hour.get(hour, 0))) for hour in range(24)]

In [None]:
def plot_weekday_activity(chat_df: pd.core.frame.DataFrame):
    '''Plot a heatmap with the chat activity per day and hour.

    Rows are the 24 hours of the day, columns the weekdays Monday..Sunday.
    Weekdays without any message are shown with a count of 0.

    Args:
        chat_df: chat DataFrame with the columns 'weekday' and 'hour'.
    '''
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    
    ## data
    # collect all (weekday, hour, message_count) records and build the frame once
    # (DataFrame.append was removed in pandas 2.0)
    records = []
    for weekday in chat_df['weekday'].unique():
        records.extend(get_weekday_activity(chat_df, weekday))
    weekday_df = pd.DataFrame(records, columns=['weekday', 'hour', 'message_count'])
    
    # get the df into the right form: one row per hour, one column per weekday
    weekday_df = weekday_df.pivot(index='hour', columns='weekday', values='message_count')
    
    # get weekdays into right order
    weekday_df = weekday_df.reindex(columns=weekday_order, fill_value=0)
    
    ## plot
    # figure
    fig, ax = plt.subplots(figsize=(16, 9), dpi=300)
    # plot heatmap
    sns.heatmap(data = weekday_df, linewidth=0.2, cmap='Blues', ax=ax)

    # yticks: heatmap cells are centered at .5 offsets
    yticks_labels = ["{time_1:02}:00 - {time_2:02}:00".format(time_1=x, time_2=x+1) for x in range(24)]
    plt.yticks(np.arange(24) + .5, labels=yticks_labels,rotation=0)
    # xticks
    ax.xaxis.tick_top()
    # axis labels
    plt.xlabel('')
    plt.ylabel('Tageszeit')
    # title
    plt.title("Chataktivität im Tagesverlauf")

In [None]:
# Activity heatmap restricted to the messages sent by 'Charlotte'.
plot_weekday_activity(chat_df[chat_df['sender'] == 'Charlotte'])

In [None]:
# Build the hour x weekday message count table for the whole chat.
# The per-weekday frames are collected first and concatenated once
# (DataFrame.append was removed in pandas 2.0).
weekday_frames = []
for weekday in chat_df['weekday'].unique():
    cache_df = pd.DataFrame(get_weekday_activity(chat_df, weekday), columns=['weekday', 'hour', 'message_count'])
    weekday_frames.append(cache_df)
weekday_df = pd.concat(weekday_frames)

# get the df into the right form
weekday_df = weekday_df.set_index(['hour', 'weekday'])
weekday_df = weekday_df.unstack(fill_value = 0)

# get weekdays into right order
weekday_df = weekday_df.droplevel(0, axis=1)
weekday_df = weekday_df.filter(['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])

In [None]:
# generate_wordcloud expects word frequencies, not the chat DataFrame itself,
# so count the words of the Saturday messages first
generate_wordcloud(get_most_used_words(chat_df[chat_df['weekday'] == 'Saturday']))

In [None]:
# search for interesting messages
# NOTE(review): `interesting_times` is not defined anywhere in the visible notebook.
# It is presumably a DataFrame with the columns 'hour' and 'weekday' -- define it
# before running this cell.
# The matches are collected first and concatenated once
# (DataFrame.append was removed in pandas 2.0).
matching_frames = [
    chat_df[(chat_df['hour'] == col['hour']) & (chat_df['weekday'] == col['weekday'])]
    for idx, col in interesting_times.iterrows()
]
interesting_messages = pd.concat(matching_frames) if matching_frames else pd.DataFrame()

# to be done:

In [None]:
# TODO: replace the row-by-row loops with .apply() / vectorized pandas operations --> way faster and cleaner

In [None]:
# List the columns of the parsed chat DataFrame.
chat_df.columns

In [None]:
# Summary statistics of the time between consecutive messages.
chat_df['time_diff'].describe()

In [None]:
# TODO: a function that classifies each sender as a "one long message" or a "multiple short messages in a row" type

In [None]:
def get_chat_topic_for_peak_chat_activity():
    '''Placeholder: not implemented yet, currently does nothing and returns None.'''
    pass

# date range covering every day from the first to the last message
first_chat_date, last_chat_date = chat_df['date'].min(), chat_df['date'].max()
timeframe = pd.date_range(start=first_chat_date, end=last_chat_date).to_pydatetime()

# number of messages sent on each day of that range
num_chat_per_day = [
    chat_df[chat_df['date'] == day.date()].count().values[0]
    for day in timeframe
]

In [None]:
def plot_media_sent():
    '''Create the (still empty) figure for the media-sent plot.

    The plot itself is not implemented yet; only the figure and its axes are
    created.

    Returns:
        (fig, ax): the new figure and its axes.
    '''
    # plt.figure() returns a single Figure and cannot be unpacked into
    # (fig, ax); plt.subplots() returns both
    fig, ax = plt.subplots(figsize=(16,8), dpi=300)
    return fig, ax
    

In [None]:
# Number of media messages per sender.
chat_df[chat_df['is_media'] == True]['sender'].value_counts()