In [None]:
import pandas as pd
import re
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from typing import List, Tuple
from wordcloud import WordCloud, STOPWORDS
from nltk import ngrams
from scipy.signal import find_peaks
import whatsapp_parser
%matplotlib inline

In [None]:
# Create the parser object for the exported chat file.
# NOTE(review): the meaning of the positional arguments `True` and 'ger' is
# defined by whatsapp_parser.Chat and is not visible here - presumably a flag
# and the chat language; confirm against the parser.
chat = whatsapp_parser.Chat('beispiel_chats/spriddis_chat.txt', True, 'ger')

In [None]:
# Run the parser.
# NOTE(review): presumably this fills chat.chat_raw, chat.senders and
# chat.chat_df, which the cells below read - confirm in whatsapp_parser.
chat.parse_chat()

In [None]:
# Display the first entry of chat.chat_raw.
chat.chat_raw[0]

In [None]:
# Drop every raw entry that contains fewer than three colons.
# The kept entries are collected first and then written back with a slice
# assignment (still an in-place change of the same list object). Deleting
# entries while enumerating the list shifts the remaining items and skips the
# entry that follows each deleted one.
chat.chat_raw[:] = [msg for msg in chat.chat_raw if msg.count(':') >= 3]

In [None]:
# Print every raw entry that still contains fewer than three colons.
for msg in filter(lambda raw_entry: raw_entry.count(':') < 3, chat.chat_raw):
    print(msg)

In [None]:
# Keep only the raw entries that contain at least three colons.
chat.chat_raw = list(filter(lambda raw_entry: raw_entry.count(':') >= 3, chat.chat_raw))

### Print Basic Information

In [None]:
# Summarize the chat: participants, number of messages and covered date range.
print("The chat is between the following senders:")
for position, sender in enumerate(chat.senders, start=1):
    print(f"{position}. {sender}")
print(f"\n{len(chat.chat_df)} messages were exchanged")
first_date = chat.chat_df['date'].min()
last_date = chat.chat_df['date'].max()
print(f"\nThe chat began on {first_date} and currently ends on {last_date}")

In [None]:
# Bar chart of the number of messages written by each sender.
messages_per_sender = chat.chat_df['sender'].value_counts()
messages_per_sender.plot.bar()

# NLP: most frequent words, lemmas and n-grams

In [None]:
def get_most_used_lemmas(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each lemma occurs in the non-media messages.

    Rows whose ``is_media`` value equals ``False`` are used; every entry of
    their ``lemmas`` column is iterated and lemmas that are empty after
    stripping whitespace are ignored.

    Returns a Series mapping lemma -> count, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_media'] == False, 'lemmas']
    collected = []
    for message_lemmas in text_rows:
        for token in message_lemmas:
            if token.strip() != "":
                collected.append(token)
    return pd.Series(collected).value_counts()

In [None]:
def get_most_used_words(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each word occurs in the non-media messages.

    Rows whose ``is_media`` value equals ``False`` are used; every entry of
    their ``words`` column is iterated and words that are empty after
    stripping whitespace are ignored.

    Returns a Series mapping word -> count, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_media'] == False, 'words']
    collected = []
    for message_words in text_rows:
        for token in message_words:
            if token.strip() != "":
                collected.append(token)
    return pd.Series(collected).value_counts()

In [None]:
def get_most_used_nouns(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each noun occurs in the non-media messages.

    Rows whose ``is_media`` value equals ``False`` are used; every entry of
    their ``nouns`` column is iterated and nouns that are empty after
    stripping whitespace are ignored.

    Returns a Series mapping noun -> count, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_media'] == False, 'nouns']
    collected = []
    for message_nouns in text_rows:
        for token in message_nouns:
            if token.strip() != "":
                collected.append(token)
    return pd.Series(collected).value_counts()

In [None]:
def get_most_used_verbs(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each verb occurs in the non-media messages.

    Rows whose ``is_media`` value equals ``False`` are used; every entry of
    their ``verbs`` column is iterated and verbs that are empty after
    stripping whitespace are ignored.

    Returns a Series mapping verb -> count, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_media'] == False, 'verbs']
    collected = []
    for message_verbs in text_rows:
        for token in message_verbs:
            if token.strip() != "":
                collected.append(token)
    return pd.Series(collected).value_counts()

In [None]:
def generate_wordcloud(words_series: pd.Series):
    """Draw a word cloud from a frequency series.

    ``words_series`` is handed to ``WordCloud.generate_from_frequencies``;
    the resulting image is shown on a new 16x8 inch figure with the axes
    switched off.
    """
    # Build the cloud image (2000x1000 px, black background)
    cloud_renderer = WordCloud(width=2000, height=1000, background_color='black')
    rendered_cloud = cloud_renderer.generate_from_frequencies(words_series)

    # Show it on a fresh figure without axis decorations
    plt.subplots(figsize=(16, 8), dpi=400)
    plt.axis("off")
    plt.imshow(rendered_cloud, interpolation="bilinear")

In [None]:
def get_most_used_bigrams(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each bigram occurs in the non-media messages.

    Rows whose ``is_media`` value equals ``False`` are used; every entry of
    their ``bigrams`` column is iterated and its items are counted.

    Returns a Series mapping bigram -> count, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_media'] == False, 'bigrams']
    all_bigrams = [bigram for message_bigrams in text_rows for bigram in message_bigrams]
    return pd.Series(all_bigrams).value_counts()

In [None]:
def get_most_used_trigrams(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each trigram occurs in the non-media messages.

    Rows whose ``is_media`` value equals ``False`` are used; every entry of
    their ``trigrams`` column is iterated and its items are counted.

    Returns a Series mapping trigram -> count, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_media'] == False, 'trigrams']
    collected = []
    for message_trigrams in text_rows:
        for trigram in message_trigrams:
            collected.append(trigram)
    trigram_series = pd.Series(collected)
    return trigram_series.value_counts()

In [None]:
# Show the 20 most frequent words (the series is sorted by count, descending).
get_most_used_words(chat.chat_df)[:20]

In [None]:
# Trigram frequencies over the whole chat.
trigrams = get_most_used_trigrams(chat.chat_df)

In [None]:
# Bigram frequencies over the whole chat.
bigrams = get_most_used_bigrams(chat.chat_df)

In [None]:
# Word cloud sized by lemma frequency.
generate_wordcloud(get_most_used_lemmas(chat.chat_df))

In [None]:
# Horizontal bar chart of the 10 most frequent words; sort_values() orders the
# counts ascending so the most frequent word is drawn as the top bar.
get_most_used_words(chat.chat_df).head(10).sort_values().plot(kind='barh')

In [None]:
# Horizontal bar chart of the 10 most frequent bigrams; sort_values() orders
# the counts ascending so the most frequent bigram is drawn as the top bar.
bigrams.head(10).sort_values().plot(kind='barh')

In [None]:
def get_most_used_curse_words(chat_df: pd.DataFrame,
                              curse_words_path: str = 'curse_words_ger.txt') -> pd.Series:
    """Return the word counts of a chat restricted to known curse words.

    Parameters
    ----------
    chat_df:
        Chat frame that is passed on to ``get_most_used_words``.
    curse_words_path:
        Text file with one curse word per line. Defaults to the file name
        that was previously hard-coded.

    Returns
    -------
    pd.Series
        Subset of ``get_most_used_words(chat_df)`` (word -> count, original
        order kept) whose words appear in the curse word file.
    """
    with open(curse_words_path, 'r') as infile:
        # strip() also removes '\r' and stray spaces, not only the newline
        curse_words = {line.strip() for line in infile}
    # blank lines in the file must not count as a word
    curse_words.discard('')

    words = get_most_used_words(chat_df)
    # set membership via isin() instead of scanning a list once per word
    return words[words.index.isin(curse_words)]

In [None]:
# Word cloud sized by curse word frequency.
generate_wordcloud(get_most_used_curse_words(chat.chat_df))

# Basic statistics and plots

In [None]:
def plot_curse_word_usage(chat_df: pd.core.frame.DataFrame):
    """Plot curse word counts and curse rates per sender side by side.

    For every sender the total number of curse words (sum of
    ``get_most_used_curse_words`` over that sender's rows) and the sender's
    message count are collected; the curse rate is curse words divided by
    messages. Both bar plots list the eight senders with the most messages
    and share the y axis.
    """
    total_msg_counts = chat_df['sender'].value_counts()
    top_senders = total_msg_counts.iloc[:8].index

    # one (sender, curse word total, message total) row per sender
    rows = [
        (name,
         get_most_used_curse_words(chat_df[chat_df['sender'] == name]).sum(),
         total_msg_counts[name])
        for name in chat_df['sender'].unique()
    ]
    curse_df = pd.DataFrame(rows, columns=['sender', 'curse_word_count', 'total_msg_count'])
    curse_df['curse_rate'] = curse_df['curse_word_count'] / curse_df['total_msg_count']

    fig, (count_ax, rate_ax) = plt.subplots(1, 2, sharey=True, figsize=(16, 8))
    plt.subplots_adjust(wspace=0.05)
    fig.suptitle('Who is the biggest pig in the chat?')
    count_ax.set_title('Curse word count')
    rate_ax.set_title('Curse rates')

    sns.barplot(ax=count_ax,
                data=curse_df,
                x='curse_word_count',
                y='sender',
                order=top_senders)

    rate_plot = sns.barplot(ax=rate_ax,
                            data=curse_df,
                            x='curse_rate',
                            y='sender',
                            order=top_senders)
    # the shared y axis is already labelled by the left plot
    rate_plot.set(ylabel=None)

    plt.show()
    
    

In [None]:
# Curse word count and curse rate per sender.
plot_curse_word_usage(chat.chat_df)

In [None]:
def plot_url_usage(chat_df: pd.core.frame.DataFrame):
    """Plot how often each URL domain was shared, split by sender.

    Shows the 15 most frequent values of ``url_domain`` on the x axis and
    colours the bars by the eight senders with the most messages.
    """
    top_domains = chat_df['url_domain'].value_counts()[:15].index
    top_senders = chat_df['sender'].value_counts()[:8].index

    plt.figure(figsize=(16, 8), dpi=400)
    domain_plot = sns.countplot(data=chat_df,
                                x='url_domain',
                                order=top_domains,
                                hue='sender',
                                hue_order=top_senders)
    # tilt the domain names
    domain_plot.set_xticklabels(domain_plot.get_xticklabels(), rotation=30)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Shared URL domains per sender.
plot_url_usage(chat.chat_df)

In [None]:
def get_mean_messages_per_day(chat_df: pd.DataFrame) -> float:
    """Return the mean number of messages per calendar day.

    Every day between the first and the last value of ``chat_df['date']`` is
    taken into account; days without any message count as zero.

    Parameters
    ----------
    chat_df:
        Chat frame with a ``date`` column (anything ``pd.to_datetime``
        accepts, e.g. timestamps or ``datetime.date`` objects).

    Returns
    -------
    float
        Mean messages per day, or ``0.0`` if there is no dated message.
    """
    # normalize to midnight so that all messages of one day share one key
    message_dates = pd.to_datetime(chat_df['date']).dt.normalize().dropna()
    if message_dates.empty:
        return 0.0

    # every calendar day of the chat, including days without messages
    all_days = pd.date_range(start=message_dates.min(), end=message_dates.max(), freq='D')

    # messages per day; days that are missing in the chat are filled with 0
    messages_per_day = message_dates.value_counts().reindex(all_days, fill_value=0)

    return float(messages_per_day.mean())

In [None]:
def plot_daily_activity(chat_df: pd.DataFrame):
    """Plot the number of messages per day with mean line and peak markers.

    Every calendar day between the first and the last value of
    ``chat_df['date']`` is shown; days without messages count as zero.
    Peaks are detected with ``scipy.signal.find_peaks`` and drawn as red
    crosses.
    """
    # normalize to midnight so that all messages of one day share one key
    message_dates = pd.to_datetime(chat_df['date']).dt.normalize()

    # every calendar day of the chat, including days without messages
    all_days = pd.date_range(start=message_dates.min(), end=message_dates.max(), freq='D')

    # messages per day; days that are missing in the chat are filled with 0
    messages_per_day = message_dates.value_counts().reindex(all_days, fill_value=0)
    mean_per_day = messages_per_day.mean()

    fig, ax = plt.subplots(figsize=(16,8), dpi=400)
    # plot chat activity by day
    ax.plot(messages_per_day.index, messages_per_day.to_numpy(), label='Number of messages per day')
    # plot mean value as a dashed horizontal line over the same dates
    ax.plot(messages_per_day.index,
            np.full(len(messages_per_day), mean_per_day),
            '--',
            linewidth=3,
            label=f'Mean: {mean_per_day:.3f}')

    # peak detection: peaks must be at least 1/30 of the covered days apart
    # (never less than one sample) and exceed both neighbours by 0.9
    peaks = find_peaks(messages_per_day.to_numpy(),
                       distance=max(len(all_days) / 30, 1),
                       threshold=0.9)[0]
    # find_peaks returns positions, so select them positionally with iloc
    peak_counts = messages_per_day.iloc[peaks]
    ax.plot(peak_counts.index, peak_counts.to_numpy(), "xr")

    # add axis labels and legend
    ax.set_ylabel('Number of messages')
    ax.set_xlabel('Date')
    ax.set_title('Number Of Messages Per Day')
    ax.legend(title_fontsize='large')

In [None]:
# Messages per day with mean line and detected peaks.
plot_daily_activity(chat.chat_df)

In [None]:
def plot_time_to_reply_weekdays(chat_df: pd.DataFrame):
    """
    Box plot of the time to reply in minutes, grouped by sender and split by
    weekday, for the eight senders with the most messages (outliers hidden)
    """
    weekday_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    busiest_senders = chat_df['sender'].value_counts().iloc[:8].index
    reply_minutes = chat_df['reply_time_seconds']/60

    plt.subplots(figsize=(16,8))
    box_ax = sns.boxplot(y=reply_minutes,
                         x=chat_df['sender'],
                         order=busiest_senders,
                         hue=chat_df['weekday'],
                         hue_order=weekday_names,
                         showfliers=False)
    box_ax.set_title('Time to Reply in Minutes by Day of the Week')
    box_ax.set_ylabel('Time to Reply in Minutes')
    box_ax.set_xlabel('Sender')
    # put the weekday legend to the right of the axes
    box_ax.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)

    plt.show()

In [None]:
def get_weekday_activity(chat_df: pd.DataFrame, weekday: str) -> List[Tuple[str, int, int]]:
    """
    Get the number of messages sent in each hour of a given weekday.

    Parameters
    ----------
    chat_df:
        Chat frame with a ``weekday`` column and an ``hour`` column.
    weekday:
        Weekday to evaluate. English weekday names are always accepted, as is
        any other value that occurs in ``chat_df['weekday']``.

    Returns
    -------
    list of (weekday, hour, message_count)
        One tuple for each hour 0..23; hours without messages have count 0.

    Raises
    ------
    ValueError
        If ``weekday`` is neither an English weekday name nor present in
        ``chat_df['weekday']``.
    """
    valid_weekdays = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    # a valid weekday without any message is not an error, it simply yields zeros
    if weekday not in valid_weekdays and weekday not in chat_df['weekday'].values:
        raise ValueError("Weekday is not in proper format. "
                         f"Use one of: {valid_weekdays}")

    # hour of every message that was sent on the requested weekday
    hours_on_weekday = chat_df.loc[chat_df['weekday'] == weekday, 'hour']

    # count rows per hour; counting rows (not non-null cells of the first
    # column) keeps the result independent of missing values elsewhere
    return [(weekday, hour, int((hours_on_weekday == hour).sum())) for hour in range(24)]

In [None]:
def plot_weekday_activity(chat_df: pd.DataFrame):
    """
    Plot a heatmap with the number of messages per weekday and hour of the day.

    Rows are the 24 hours of the day, columns the weekdays from Monday to
    Sunday (weekdays that do not occur in ``chat_df['weekday']`` are left out).
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday',
                     'Thursday', 'Friday', 'Saturday', 'Sunday']

    ## data
    # collect all (weekday, hour, message_count) rows first and build the
    # frame once; DataFrame.append no longer exists in pandas >= 2.0
    records = []
    for weekday in chat_df['weekday'].unique():
        records.extend(get_weekday_activity(chat_df, weekday))
    weekday_df = pd.DataFrame(records, columns=['weekday', 'hour', 'message_count'])

    # hours as rows, weekdays as columns, weekdays in calendar order
    weekday_df = (weekday_df
                  .set_index(['hour', 'weekday'])
                  .unstack(fill_value=0)
                  .droplevel(0, axis=1)
                  .filter(weekday_order))

    ## plot
    # figure
    fig, ax = plt.subplots(figsize=(16, 9), dpi=300)
    # plot heatmap
    sns.heatmap(data=weekday_df, linewidth=0.2, cmap='Blues', ax=ax)

    # yticks: one label per hour slot, centred on the heatmap row
    yticks_labels = ["{time_1:02}:00 - {time_2:02}:00".format(time_1=hour_of_the_day, time_2=hour_of_the_day+1)
                     for hour_of_the_day in range(24)]
    ax.set_yticks(np.arange(24) + .5)
    ax.set_yticklabels(yticks_labels, rotation=0)
    # xticks
    ax.xaxis.tick_top()
    # axis labels
    ax.set_xlabel('')
    ax.set_ylabel('Time Of the Day')
    # title
    ax.set_title("Chatting Activity during Days of the Week")

In [None]:
# Heatmap of message counts per weekday and hour.
plot_weekday_activity(chat.chat_df)

In [None]:
# Reply time per sender, split by weekday.
plot_time_to_reply_weekdays(chat.chat_df)

In [None]:
def plot_time_to_reply(chat_df: pd.DataFrame):
    """
    Plot a boxplot of the answer time of the top eight senders of the chat.

    The eight senders with the most messages are shown in alphabetical order.
    The reply time is ``reply_time_seconds`` converted to minutes; outliers
    are hidden and the median is written above each box's median line.
    """
    # top eight senders, sorted so that the boxes appear in alphabetical order
    sender_order = chat_df['sender'].value_counts().iloc[:8].index.sort_values()

    median_answer_times = (chat_df[chat_df['sender'].isin(sender_order)]
                           .groupby('sender')['reply_time_seconds'].median() / 60)

    fig1, ax1 = plt.subplots(figsize=(16,8))
    sns.boxplot(y=chat_df['reply_time_seconds']/60,
                x=chat_df['sender'],
                order=sender_order,
                showfliers=False,
                ax=ax1)

    # the box of the i-th sender in `order` is drawn at x position i; look the
    # median up by sender name instead of by integer position
    for position, sender in enumerate(sender_order):
        median_minutes = median_answer_times.get(sender)
        if median_minutes is None or pd.isna(median_minutes):
            # no reply time recorded for this sender: nothing to annotate
            continue
        ax1.text(position, median_minutes + 0.1, np.around(median_minutes, 3),
                 horizontalalignment='center', size='large', color='black', weight='semibold')

    ax1.set_title('Time to Reply in Minutes')
    ax1.set_ylabel('Time to Reply in Minutes')
    ax1.set_xlabel('Sender')
    plt.show()

In [None]:
# Reply time per sender with annotated medians.
plot_time_to_reply(chat.chat_df)

In [None]:
def longest_time_of_no_chatting(chat_df: pd.DataFrame) -> datetime.timedelta:
    """Return the largest value of the ``time_diff`` column."""
    gaps_between_messages = chat_df['time_diff']
    longest_gap = gaps_between_messages.max()
    return longest_gap

In [None]:
# Largest value of the time_diff column.
longest_time_of_no_chatting(chat.chat_df)

In [None]:
def get_initiator_percentage(chat_df: pd.DataFrame) -> pd.Series:
    """Return each sender's share of the messages of type 'initiation'.

    The shares are fractions between 0 and 1 (``value_counts(normalize=True)``),
    not values scaled to 100.
    """
    is_initiation = chat_df['message_type'] == 'initiation'
    initiators = chat_df.loc[is_initiation, 'sender']
    return initiators.value_counts(normalize=True)

In [None]:
# Share of 'initiation' messages per sender (fractions of 1).
get_initiator_percentage(chat.chat_df)

In [None]:
def get_most_used_emojis(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each emoji occurs across all messages.

    Every entry of the ``emojis`` column is iterated and its items are
    counted. Returns a Series mapping emoji -> count, most frequent first.
    """
    collected = []
    for message_emojis in chat_df['emojis']:
        collected.extend(message_emojis)
    return pd.Series(collected).value_counts()

In [None]:
# Emoji counts over the whole chat, most frequent first.
get_most_used_emojis(chat.chat_df)

In [None]:
def plot_emoji_usage(chat_df: pd.DataFrame):
    """
    Takes chat_df and creates a bar plot of the most used emojis per sender.

    The x axis shows the ten most used emojis of the whole chat, the bars are
    coloured by the eight senders with the most messages.
    """
    # collect one (sender, emoji, count) row per sender and emoji and build
    # the frame once; DataFrame.append no longer exists in pandas >= 2.0
    records = []
    for sender in chat_df['sender'].unique():
        emojis_sender = get_most_used_emojis(chat_df[chat_df['sender'] == sender])
        records.extend((sender, emoji, count) for emoji, count in emojis_sender.items())
    emoji_df = pd.DataFrame(records, columns=['sender', 'emoji', 'emoji_count'])

    fig, ax = plt.subplots(figsize=(16,8))
    g = sns.barplot(data=emoji_df,
                    x='emoji',
                    y='emoji_count',
                    order=get_most_used_emojis(chat_df).head(10).index,
                    hue='sender',
                    hue_order=chat_df['sender'].value_counts().iloc[:8].index,
                    ax=ax,
                    )

    g.xaxis.set_tick_params(labelsize=26)
    g.yaxis.set_tick_params(labelsize=16)
    g.set(xlabel=None, ylabel='Emoji Count')
    g.legend(fontsize=16)
    # the emojis are the x tick labels, so the emoji font belongs on the x axis
    # NOTE(review): "Segoe UI Emoji" is a Windows font - confirm it is
    # available on the machine that renders the plot
    for tick_label in g.get_xticklabels():
        tick_label.set_fontname("Segoe UI Emoji")

In [None]:
# Most used emojis per sender.
plot_emoji_usage(chat.chat_df)

# Chat activity by time

# TODO:
### display and determine topics on peak chat activities

In [None]:
import nltk
import spacy
from gensim import corpora
import gensim
from spacy.lang.de import German

In [None]:
def get_message_peak_dates(chat_df: pd.core.frame.DataFrame) -> np.ndarray:
    """
    Takes chat_df and returns the dates of peaks in messages sent.

    The messages are counted per calendar day between the first and the last
    value of ``chat_df['date']`` (days without messages count as zero) and
    the peaks of that daily series are detected with
    ``scipy.signal.find_peaks``.

    Returns
    -------
    np.ndarray
        Array of ``datetime.date`` objects, one per detected peak (empty if
        the chat contains no dated message).
    """
    # normalize to midnight so that all messages of one day share one key
    message_dates = pd.to_datetime(chat_df['date']).dt.normalize().dropna()
    if message_dates.empty:
        return np.array([], dtype=object)

    # every calendar day of the chat, including days without messages
    all_days = pd.date_range(start=message_dates.min(), end=message_dates.max(), freq='D')

    # messages per day; days that are missing in the chat are filled with 0
    messages_per_day = message_dates.value_counts().reindex(all_days, fill_value=0)

    # peak detection: peaks must be at least 1/30 of the covered days apart
    # (never less than one sample) and exceed both neighbours by 0.9
    peaks = find_peaks(messages_per_day.to_numpy(),
                       distance=max(len(all_days) / 30, 1),
                       threshold=0.9)[0]

    # find_peaks returns positions, so select them positionally with iloc
    peak_index = messages_per_day.iloc[peaks].index
    # convert the DatetimeIndex to an array of datetime.date
    return pd.to_datetime(peak_index).date

In [None]:
# Dates on which the daily message count peaks.
peak_dates = get_message_peak_dates(chat.chat_df)

In [None]:
# Display the detected peak dates.
peak_dates

In [None]:
def calc_pmi(chat_df: pd.DataFrame) -> pd.Series:
    """Compute the pointwise mutual information (PMI) of every bigram.

    PMI(x, y) = ln( p(x, y) / (p(x) * p(y)) ), where p(x, y) is the relative
    frequency of the bigram among all bigram occurrences and p(x), p(y) are
    the relative frequencies of its two components among all lemma
    occurrences.

    Bigrams with a component that is missing from the lemma counts are
    skipped, because no probability can be looked up for them.

    Returns
    -------
    pd.Series
        ``str(bigram)`` -> PMI, sorted from highest to lowest.
    """
    bigram_counts = get_most_used_bigrams(chat_df)
    lemma_counts = get_most_used_lemmas(chat_df)

    # relative frequencies: divide by the total number of occurrences, not by
    # the number of distinct items, so that each table sums to 1
    bigram_probs = bigram_counts / bigram_counts.sum()
    lemma_probs = (lemma_counts / lemma_counts.sum()).to_dict()

    pmi_by_bigram = {}
    for bigram, joint_prob in bigram_probs.items():
        first_prob = lemma_probs.get(bigram[0])
        second_prob = lemma_probs.get(bigram[1])
        if first_prob is None or second_prob is None:
            continue
        pmi_by_bigram[str(bigram)] = np.log(joint_prob / (first_prob * second_prob))

    return pd.Series(pmi_by_bigram, dtype=float).sort_values(ascending=False)

In [None]:
# PMI of every bigram over the whole chat.
all_pmi = calc_pmi(chat.chat_df)

In [None]:
# Bigram frequencies restricted to the messages sent on the peak dates.
# NOTE(review): isin() only matches if chat_df['date'] holds values that
# compare equal to the datetime.date objects in peak_dates - confirm.
peak_bigrams = get_most_used_bigrams(chat.chat_df[chat.chat_df['date'].isin(peak_dates)])