In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import urllib
import emoji as emoji_util
import re
import spacy
from wordcloud import WordCloud, STOPWORDS
from nltk import ngrams
import chart_studio.plotly as py
import cufflinks as cf
import plotly.express as px
import plotly.graph_objects as go
from scipy.signal import find_peaks
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=False)
cf.go_offline()
%matplotlib inline
import whatsapp_parser

In [None]:
# Register every font file that findSystemFonts reports for fontext='ttf'
# with matplotlib's global font manager.
import matplotlib.font_manager
font_files = matplotlib.font_manager.findSystemFonts(fontpaths=None, fontext='ttf')
for font_file in font_files:
    matplotlib.font_manager.fontManager.addfont(font_file)

In [None]:
# Create the chat object for the exported chat file.
# NOTE(review): the meaning of the positional arguments True and 'ger' is defined
# in whatsapp_parser.Chat - confirm there.
chat = whatsapp_parser.Chat('beispiel_chats/asd.txt', True, 'ger')

In [None]:
# Parse the chat file; presumably this fills chat.chat_df, which the following cells read.
chat.parse_chat()

In [None]:
# Bind the parsed frame to `chat_df`: the analysis cells further down refer to this
# name, which was never assigned, so a fresh "Restart & Run All" failed with a NameError.
chat_df = chat.chat_df
chat_df

In [None]:
# Show only the rows whose 'is_image' flag is True.
chat.chat_df[chat.chat_df['is_image'] == True]

### Print Basic Information

In [None]:
# TODO: print basic information about the chat

In [None]:
# Show only the rows whose 'is_image' flag is True.
# NOTE(review): this repeats the image filter cell placed before the
# "Print Basic Information" heading - one of the two can probably be removed.
chat.chat_df[chat.chat_df['is_image'] == True]

In [None]:
# Number of rows in the parsed chat frame.
len(chat.chat_df)

# NLP: word, n-gram and curse-word frequencies

In [None]:
def get_most_used_lemmas(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each token occurs in the non-image messages of a chat.

    NOTE(review): the tokens are read from the 'words' column, exactly as in
    get_most_used_words - confirm whether a dedicated lemma column was intended.

    Args:
        chat_df: frame with an 'is_image' flag column and a 'words' column that
            holds one iterable of tokens per message.

    Returns:
        Series mapping token -> number of occurrences, most frequent first.
    """
    text_rows = chat_df.loc[chat_df['is_image'] == False, 'words']
    lemmas = []
    for msg in text_rows:
        lemmas.extend(msg)
    return pd.Series(lemmas).value_counts()

In [None]:
def get_most_used_words(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each word occurs in the non-image messages of a chat.

    Args:
        chat_df: frame with an 'is_image' flag column and a 'words' column that
            holds one iterable of words per message.

    Returns:
        Series mapping word -> number of occurrences, most frequent first.
    """
    is_text_message = chat_df['is_image'] == False
    words = []
    for msg in chat_df[is_text_message]['words']:
        for word in msg:
            words.append(word)
    return pd.Series(words).value_counts()

In [None]:
def get_most_used_nouns(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each token occurs in the non-image messages of a chat.

    NOTE(review): no part-of-speech filtering happens here; the tokens come
    straight from the 'words' column - confirm whether a noun column was intended.

    Args:
        chat_df: frame with an 'is_image' flag column and a 'words' column that
            holds one iterable of tokens per message.

    Returns:
        Series mapping token -> number of occurrences, most frequent first.
    """
    text_messages = chat_df[chat_df['is_image'] == False]
    nouns = []
    for msg in text_messages['words']:
        nouns += list(msg)
    return pd.Series(nouns).value_counts()

In [None]:
def get_most_used_verbs(chat_df: pd.DataFrame) -> pd.Series:
    """Count how often each token occurs in the non-image messages of a chat.

    NOTE(review): no part-of-speech filtering happens here; the tokens come
    straight from the 'words' column - confirm whether a verb column was intended.

    Args:
        chat_df: frame with an 'is_image' flag column and a 'words' column that
            holds one iterable of tokens per message.

    Returns:
        Series mapping token -> number of occurrences, most frequent first.
    """
    not_an_image = chat_df['is_image'] == False
    token_lists = chat_df.loc[not_an_image, 'words']
    verbs = []
    for tokens in token_lists:
        verbs.extend(verb for verb in tokens)
    return pd.Series(verbs).value_counts()

In [None]:
def generate_wordcloud(words_series: pd.Series):
    """Draw a word cloud for the given word frequencies.

    Args:
        words_series: frequencies handed to WordCloud.generate_from_frequencies
            (word -> count).
    """
    # build the 2000x1000 cloud image on a black background
    cloud = WordCloud(width=2000, height=1000, background_color='black')
    wordcloud = cloud.generate_from_frequencies(words_series)

    # show the image on a frameless axes
    fig, ax = plt.subplots(figsize=(16, 8), dpi=400)
    ax.axis("off")
    ax.imshow(wordcloud, interpolation="bilinear")

In [None]:
def get_most_used_bigrams(chat_df: pd.DataFrame) -> pd.Series:
    '''Count how often each bigram occurs across all messages of a chat.

    Args:
        chat_df: frame with a 'bigrams' column that holds one iterable of
            bigrams per message.

    Returns:
        Series mapping bigram -> number of occurrences, most frequent first.
    '''
    # the argument is indexed by column name, i.e. it is a DataFrame (the old
    # annotation claimed a Series)
    bigrams = pd.Series([bigram for sublist in chat_df['bigrams'] for bigram in sublist])
    return bigrams.value_counts()

In [None]:
def get_most_used_trigrams(chat_df: pd.DataFrame) -> pd.Series:
    '''Count how often each trigram occurs across all messages of a chat.

    Args:
        chat_df: frame with a 'trigrams' column that holds one iterable of
            trigrams per message.

    Returns:
        Series mapping trigram -> number of occurrences, most frequent first.
    '''
    all_trigrams = []
    for message_trigrams in chat_df['trigrams']:
        all_trigrams.extend(message_trigrams)
    return pd.Series(all_trigrams).value_counts()

In [None]:
# Trigram frequencies over all rows of the parsed chat.
trigrams = get_most_used_trigrams(chat.chat_df)

In [None]:
# Bigram frequencies over all rows of the parsed chat.
bigrams = get_most_used_bigrams(chat.chat_df)

In [None]:
# The 30 most frequent bigrams.
bigrams.head(30)

In [None]:
# The 50 most frequent trigrams.
trigrams.head(50)

In [None]:
# Horizontal bar chart of the 30 most frequent bigrams; sorting ascending and
# taking the tail puts the most frequent bigram last in the plotted series.
plt.figure(figsize=(16,8))
bigrams.sort_values(ascending=True).tail(30).plot(kind='barh')

In [None]:
def get_most_used_curse_words(chat_df, curse_word_file='schimpfwortliste.txt'):
    """Count how often each curse word from a word list is used in a chat.

    Args:
        chat_df: chat frame, passed on unchanged to get_most_used_words.
        curse_word_file: path of a text file with one curse word per line.
            It is opened with the platform's default encoding.

    Returns:
        The entries of get_most_used_words(chat_df) whose word appears in the
        word list (word -> number of occurrences).
    """
    with open(curse_word_file, 'r') as infile:
        # a set gives constant-time membership tests for the filter below
        curse_words = {line.rstrip('\n') for line in infile}
    # a blank line in the file must not turn the empty string into a "curse word"
    curse_words.discard('')

    words = get_most_used_words(chat_df)
    return words[words.index.isin(curse_words)]

In [None]:
# Word cloud of the curse words used in the chat, sized by how often each one occurs.
generate_wordcloud(get_most_used_curse_words(chat_df))

In [None]:
# All distinct senders in the chat.
chat_df['sender'].unique()

# Basic statistics and plots

In [None]:
def plot_curse_word_usage(chat_df: pd.core.frame.DataFrame):
    """Plot curse word counts and curse rates of the eight most active senders.

    The left panel shows the absolute number of curse words per sender, the
    right panel the number of curse words divided by the sender's message count.

    Args:
        chat_df: chat frame with a 'sender' column; the per-sender slices are
            passed to get_most_used_curse_words.
    """
    total_msg_counts = chat_df['sender'].value_counts()
    # Only the eight most active senders are drawn, so only they are analysed;
    # every call to get_most_used_curse_words re-reads the word list.
    top_senders = total_msg_counts.iloc[:8].index

    curse_word_sender_list = [
        (sender,
         get_most_used_curse_words(chat_df[chat_df['sender'] == sender]).sum(),
         total_msg_counts[sender])
        for sender in top_senders
    ]
    curse_df = pd.DataFrame(curse_word_sender_list,
                            columns=['sender', 'curse_word_count', 'total_msg_count'])
    curse_df['curse_rate'] = curse_df['curse_word_count'] / curse_df['total_msg_count']

    # two panels sharing the sender axis
    fig, ax = plt.subplots(1, 2, sharey=True, figsize=(16, 8))
    fig.subplots_adjust(wspace=0.05)
    fig.suptitle('Who is the biggest pig in the chat?')

    ax[0].set_title('Curse word count')
    ax[1].set_title('Curse rates')

    sns.barplot(ax=ax[0],
                data=curse_df,
                x='curse_word_count',
                y='sender',
                order=top_senders)

    curse_word_rate = sns.barplot(ax=ax[1],
                                  data=curse_df,
                                  x='curse_rate',
                                  y='sender',
                                  order=top_senders)
    # the sender label is already shown on the left panel
    curse_word_rate.set(ylabel=None)

    plt.show()
    
    

In [None]:
# Draw the curse word count and curse rate charts for the whole chat.
plot_curse_word_usage(chat_df)

In [None]:
def plot_url_usage(chat_df: pd.core.frame.DataFrame):
    """Plot how often the 15 most shared URL domains were posted, split by sender.

    Only the eight senders with the most messages get a bar colour.

    Args:
        chat_df: chat frame with 'url_domain' and 'sender' columns.
    """
    top_domains = chat_df['url_domain'].value_counts()[:15].index
    top_senders = chat_df['sender'].value_counts()[:8].index

    fig = plt.figure(figsize=(16,8), dpi=400)
    domain_axes = sns.countplot(
        data=chat_df,
        x='url_domain',
        order=top_domains,
        hue='sender',
        hue_order=top_senders,
    )
    # tilt the domain names so that they do not overlap
    tick_labels = domain_axes.get_xticklabels()
    domain_axes.set_xticklabels(tick_labels, rotation=30)
    plt.legend(loc='upper right')
    plt.show()

In [None]:
# Draw the URL domain chart for the whole chat.
plot_url_usage(chat_df)

In [None]:
def get_mean_messages_per_day(chat_df: pd.core.frame.DataFrame):
    """Return the mean number of messages per calendar day.

    Every day between the first and the last message counts, including days
    without any message.

    Args:
        chat_df: chat frame with a 'date' column (anything pd.to_datetime
            accepts, e.g. datetime.date objects or timestamps).

    Returns:
        Mean of the daily message counts as a float.
    """
    # Normalise to midnight timestamps so that the counts and the calendar below
    # share one index type; adding a Series indexed by date objects to one indexed
    # by a DatetimeIndex relies on index alignment that is not guaranteed to match.
    message_days = pd.to_datetime(chat_df['date']).dt.normalize()

    # every calendar day from the first to the last message
    all_days = pd.date_range(start=message_days.min(), end=message_days.max(), freq='D')

    # messages per day, with 0 for days on which nothing was sent
    messages_per_day = message_days.value_counts().reindex(all_days, fill_value=0)

    return messages_per_day.mean()

In [None]:
def plot_daily_activity(chat_df: pd.core.frame.DataFrame):
    """Plot the number of messages per day together with its mean and detected peaks.

    Args:
        chat_df: chat frame with a 'date' column (anything pd.to_datetime
            accepts, e.g. datetime.date objects or timestamps).
    """
    # Normalise to midnight timestamps so that the counts and the calendar below
    # share one index type.
    message_days = pd.to_datetime(chat_df['date']).dt.normalize()

    # every calendar day from the first to the last message
    all_days = pd.date_range(start=message_days.min(), end=message_days.max(), freq='D')

    # messages per day, with 0 for days on which nothing was sent
    messages_per_day = message_days.value_counts().reindex(all_days, fill_value=0)
    mean_per_day = messages_per_day.mean()

    fig, ax = plt.subplots(figsize=(16,8), dpi=400)
    # plot chat activity by day
    ax.plot(messages_per_day.index, messages_per_day.values, label='Anzahl Nachrichten pro Tag')
    # plot the mean as a dashed horizontal line over the same dates
    ax.plot(messages_per_day.index,
            np.full(len(messages_per_day.index), mean_per_day),
            '--',
            linewidth=3,
            label=f'Durchschnitt: {mean_per_day:.3f}')

    # peak detection: find_peaks rejects a distance below 1, which happened for
    # chats spanning fewer than 30 days
    min_peak_distance = max(1, len(all_days) / 30)
    peaks = find_peaks(messages_per_day.values, distance=min_peak_distance, threshold=0.9)[0]
    # find_peaks returns positions, so select positionally
    peak_counts = messages_per_day.iloc[peaks]
    ax.plot(peak_counts.index, peak_counts.values, "xr")

    # add axis labels and legend
    ax.set_ylabel('Anzahl Nachrichten')
    ax.set_xlabel('Datum')
    ax.set_title('Nachrichten im Zeitverlauf')
    ax.legend(title_fontsize='large')

In [None]:
# Draw the messages-per-day timeline for the whole chat.
plot_daily_activity(chat_df)

In [None]:
# Mean number of messages per day over the whole chat.
get_mean_messages_per_day(chat_df)

In [None]:
def plot_time_to_answer_weekdays(chat_df: pd.core.frame.DataFrame):
    """
    Plot the answer time in minutes per weekday for the eight most active senders.

    Outliers are hidden (showfliers=False).

    Args:
        chat_df: chat frame with 'answer_time_seconds', 'sender' and 'weekday' columns.
    """
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    top_senders = chat_df['sender'].value_counts().iloc[:8].index
    answer_minutes = chat_df['answer_time_seconds']/60

    fig1, ax1 = plt.subplots(figsize=(16,8))
    sns.boxplot(ax=ax1,
                y=answer_minutes,
                x=chat_df['sender'],
                order=top_senders,
                hue=chat_df['weekday'],
                hue_order=weekday_order,
                showfliers=False)
    ax1.set_title('Antwortzeit in Minuten')
    ax1.set_ylabel('Antwortzeit in Minuten')
    ax1.set_xlabel('Sender')
    # place the weekday legend outside the axes, to the right
    ax1.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)

    plt.show()

In [None]:
# Draw the per-weekday answer time box plot for the whole chat.
plot_time_to_answer_weekdays(chat_df)

In [None]:
def plot_time_to_answer(chat_df: pd.core.frame.DataFrame):
    """
    Plot a boxplot of the answer time of the top eight senders of the chat.

    The boxes are ordered alphabetically by sender and each box is annotated
    with its median answer time in minutes. Outliers are hidden.

    Args:
        chat_df: chat frame with 'answer_time_seconds' and 'sender' columns.
    """
    top_senders = chat_df['sender'].value_counts()[:8].index.sort_values()

    # Median per sender, explicitly re-ordered to the plotted order. The labels
    # used to be looked up with the integer tick position on this sender-labelled
    # Series, which only worked through pandas' deprecated positional fallback.
    median_answer_times = (
        chat_df[chat_df['sender'].isin(top_senders)]
        .groupby('sender')['answer_time_seconds']
        .median()
        .div(60)
        .reindex(top_senders)
    )

    fig1, ax1 = plt.subplots(figsize=(16,8))
    sns.boxplot(ax=ax1,
                y=chat_df['answer_time_seconds']/60,
                x=chat_df['sender'],
                order=top_senders,
                showfliers=False)

    # categorical boxes sit at x = 0, 1, 2, ... in the order given above
    for position, sender in enumerate(top_senders):
        median = median_answer_times[sender]
        if pd.isna(median):
            # sender without any answer time: nothing to annotate
            continue
        ax1.text(position, median + 0.1, np.around(median, 3),
                 horizontalalignment='center', size='large', color='w', weight='semibold')

    ax1.set_title('Antwortzeit in Minuten')
    ax1.set_ylabel('Antwortzeit in Minuten')
    ax1.set_xlabel('Sender')
    plt.show()

In [None]:
# Draw the answer time box plot for the whole chat.
plot_time_to_answer(chat_df)

In [None]:
def get_mean_time_to_answer(chat_df: pd.core.frame.DataFrame):
    """Return the mean of the 'answer_time' column.

    NOTE(review): the plotting functions read 'answer_time_seconds' instead -
    confirm that an 'answer_time' column exists as well.
    """
    answer_times = chat_df['answer_time']
    return answer_times.mean()

In [None]:
def longest_time_of_no_chatting(chat_df):
    """Return the largest value of the 'time_diff' column.

    Presumably 'time_diff' holds the gap between consecutive messages, which
    makes this the longest silence in the chat - verify against the parser.
    """
    time_gaps = chat_df['time_diff']
    return time_gaps.max()

In [None]:
# Largest 'time_diff' value in the whole chat.
longest_time_of_no_chatting(chat_df)

In [None]:
def get_initiator_percentage(chat_df):
    """Return each sender's share of the messages typed 'initiation'.

    Returns:
        Series mapping sender -> fraction (0..1) of all rows whose
        'message_type' is 'initiation', largest share first.
    """
    is_initiation = chat_df['message_type'] == 'initiation'
    initiators = chat_df[is_initiation]['sender']
    return initiators.value_counts(normalize=True)

In [None]:
# Share of 'initiation' messages per sender for the whole chat.
get_initiator_percentage(chat_df)

In [None]:
def get_most_used_emojis(chat_df: pd.core.frame.DataFrame):
    """Count how often each emoji occurs across all messages of a chat.

    Args:
        chat_df: frame with an 'emojis' column that holds one iterable of
            emojis per message.

    Returns:
        Series mapping emoji -> number of occurrences, most frequent first.
    """
    emojis = []
    for message_emojis in chat_df['emojis']:
        emojis.extend(message_emojis)
    return pd.Series(emojis).value_counts()

In [None]:
# Emoji frequencies for the whole chat.
get_most_used_emojis(chat_df)

In [None]:
def plot_emoji_usage(chat_df: pd.core.frame.DataFrame):
    """
    Plot how often each sender used the ten most frequent emojis of the chat.

    Only the eight senders with the most messages get a bar colour.

    Args:
        chat_df: chat frame with a 'sender' column; the per-sender slices are
            passed to get_most_used_emojis.
    """
    # DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
    # every call): collect one frame per sender and concatenate once.
    per_sender_frames = []
    for sender in chat_df['sender'].unique():
        emojis_sender = get_most_used_emojis(chat_df[chat_df['sender'] == sender])
        per_sender_frames.append(pd.DataFrame({'sender': sender,
                                               'emoji': emojis_sender.index,
                                               'emoji_count': emojis_sender.values}))
    if per_sender_frames:
        emoji_df = pd.concat(per_sender_frames, ignore_index=True)
    else:
        emoji_df = pd.DataFrame(columns=['sender', 'emoji', 'emoji_count'])

    plt.figure(figsize=(16,8))
    g = sns.barplot(data=emoji_df,
                    x='emoji',
                    y='emoji_count',
                    order= get_most_used_emojis(chat_df).head(10).index,
                    hue='sender',
                    hue_order=chat_df['sender'].value_counts()[:8].index,
                    )

    g.xaxis.set_tick_params(labelsize=26)
    g.yaxis.set_tick_params(labelsize=16)
    g.set(xlabel=None, ylabel='Emoji Count')
    g.legend(fontsize=16)
    # the emojis are the x tick labels, so the emoji font belongs on the x axis
    # (it used to be applied to the numeric y ticks)
    plt.xticks(fontname="Segoe UI Emoji")

In [None]:
# Draw the emoji usage chart for the whole chat.
plot_emoji_usage(chat_df)

In [None]:
def get_weekday_activity(chat_df: pd.core.frame.DataFrame, weekday:str):
    '''Get the number of messages sent in each hour of a given weekday.

    Args:
        chat_df: chat frame with 'weekday' and 'hour' columns.
        weekday: weekday name as it appears in chat_df['weekday'].

    Returns:
        List of 24 tuples (weekday, hour, message_count), one per hour 0..23.

    Raises:
        ValueError: if weekday does not occur in chat_df['weekday'].
    '''
    # check if weekday is valid input
    if weekday not in chat_df['weekday'].values:
        raise ValueError(f"weekday {weekday!r} does not occur in chat_df['weekday']")

    # Count rows per hour in a single pass. The previous .count().values[0] returned
    # the number of non-null values in the frame's first column, which undercounts
    # whenever that column has gaps, and filtered the whole frame once per hour.
    hours_on_weekday = chat_df.loc[chat_df['weekday'] == weekday, 'hour']
    msg_per_hour = hours_on_weekday.value_counts()

    return [(weekday, hour, int(msg_per_hour.get(hour, 0))) for hour in range(24)]

In [None]:
def plot_weekday_activity(chat_df: pd.core.frame.DataFrame):
    '''Plot a heatmap with the chat activity per weekday and hour of the day.

    Args:
        chat_df: chat frame with 'weekday' and 'hour' columns; it is passed to
            get_weekday_activity once per weekday that occurs in the chat.
    '''
    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

    ## data
    # DataFrame.append was removed in pandas 2.0: build one frame per weekday and
    # concatenate them in a single call.
    weekday_frames = [
        pd.DataFrame(get_weekday_activity(chat_df, weekday), columns=['weekday', 'hour', 'message_count'])
        for weekday in chat_df['weekday'].unique()
    ]

    weekday_df = (
        pd.concat(weekday_frames)
        # rows = hour of day, columns = weekday
        .set_index(['hour', 'weekday'])
        .unstack(fill_value=0)
        # drop the 'message_count' column level left over from unstack
        .droplevel(0, axis=1)
        # get weekdays into calendar order (weekdays missing from the chat are skipped)
        .filter(weekday_order)
    )

    ## plot
    fig, ax = plt.subplots(figsize=(16, 9), dpi=300)
    sns.heatmap(data=weekday_df, linewidth=0.2, cmap='Blues', ax=ax)

    # yticks: one label per hour slot, centred on the heatmap row
    yticks_labels = ["{time_1:02}:00 - {time_2:02}:00".format(time_1=x, time_2=x+1) for x in range(24)]
    ax.set_yticks(np.arange(24) + .5)
    ax.set_yticklabels(yticks_labels, rotation=0)
    # xticks
    ax.xaxis.tick_top()
    # axis labels
    ax.set_xlabel('')
    ax.set_ylabel('Tageszeit')
    # title (the umlaut was stored as mis-decoded UTF-8 before)
    ax.set_title("Chataktivität im Tagesverlauf")

In [None]:
# Draw the weekday/hour activity heatmap for the whole chat.
plot_weekday_activity(chat_df)

In [None]:
from scipy.signal import find_peaks

In [None]:
def get_message_peak_dates(chat_df: pd.core.frame.DataFrame):
    """
    Takes chat_df and returns the dates of peaks in messages sent.

    Args:
        chat_df: chat frame with a 'date' column (anything pd.to_datetime
            accepts, e.g. datetime.date objects or timestamps).

    Returns:
        numpy array of datetime.date objects, one per detected peak, in
        chronological order (empty if no peak is found).
    """
    # Normalise to midnight timestamps so that the counts and the calendar below
    # share one index type.
    message_days = pd.to_datetime(chat_df['date']).dt.normalize()

    # every calendar day from the first to the last message
    all_days = pd.date_range(start=message_days.min(), end=message_days.max(), freq='D')

    # messages per day, with 0 for days on which nothing was sent
    messages_per_day = message_days.value_counts().reindex(all_days, fill_value=0)

    # peak detection: find_peaks rejects a distance below 1, which happened for
    # chats spanning fewer than 30 days
    min_peak_distance = max(1, len(all_days) / 30)
    peaks = find_peaks(messages_per_day.values, distance=min_peak_distance, threshold=0.9)[0]

    # find_peaks returns positions; map them to the calendar days and convert the
    # timestamps to plain datetime.date objects
    return messages_per_day.index[peaks].date

In [None]:
# Dates on which the daily message count peaks.
peak_dates = get_message_peak_dates(chat_df)

In [None]:
# Inspect one detected peak date.
# NOTE(review): the position 12 is hard-coded; this raises an IndexError when
# fewer than 13 peaks were detected.
peak_dates[12]

In [None]:
import nltk
import spacy
from gensim import corpora
import gensim
from spacy.lang.de import German

In [None]:
# Bigram frequencies for the whole chat (rebinds the name assigned in the NLP section).
bigrams = get_most_used_bigrams(chat_df)

In [None]:
# Relative frequency of each bigram: its count divided by the total number of
# bigram occurrences. len(bigrams) is only the number of distinct bigrams, so
# dividing by it did not yield a probability.
bigrams_relative = bigrams / bigrams.sum()

In [None]:
# Token frequencies of the non-image messages of the whole chat.
lemmas = get_most_used_lemmas(chat_df)

In [None]:
# Relative frequency of each token: its count divided by the total number of
# token occurrences. len(lemmas) is only the number of distinct tokens, so
# dividing by it did not yield a probability.
lemmas_relative = lemmas / lemmas.sum()

In [None]:
# Display the relative token frequencies.
lemmas_relative

In [None]:
# First token of the most frequent bigram.
bigrams.index[0][0]

In [None]:
# Relative frequency of the first token of the most frequent bigram.
lemmas_relative[bigrams.index[0][0]]

In [None]:
# Relative frequency of the second token of the most frequent bigram.
lemmas_relative[bigrams.index[0][1]]

In [None]:
def calc_pmi(chat_df):
    """Compute the pointwise mutual information (PMI) of every bigram in a chat.

    PMI(x, y) = log(p(x, y) / (p(x) * p(y))), where p(x, y) is the relative
    frequency of the bigram and p(x), p(y) are the relative frequencies of its
    two tokens.

    Args:
        chat_df: chat frame, passed on to get_most_used_bigrams and
            get_most_used_lemmas.

    Returns:
        Series indexed by str(bigram) holding the PMI values, highest first.

    Raises:
        KeyError: if a token of a bigram has no entry in the counts returned by
            get_most_used_lemmas.
    """
    bigram_counts = get_most_used_bigrams(chat_df)
    lemma_counts = get_most_used_lemmas(chat_df)

    # Probabilities are counts divided by the total number of occurrences; the
    # number of distinct items (len) was used before, which shifted every PMI
    # value by the same constant.
    bigram_probs = bigram_counts / bigram_counts.sum()
    lemma_probs = lemma_counts / lemma_counts.sum()

    # calc pmi
    output_dict = {}
    for bigram, p_xy in bigram_probs.items():
        p_x = lemma_probs[bigram[0]]
        p_y = lemma_probs[bigram[1]]
        output_dict[str(bigram)] = np.log(p_xy / (p_x * p_y))

    return pd.Series(output_dict).sort_values(ascending=False)

In [None]:
# PMI of every bigram of the whole chat, keyed by str(bigram).
all_pmi = calc_pmi(chat_df)

In [None]:
# Bigram frequencies of the messages sent on one detected peak date.
# NOTE(review): the position 9 is hard-coded; this raises an IndexError when
# fewer than 10 peaks were detected.
peak_bigrams = get_most_used_bigrams(chat_df[chat_df['date'] == peak_dates[9]])

In [None]:
# Look up the chat-wide PMI of every bigram that occurs on the selected peak date
# and sort them from highest to lowest PMI.
output_dict = {}
for bigram in peak_bigrams.index:
    # all_pmi is keyed by the string form of the bigram
    output_dict[str(bigram)] = all_pmi[str(bigram)]
    
peak_pmi = pd.Series(output_dict).sort_values(ascending=False)

In [None]:
# The 30 bigrams of the peak date with the highest PMI.
peak_pmi.head(30)

In [None]:
# PMI of every bigram, computed from the relative frequencies prepared in the
# cells above and keyed by the bigram itself.
# The intermediate values must not be called `px` / `py`: at notebook level those
# names are the imported plotly.express and chart_studio.plotly modules, and
# assigning to them broke every later use of these libraries.
output_dict = {}
for bigram in bigrams_relative.index:
    p_xy = bigrams_relative[bigram]
    p_x = lemmas_relative[bigram[0]]
    p_y = lemmas_relative[bigram[1]]

    output_dict[bigram] = np.log(p_xy / (p_x * p_y))
    

In [None]:
# Entries 51 to 100 of the values in output_dict, sorted from highest to lowest.
pd.Series(output_dict).sort_values(ascending=False)[50:100]

In [None]:
# Build the gensim dictionary and the bag-of-words corpus used by the LDA cell further down.
# NOTE(review): text_data is not assigned in any visible cell of this notebook -
# it has to be created before this cell can run.
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline

# vocabulary = ['this', 'document', 'first', 'is', 'second', 'the',
#                'and', 'one']
# pipe = Pipeline([('count', CountVectorizer(vocabulary=vocabulary)),
#                   ('tfid', TfidfTransformer())]).fit(corpus)

# pipe['count'].transform(corpus).toarray()

# pipe['tfid'].idf_

In [None]:
# Fit a TF-IDF vectorizer on the text data.
# NOTE(review): text_data is not assigned in any visible cell of this notebook;
# .values implies a pandas object, while the gensim cell iterates it directly -
# confirm what it is supposed to hold.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit_transform(text_data.values)

In [None]:
# TF-IDF vector of the single document 'saufen' under the fitted vocabulary.
tfidf_vectorizer.transform(['saufen'])

In [None]:
NUM_TOPICS = 10
# LDA training is stochastic; a fixed random_state keeps the topics reproducible
# when the notebook is re-run.
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15, random_state=42)
#ldamodel.save('model5.gensim')
# print the ten highest-weighted words of every topic
topics = ldamodel.print_topics(num_words=10)
for topic in topics:
    print(topic)

In [None]:
# Do not truncate long cell contents when displaying frames (applies to all later cells).
pd.set_option('display.max_colwidth', None) 

In [None]:
# Messages 51 to 100 (sender and demojized text) sent on one detected peak date.
# NOTE(review): the position 5 is hard-coded; this raises an IndexError when
# fewer than 6 peaks were detected.
chat_df[chat_df['date'] == (peak_dates[5])][['sender','demojized_msg']][50:100]

# To be done

In [None]:
# solve all problems with .apply() :-/ --> way faster and cleaner

In [None]:
# Column names of the parsed chat frame.
chat_df.columns

In [None]:
# Summary statistics of the 'time_diff' column.
chat_df['time_diff'].describe()

In [None]:
# some function that predicts the "one long messages" vs. "multiple short messages in a row" type of sender thing

In [None]:
def get_chat_topic_for_peak_chat_activity():
    """Placeholder: not implemented yet; does nothing and returns None."""
    pass

# get first and last chat dates and create a range of dates
first_chat_date=chat_df['date'].min()
last_chat_date = chat_df['date'].max()
timeframe = pd.date_range(start=first_chat_date,end=last_chat_date).to_pydatetime()

# iterate over all dates and count how many messages were sent
# (count().values[0] is the number of non-null values in the first column of the
# rows matching that date; the frame is filtered once per day in the range)
num_chat_per_day = [chat_df[chat_df['date'] == date.date()].count().values[0] for date in timeframe]

In [None]:
def plot_media_sent():
    """Prepare the figure for a plot of the media sent in the chat.

    TODO: the plot itself is not implemented yet; nothing is drawn onto ``ax``.
    """
    # plt.figure() returns a single Figure, so unpacking it into two names raised
    # a TypeError; plt.subplots() returns the intended (figure, axes) pair.
    fig, ax = plt.subplots(figsize=(16,8), dpi=300)
    

In [None]:
# Number of rows flagged as media per sender.
chat_df[chat_df['is_media'] == True]['sender'].value_counts()