In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import emoji
import re

In [None]:
def load_chat(path_to_chat: str) -> list:
    '''Read a WhatsApp chat export and return its lines as a list of strings.

    The file is read as UTF-8 and split at "\\n". Two kinds of lines are dropped:
    lines containing the U+200E character and lines containing "Sicherheitsnummer".

    Args:
        path_to_chat: path of the exported chat text file.

    Returns:
        The remaining lines in file order, without a trailing empty entry.
    '''
    with open(path_to_chat, 'r', encoding='UTF-8') as infile:
        lines = infile.read().split('\n')

    # U+200E only shows up in auto generated info messages from WhatsApp
    chat = [line for line in lines
            if '\u200e' not in line and "Sicherheitsnummer" not in line]

    # A file that ends with a newline produces one empty trailing entry.
    # Only remove it when it really is empty, so a file without a final
    # newline does not lose its last message.
    if chat and chat[-1] == '':
        del chat[-1]

    return chat

In [None]:
class UnknownChatFormat(Exception):
    '''Error signalling that neither the android nor the iOS export layout was recognised.

    The text passed as ``message`` is handed to ``Exception`` and is also
    available as the ``message`` attribute.
    '''

    def __init__(self, message="Unknown chat format: could not detect if android or iOS was used"):
        super().__init__(message)
        self.message = message

In [None]:
def determine_chat_format(chat: list) -> str:
    '''Find out which device was used to export the chat file.

    The two export formats differ in how a message line starts:
    iOS:     "[dd.mm.yy, HH:MM:SS]"  (timestamp in brackets)
    Android: "dd.mm.yy, HH:MM"       (no brackets, line starts with a digit)

    20 randomly drawn non-empty lines are inspected. Lines that start with
    neither "[" nor a digit give no vote.

    Args:
        chat: list of chat lines.

    Returns:
        "ios" if more than 90% of the votes are iOS, "android" if fewer
        than 10% are.

    Raises:
        UnknownChatFormat: if there is nothing to inspect, no sampled line
            gives a vote, or the votes are mixed.
    '''
    # Empty lines carry no format information and would break the
    # first-character check below, so they are never sampled.
    candidates = [str(line) for line in chat if str(line)]
    if not candidates:
        raise UnknownChatFormat()

    ### ANDROID: 0; iOS: 1 ###
    votes = []
    for n in np.random.randint(0, high=len(candidates), size=20):
        line = candidates[n]
        if line.startswith('['):
            votes.append(1)
        elif line[0].isdigit():
            votes.append(0)

    if not votes:
        raise UnknownChatFormat()

    ios_share = sum(votes) / len(votes)
    if ios_share > 0.9:
        return "ios"
    if ios_share < 0.1:
        return "android"
    raise UnknownChatFormat()

In [None]:
def check_message_integrity_ios(chat:list) -> list:
    '''Re-join iOS messages that were split over several lines.

    A line that starts with "[" is treated as the beginning of a message.
    Every other line is a continuation and is appended, separated by one
    space, to the message before it. Continuation lines that appear before
    the first message have nothing to attach to and are dropped.

    Args:
        chat: list of chat lines; it is updated in place.

    Returns:
        The same list object, now holding one entry per message.
    '''
    merged = []
    for line in chat:
        if line.startswith('['):
            merged.append(line)
        elif merged:
            merged[-1] = merged[-1] + ' ' + line
        # else: continuation before any message start, nothing to merge into

    # keep the in-place contract: callers holding the list see the result
    chat[:] = merged
    return chat

In [None]:
def check_message_integrity_android(chat:list) -> list:
    '''Re-join android messages that were split over several lines.

    A line whose first 15 characters parse as "dd.mm.yy, HH:MM" is treated
    as the beginning of a message. Every other line is a continuation and
    is appended, separated by one space, to the message before it.
    Continuation lines that appear before the first message have nothing
    to attach to and are dropped.

    Args:
        chat: list of chat lines; it is updated in place.

    Returns:
        The same list object, now holding one entry per message.
    '''
    def starts_with_timestamp(line):
        # strptime raises ValueError when the prefix is not a timestamp
        try:
            datetime.datetime.strptime(line[:15], '%d.%m.%y, %H:%M')
        except ValueError:
            return False
        return True

    merged = []
    for line in chat:
        if starts_with_timestamp(line):
            merged.append(line)
        elif merged:
            merged[-1] = merged[-1] + ' ' + line
        # else: continuation before any message start, nothing to merge into

    # keep the in-place contract: callers holding the list see the result
    chat[:] = merged
    return chat

In [None]:
def parse_date_ios(line: str) -> datetime.datetime:
    '''Return the timestamp found between the first "[" and the following "]" of an iOS line.

    The bracket content is expected in the form "dd.mm.yy, HH:MM:SS".
    '''
    after_opening_bracket = line.split('[')[1]
    date_string = after_opening_bracket.split(']')[0]
    return datetime.datetime.strptime(date_string, '%d.%m.%y, %H:%M:%S')

In [None]:
def parse_date_android(line: str) -> datetime.datetime:
    '''Return the timestamp that precedes the first "-" of an android line.

    The text before the dash is stripped and expected in the form "dd.mm.yy, HH:MM".
    '''
    timestamp_part, *_ = line.split('-')
    return datetime.datetime.strptime(timestamp_part.strip(), '%d.%m.%y, %H:%M')

In [None]:
def get_message_sender_android(line: str) -> str:
    '''Return the sender of an android line: the text between the first "-" and the next ":".

    Only the first dash is used as separator, so sender names that
    contain a dash themselves (e.g. "Anna-Lena") are returned in full.
    '''
    after_timestamp = line.split('-', 1)[1]
    return after_timestamp.split(':', 1)[0].strip()

In [None]:
def get_message_sender_ios(line: str) -> str:
    '''Return the sender of an iOS line: the text between the first "]" and the next ":".

    Only the first closing bracket (the end of the timestamp) is used as
    separator, so sender names that contain "]" themselves
    (e.g. "Bob [work]") are returned in full.
    '''
    after_timestamp = line.split(']', 1)[1]
    return after_timestamp.split(':', 1)[0].strip()

In [None]:
def chop_message_ios(line: str) -> str:
    '''Return the message text of an iOS line without timestamp and sender.

    The text starts after the 3rd ":" (two colons belong to the HH:MM:SS
    timestamp, the third one follows the sender). The split is limited to
    three separators so colons inside the message itself (times, links)
    are kept.
    '''
    return line.split(':', 3)[3].strip()

In [None]:
def chop_message_android(line: str) -> str:
    '''Return the message text of an android line without timestamp and sender.

    The text starts after the 2nd ":" (one colon belongs to the HH:MM
    timestamp, the second one follows the sender). The split is limited to
    two separators so colons inside the message itself (times, links)
    are kept.
    '''
    return line.split(':', 2)[2].strip()

In [None]:
def guess_christophers_alias(chat_df: pd.core.frame.DataFrame) -> str:
    '''Return the first distinct value of the 'sender' column whose lower-cased form contains "chris".

    Falls through to None when no sender matches.
    '''
    # "chris" is a substring of "christopher", so one test covers both spellings
    matching_senders = (name for name in chat_df['sender'].unique() if "chris" in name.lower())
    return next(matching_senders, None)

In [None]:
def parse_emojis(message: str) -> (list, str):
    '''Separate the emojis of a message from its text.

    The message is converted to its "demojized" form (e.g. :grinning_face:),
    every ":name:" token is looked up, and tokens that turn back into an
    emoji are collected and removed from the text. Colon-delimited text
    that is not an emoji code (e.g. ":30:" inside "10:30:15") is left alone.

    Args:
        message: message text, possibly containing emojis.

    Returns:
        A tuple (emojis_in_message, demojized_message): the list of emojis
        in order of appearance, and the demojized text without emoji codes.
    '''
    demojized_message = emoji.demojize(message)
    emojis_in_message = []

    # a code is ":" + at least one character that is neither ":" nor whitespace + ":"
    for code in re.findall(r'(:[^:\s]+:)', demojized_message):
        # NOTE(review): `use_aliases` is kept from the original call; newer
        # releases of the emoji package reportedly replaced it with
        # language='alias' - confirm against the installed version.
        rendered = emoji.emojize(code, use_aliases=True)
        if rendered == code:
            # emojize left the token untouched, so it is plain text
            continue
        emojis_in_message.append(rendered)
        demojized_message = demojized_message.replace(code, '')

    return emojis_in_message, demojized_message

In [None]:
def extract_date_from_timestamp(input_timestamp: datetime.datetime) -> datetime.date:
    '''Return the result of calling .date() on the given timestamp.'''
    date_part = input_timestamp.date()
    return date_part

In [None]:
def extract_time_from_timestamp(input_timestamp) -> datetime.time:
    '''Return the result of calling .time() on the given timestamp.'''
    time_part = input_timestamp.time()
    return time_part

In [None]:
def calc_time_diff(chat_df: pd.core.frame.DataFrame) -> pd.core.frame.DataFrame:
    '''Annotate every message with the time passed since the previous message.

    Adds (or overwrites) the column 'time_diff' = timestamp of the row minus
    timestamp of the row before it, by row position. The first row has no
    predecessor and gets a missing value (NaT).

    Args:
        chat_df: frame with a 'timestamp' column; it is modified in place.

    Returns:
        The same frame, with the 'time_diff' column set.
    '''
    # diff() works on row positions, so it does not depend on the index labels
    chat_df['time_diff'] = chat_df['timestamp'].diff()
    return chat_df

In [None]:
def annotate_message_types(chat_df: pd.core.frame.DataFrame, 
                      answer_time_threshold: datetime.timedelta=datetime.timedelta(days=2)
                     ) -> pd.core.frame.DataFrame:
    '''Classify every message as answer, follow up or new initiation.

    A message is compared with the message directly before it:

    ################ EXAMPLE ####################
    ## [MSG_A] day1 10:00 sender_a: Hi!
    ## [MSG_B] day1 10:05 sender_b: Hi!
    ## --> other sender, below threshold: MSG_B is an "answer"

    ## [MSG_A] day1 10:00 sender_a: bye!
    ## [MSG_B] day9 12:05 sender_b: long time no see!
    ## --> other sender, above threshold: MSG_B is an "initiation"

    ## [MSG_A] day1 10:00 sender_a: hello?
    ## [MSG_B] day4 01:05 sender_a: are you there?
    ## --> same sender, above threshold: MSG_B is an "initiation"

    ## [MSG_A] day1 10:00 sender_a: can you bring me something from the store?
    ## [MSG_B] day1 12:05 sender_a: some milk and icecream!
    ## --> same sender, below threshold: MSG_B is a "follow_up"
    #############################################

    The first message is always an "initiation".

    Columns written (in place):
        message_type: "initiation", "follow_up" or "answer".
        is_media: True where raw_message is "<Medien ausgeschlossen>".
        answer_time_seconds: full duration of 'time_diff' in seconds for
            answers (days included), missing for every other row.

    Args:
        chat_df: frame with the columns 'sender', 'raw_message' and 'time_diff'.
        answer_time_threshold: gaps shorter than this count as the same
            conversation.

    Returns:
        The same frame with the three columns set.
    '''
    senders = chat_df['sender']
    # normalise to timedelta64 so missing gaps compare as False below
    time_diff = pd.to_timedelta(chat_df['time_diff'])

    within_threshold = time_diff < answer_time_threshold
    if len(within_threshold) > 0:
        # the first message has no predecessor, whatever its time_diff holds
        within_threshold.iloc[0] = False

    same_sender = senders.eq(senders.shift())
    is_follow_up = (within_threshold & same_sender).to_numpy()
    is_answer = (within_threshold & ~same_sender).to_numpy()

    # everything that is neither follow up nor answer starts a new conversation
    message_type = pd.Series("initiation", index=chat_df.index, dtype=object)
    message_type[is_follow_up] = "follow_up"
    message_type[is_answer] = "answer"

    chat_df['message_type'] = message_type
    chat_df['is_media'] = chat_df['raw_message'] == "<Medien ausgeschlossen>"
    # total_seconds() keeps whole days; .seconds would only hold the remainder
    chat_df['answer_time_seconds'] = time_diff.dt.total_seconds().where(is_answer)

    return chat_df

In [None]:
def parse_chat(path_to_chat: str) -> pd.core.frame.DataFrame:
    '''Load a chat export and turn it into an annotated DataFrame.

    Steps: load the lines, detect the export format, re-join split
    messages, extract sender / timestamp / text per message, separate the
    emojis from the text, and finally add date, time, weekday, time
    difference and message type columns.

    Args:
        path_to_chat: path of the exported chat text file.

    Returns:
        DataFrame with one row per message.
    '''
    chat = load_chat(path_to_chat)
    chat_format = determine_chat_format(chat).lower()

    # per format: (line repair, sender extractor, timestamp parser, text extractor)
    handlers_by_format = {
        "android": (check_message_integrity_android,
                    get_message_sender_android,
                    parse_date_android,
                    chop_message_android),
        "ios": (check_message_integrity_ios,
                get_message_sender_ios,
                parse_date_ios,
                chop_message_ios),
    }
    handlers = handlers_by_format.get(chat_format)

    sender_list, timestamps, messages = [], [], []
    if handlers is not None:
        repair_lines, read_sender, read_timestamp, read_text = handlers
        chat = repair_lines(chat)
        # every line is one message now
        for line in chat:
            sender_list.append(read_sender(line))
            timestamps.append(read_timestamp(line))
            messages.append(read_text(line))

    # split each message into its emojis and the remaining text
    emoji_list, raw_message_list = [], []
    for text in messages:
        found_emojis, text_without_emojis = parse_emojis(text)
        emoji_list.append(found_emojis)
        raw_message_list.append(text_without_emojis)

    assert len(sender_list) == len(timestamps) == len(messages) == len(emoji_list)

    chat_df = pd.DataFrame({"sender": sender_list,
                            "timestamp": timestamps,
                            "message": messages,
                            "raw_message": raw_message_list,
                            "emojis": emoji_list})

    # columns derived from the timestamp
    chat_df['date'] = chat_df['timestamp'].apply(extract_date_from_timestamp)
    chat_df['time'] = chat_df['timestamp'].apply(extract_time_from_timestamp)
    chat_df['weekday'] = chat_df['timestamp'].apply(lambda stamp: stamp.strftime('%A'))

    # time between messages first, the message types are derived from it
    return annotate_message_types(calc_time_diff(chat_df))

In [None]:
# Parse the exported chat file into the DataFrame that the cells below work on.
chat_df = parse_chat('chat_charlotte_fabi.txt')

In [None]:
# Show the parsed chat as cell output.
chat_df

In [None]:
 # Sender name that contains "chris" (None if there is none).
 # NOTE(review): this line starts with a stray space - presumably tolerated by IPython, but not by a plain script.
 christopher_alias = guess_christophers_alias(chat_df)

In [None]:
# Keep only the messages whose sender equals the alias found above.
christopher_only = chat_df[chat_df['sender'] == christopher_alias]

In [None]:
# Show the filtered messages as cell output.
christopher_only

In [None]:
def plot_daily_activity(chat_df: pd.core.frame.DataFrame):
    '''Plot the number of messages per day together with the daily average.

    Every calendar day between the first and the last value of the 'date'
    column is shown; days without messages count as 0. The average is
    taken from get_mean_messages_per_day and drawn as a dashed line.

    Args:
        chat_df: frame with a 'date' column.
    '''
    # every calendar day from the first to the last chat date
    first_chat_date = chat_df['date'].min()
    last_chat_date = chat_df['date'].max()
    timeframe = pd.date_range(start=first_chat_date, end=last_chat_date).to_pydatetime()

    # count the messages per date once instead of filtering the frame for every day
    messages_per_date = chat_df['date'].value_counts()
    num_chat_per_day = [messages_per_date.get(day.date(), 0) for day in timeframe]

    # computed once, used for both the line and its legend label
    mean_per_day = get_mean_messages_per_day(chat_df)

    fig, ax = plt.subplots(figsize=(16,8))
    # chat activity by day
    ax.plot(timeframe, num_chat_per_day, label='Anzahl Nachrichten pro Tag')
    # mean value
    ax.plot(timeframe, np.full(timeframe.shape, mean_per_day), '--', linewidth=3,
            label=f'Durchschnitt: {mean_per_day:.3f}')

    # axis labels and legend
    ax.set_ylabel('Anzahl Nachrichten')
    ax.set_xlabel('Datum')
    ax.set_title('Nachrichten im Zeitverlauf')
    ax.legend(title_fontsize='large')

In [None]:
def get_mean_messages_per_day(chat_df: pd.core.frame.DataFrame):
    '''Return the average number of messages per calendar day.

    Every day between the first and the last value of the 'date' column
    is taken into account; days without messages count as 0.

    Args:
        chat_df: frame with a 'date' column holding datetime.date values.

    Returns:
        Mean of the per-day message counts.
    '''
    # every calendar day from the first to the last chat date
    first_chat_date = chat_df['date'].min()
    last_chat_date = chat_df['date'].max()
    # date_range yields datetime.datetime values, they are converted to
    # datetime.date below to match the 'date' column
    timeframe = pd.date_range(start=first_chat_date, end=last_chat_date).to_pydatetime()

    # count the messages per date once instead of filtering the frame for every day
    messages_per_date = chat_df['date'].value_counts()
    num_chat_per_day = [messages_per_date.get(day.date(), 0) for day in timeframe]

    return np.array(num_chat_per_day).mean()

In [None]:
# Draw the messages-per-day chart for the parsed chat.
plot_daily_activity(chat_df)

In [None]:
# Show the average number of messages per day as cell output.
get_mean_messages_per_day(chat_df)

In [None]:
def plot_time_to_answer_weekdays(chat_df: pd.core.frame.DataFrame):
    '''Show a box plot of the answer time in minutes per sender, grouped by weekday.

    Reads the columns 'answer_time_seconds', 'sender' and 'weekday';
    outliers are not drawn.
    '''
    fig1, ax1 = plt.subplots(figsize=(16,8))

    weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    answer_minutes = chat_df['answer_time_seconds']/60

    sns.boxplot(x=chat_df['sender'],
                y=answer_minutes,
                hue=chat_df['weekday'],
                hue_order=weekday_order,
                showfliers=False,
                ax=ax1)

    ax1.set_title('Antwortzeit in Minuten')
    ax1.set_ylabel('Antwortzeit in Minuten')
    ax1.set_xlabel('Sender')
    # legend goes to the right, outside of the plot area
    ax1.legend(bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.)
    plt.show()

In [None]:
# Draw the answer-time box plot split by weekday.
plot_time_to_answer_weekdays(chat_df)

In [None]:
def plot_time_to_answer(chat_df: pd.core.frame.DataFrame):
    '''Show a box plot of the answer time in minutes per sender.

    Reads the columns 'answer_time_seconds' and 'sender'; outliers are not drawn.
    '''
    fig1, ax1 = plt.subplots(figsize=(16,8))

    answer_minutes = chat_df['answer_time_seconds']/60
    sns.boxplot(x=chat_df['sender'], y=answer_minutes, showfliers=False, ax=ax1)

    ax1.set_title('Antwortzeit in Minuten')
    ax1.set_ylabel('Antwortzeit in Minuten')
    ax1.set_xlabel('Sender')
    plt.show()

In [None]:
# Draw the answer-time box plot per sender.
plot_time_to_answer(chat_df)

In [None]:
def get_mean_time_to_answer(chat_df: pd.core.frame.DataFrame):
    '''Return the mean of the 'answer_time_seconds' column.

    'answer_time_seconds' is the column annotate_message_types writes for
    answers. Missing values are skipped by the mean.
    '''
    return chat_df['answer_time_seconds'].mean()

In [None]:
def longest_time_of_no_chatting(chat_df):
    '''Return the longest gap that ended with a new initiation by the other sender.

    Only rows with message_type "initiation" whose sender differs from the
    sender of the row before are considered; the gap is the difference
    between the two timestamps. The first row has no predecessor and is
    never considered.

    Args:
        chat_df: frame with the columns 'sender', 'timestamp' and 'message_type'.

    Returns:
        The largest such gap, or a zero Timedelta if no row qualifies.
    '''
    senders = chat_df['sender']
    is_initiation = chat_df['message_type'].eq('initiation')
    sender_changed = senders.ne(senders.shift())

    # gap to the previous row by position; the first row yields NaT and is dropped
    gaps = chat_df['timestamp'].diff()
    candidate_gaps = gaps[(is_initiation & sender_changed).to_numpy()].dropna()

    if candidate_gaps.empty:
        return pd.Timedelta(0)
    return candidate_gaps.max()

# to be done:

In [None]:
# solve all problems with .apply() :-/ --> way faster and cleaner

In [None]:
def get_time_diff_dist():
    '''Placeholder, not implemented yet.

    Planned: return statistics about the time between messages to make a
    more reliable answer_time_threshold (see annotate_message_types);
    the annotation could then be re-done with the adjusted threshold.
    '''
    # a body is required: a def followed only by comments is a syntax error
    pass

In [None]:
def get_initiator_percentage():
    '''Placeholder, not implemented yet; currently returns None.'''
    return None

In [None]:
# some function that predicts the "one long messages" vs. "multiple short messages in a row" type of sender thing

In [None]:
def most_used_words(chat_df: pd.core.frame.DataFrame, christopher_alias: str):
    '''Placeholder, not implemented yet; both arguments are ignored and None is returned.'''
    return None

In [None]:
def most_used_emoji(chat_df: pd.core.frame.DataFrame, christopher_alias: str):
    '''Placeholder, not implemented yet; both arguments are ignored and None is returned.'''
    return None
# Messages with at least one emoji: rows whose 'emojis' list is not empty.
# This statement sits at module level (not inside most_used_emoji) and reads the global chat_df.
chat_df[chat_df['emojis'].str.len().gt(0)]
    
# flatten emoji list
#emoji_list = [item for sublist in emoji_list for item in sublist]

# get messages with at least one emoji
#chat_df[chat_df['emojis'].str.len().gt(0)]

# emojize all raw emojis
#list_emoji = [emoji.emojize(x) for x in text]

In [None]:
def get_chat_topic_for_peak_chat_activity():
    '''Placeholder, not implemented yet; currently returns None.'''
    return None

In [None]:
# more nlp stuff

In [None]:
# regex links, resolve them and do a google / wikipedia search? Maybe too intimate

In [None]:
# progress bar or some kind of feedback

In [None]:
# logging

In [None]:
# API?