In [1]:
import re
import pandas as pd
import numpy as np

In [73]:
def starts_with_date_and_time(string):
    pattern = '^([0-9]+)(/)([0-9]+)(/)([0-9][0-9]) ([0-9]+):([0-9][0-9]) -'
    result = re.match(pattern, string)
    if result:
        return True
    return False

In [74]:
def get_data_point(line):   
    splitLine = line.split(' - ')
    dateTime = splitLine[0]
    date, time = dateTime.split(' ') 
    message = ' '.join(splitLine[1:])
    splitMessage = message.split(': ')
    author = splitMessage[0]
    message = ' '.join(splitMessage[1:])
    return date, time, author, message

In [75]:
parsedData = [] # List to keep track of data so it can be used by a Pandas dataframe
### Uploading exported chat file
conversationPath = '../data/all.txt' # chat file
with open(conversationPath, encoding="utf-8") as fp:
    ### Skipping first line of the file because contains information related to something about end-to-end encryption
    fp.readline() 
    messageBuffer = [] 
    date, time, author = None, None, None
    while True:
        line = fp.readline() 
        if not line: 
            break
        line = line.strip()
        if starts_with_date_and_time(line):
            if len(messageBuffer) > 0: 
                parsedData.append([date, time, author, ' '.join(messageBuffer)]) 
            messageBuffer.clear()
            date, time, author, message = get_data_point(line) 
            messageBuffer.append(message)
        else:
            messageBuffer.append(line)
df = pd.DataFrame(parsedData, columns=['Date', 'Time', 'Author', 'Message']) # Initialising a pandas Dataframe.
### changing datatype of "Date" column.
df["Date"] = pd.to_datetime(df["Date"])
df["Time"] = pd.to_datetime(df["Time"], format='%H%M', errors='ignore')

In [76]:
df.head()

Unnamed: 0,Date,Time,Author,Message
0,2019-12-20,21:17,Leticia Olábarri,Adri
1,2019-12-20,21:17,Leticia Olábarri,¿Vienes?
2,2019-12-20,22:02,Adri Ávalos,Sí 😬
3,2019-12-20,22:03,Adri Ávalos,Ya voy
4,2019-12-20,22:03,Adri Ávalos,Me he liado un pelin


In [77]:
df.shape

(59390, 4)

In [78]:
df.dtypes

Date       datetime64[ns]
Time               object
Author             object
Message            object
dtype: object

In [None]:
df["Author"].unique()

In [80]:
df.drop(df[(df['Author'] == 'Los mensajes y las llamadas están cifrados de extremo a extremo. Nadie fuera de este chat, ni siquiera WhatsApp, puede leerlos ni escucharlos. Toca para obtener más información.')].index, inplace=True)
df.drop(df[(df['Author'] == '\u200eMarcela Duque cambió su número de teléfono. Toca para enviar un mensaje o añadir el nuevo número.')].index, inplace=True)
df.drop(df[(df['Author'] == '\u200eMarcela Duque cambió su número de teléfono. Actualmente te estás comunicando con su nuevo número. Toca para añadir a tus contactos.')].index, inplace=True)

In [None]:
df["Author"].unique()

In [81]:
df.shape

(59150, 4)

In [82]:
authors = df["Author"].unique()
len(authors)

240

In [83]:
df.drop(df[(df['Message'] == '<Multimedia omitido>')].index, inplace=True)

In [84]:
df.shape

(55007, 4)

In [85]:
df["Message total length"] = df["Message"].apply(lambda x: len(x))

In [86]:
df.head()

Unnamed: 0,Date,Time,Author,Message,Message total length
0,2019-12-20,21:17,Leticia Olábarri,Adri,4
1,2019-12-20,21:17,Leticia Olábarri,¿Vienes?,8
2,2019-12-20,22:02,Adri Ávalos,Sí 😬,4
3,2019-12-20,22:03,Adri Ávalos,Ya voy,6
4,2019-12-20,22:03,Adri Ávalos,Me he liado un pelin,20


In [87]:
df.drop(df[(df['Message total length'] < 10)].index, inplace=True)

In [88]:
df.shape

(43096, 5)

In [89]:
def count_words(string):
    word_list = string.split()

    number_of_words = len(word_list)
    return number_of_words

In [90]:
df["Number of words"] = df["Message"].apply(lambda x: count_words(x))

In [91]:
df.head()

Unnamed: 0,Date,Time,Author,Message,Message total length,Number of words
4,2019-12-20,22:03,Adri Ávalos,Me he liado un pelin,20,5
5,2019-12-20,22:09,Leticia Olábarri,Vaaaaaamooooooossssss,21,1
7,2019-12-20,22:14,Leticia Olábarri,Avisa cuando llegues,20,3
9,2019-12-20,22:15,Adri Ávalos,Por dónde estáis ?,18,4
11,2019-12-20,22:15,Leticia Olábarri,Enfrente del escenario,22,3


In [92]:
def count_intro_signs(string):
    count = 0
    for i in string:
        if (i == "¿") or (i == "¡"):
            count = count + 1
    return count

In [93]:
df["Number of intro signs (¿ or ¡)"] = df["Message"].apply(lambda x: count_intro_signs(x))

In [94]:
df.head()

Unnamed: 0,Date,Time,Author,Message,Message total length,Number of words,Number of intro signs (¿ or ¡)
4,2019-12-20,22:03,Adri Ávalos,Me he liado un pelin,20,5,0
5,2019-12-20,22:09,Leticia Olábarri,Vaaaaaamooooooossssss,21,1,0
7,2019-12-20,22:14,Leticia Olábarri,Avisa cuando llegues,20,3,0
9,2019-12-20,22:15,Adri Ávalos,Por dónde estáis ?,18,4,0
11,2019-12-20,22:15,Leticia Olábarri,Enfrente del escenario,22,3,0


In [95]:
def count_stops_and_commas(string):
    count = 0
    for i in string:
        if (i == ".") or (i == ","):
            count = count + 1
    return count

In [96]:
df["Number of stops and commas"] = df["Message"].apply(lambda x: count_stops_and_commas(x))

In [97]:
df.head()

Unnamed: 0,Date,Time,Author,Message,Message total length,Number of words,Number of intro signs (¿ or ¡),Number of stops and commas
4,2019-12-20,22:03,Adri Ávalos,Me he liado un pelin,20,5,0,0
5,2019-12-20,22:09,Leticia Olábarri,Vaaaaaamooooooossssss,21,1,0,0
7,2019-12-20,22:14,Leticia Olábarri,Avisa cuando llegues,20,3,0,0
9,2019-12-20,22:15,Adri Ávalos,Por dónde estáis ?,18,4,0,0
11,2019-12-20,22:15,Leticia Olábarri,Enfrente del escenario,22,3,0,0


In [98]:
def count_strange_words(string):
    count = 0
    word_list = string.split()
    if (" xfa " in word_list) or (" q " in word_list):
        count = count + 1

In [99]:
df["Number of attypical abbreviations"] = df["Message"].apply(lambda x: count_strange_words(x))

In [100]:
df.head()

Unnamed: 0,Date,Time,Author,Message,Message total length,Number of words,Number of intro signs (¿ or ¡),Number of stops and commas,Number of attypical abbreviations
4,2019-12-20,22:03,Adri Ávalos,Me he liado un pelin,20,5,0,0,
5,2019-12-20,22:09,Leticia Olábarri,Vaaaaaamooooooossssss,21,1,0,0,
7,2019-12-20,22:14,Leticia Olábarri,Avisa cuando llegues,20,3,0,0,
9,2019-12-20,22:15,Adri Ávalos,Por dónde estáis ?,18,4,0,0,
11,2019-12-20,22:15,Leticia Olábarri,Enfrente del escenario,22,3,0,0,
