### Import all libraries

In [3]:
# -*- coding: utf-8 -*-

import pandas as pd
import re
import os
from datetime import datetime
import zipfile

import regex
import emoji

import numpy as np
from collections import Counter

import plotly.express as px
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
import random
import shutil
import sys

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### Set phone number patterns (by [Sandreke](https://github.com/Sandreke))

In [4]:
# Get country phone number patterns
try:
    phone_patterns = [line.strip().split(';')[1] + ":" for line in open('data/phone_patterns.txt', 'r')]
    print("Phone numbers loaded")
except:
    print("Phone numbers not loaded")

Phone numbers not loaded


## Settings: Set your values

In [5]:
# ATTENTION: Your 'my_chat.txt' files must be in the data folder.

chat_file_name = 'chat.txt' # Whatsapp chat zip file or txt file


chat_path = os.path.join('input', chat_file_name)
chat_name = chat_file_name[:-4]

is_zip = False #Read a .zip file or .txt file
delete_input = False # DELETE INPUT (ZIP or TXT)

is_group = False # False -> Personal chat - True -> Group Chat
group_name = 'GroupName'

has_range_time = False
start_time, end_time = '1/1/23', '31/12/23'


THIS_PATH = os.path.dirname(os.path.join(os.getcwd(), 'wpp-visualization'))
save_path = os.path.join('generated', chat_name)

### Utils functions
count_emojis, convert_to_24_format, procces_message, is_date_between, get_save_path, custom_color

In [6]:
color_palette = [
    (254, 245, 231),  # Background color: a soft cream tone
    (255, 105, 180),  # Bright Pink
    (255, 182, 193),  # Light Pink
    (255, 0, 0),      # Red
    (220, 20, 60),    # Crimson
    (255, 192, 203),  # Pink
    (233, 150, 122),  # Light Coral
    (255, 215, 0)     # Gold for a touch of elegance
]

def create_input_folder(folder_name, file_name):
    try:
        input_path = os.path.join(THIS_PATH, 'input')

        folder_path = os.path.join(input_path, folder_name)

        os.makedirs(folder_path, exist_ok=True)
        print(f'Se ha creado la carpeta "{folder_name}" dentro de "input".')

        file_path = os.path.join(input_path, file_name)

        shutil.move(file_path, folder_path)
        print(f'Se ha movido el archivo "{file_name}" a la carpeta "{folder_name}".')

    except Exception as e:
        print(f'Error: {e}')
    
    return folder_path + file_name

def extrac_zip(zip_path):
    """
    Extrac zip file in a new folder
    """
    zip_path = os.path.join(THIS_PATH, zip_path)

    carpeta_destino = os.path.join(THIS_PATH, os.path.splitext(zip_path)[0])

    os.makedirs(carpeta_destino, exist_ok=True)

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(carpeta_destino)


def custom_color(word, font_size, position, orientation, random_state=None,
                    **kwargs):
    return "rgb(%s, %s, %s)" % color_palette[random.randint(1, len(color_palette)-1)]

def clean_data(input_file: str, output_file: str):
    """
    Remove special characters
    """
    try:
        with open(input_file, 'r', encoding='utf-8') as file_in, open(output_file, 'w', encoding='utf-8') as file_out:
            for line in file_in:
                # Remove [U+200E]
                clean_line = line.replace('\u200e', '')
                file_out.write(clean_line)

    except FileNotFoundError:
        print(f'Error: not found {input_file}')

def count_emojis(message: str) -> dict:
    """
    Create a dict with the emojis presences in the message

    Args:
        message (str): message of the chat

    Returns:
        emoji_dict (dict): {😂: 3 , 😍: 1}
    """
    emojis = emoji.distinct_emoji_list(message)
    
    emoji_dict = {}
    for char in emojis:
        if char in emoji_dict:
            emoji_dict[char] += 1
        else:
            emoji_dict[char] = 1
    return emoji_dict


def convert_to_24_format(hour: str) -> str:
    """
    Convert various time formats to 24-hour format ("%H:%M:%S")

    Args:
        hour (str): Time string in various possible formats

    Returns:
        hour_in_24hs_format (str): Time in 'HH:MM:SS' format
    """
    time_formats = ["%I:%M %p", "%H:%M", "%I:%M:%S %p", "%H:%M:%S"]

    for format in time_formats:
        try:
            parsed_time = datetime.strptime(hour, format)

            # Format the parsed time in 24-hour format
            return parsed_time.strftime('%H:%M:%S')
        except ValueError:
            continue

    print("Formato de hora no válido")
    print("Modifica la regex para validarlo")
    return ""

def procces_message(message: str) -> tuple:
    """
    Parse a chat message to get: date, time, username, message_text

    Args:
        message (str): Chat message

    Returns:
        message_data (tuple): ((date, time, username, message_text), is_24h_format)
    """
    # AVIABLE FORMATS
    # [1/1/24, 14:14:00] Juan Manuel 😎: This is a message
    # 01/01/24, 2:14 pm - Juan Manuel 😎: This is a message
    # 1/1/24, 2:14 pm - Juan Manuel 😎: This is a message
    # 1/1/23 14:14 - Juan Manuel 😎: This is a message
    # 1/1/2024, 14:14 - Juan Manuel 😎: This is a message

    # RETURN
    # ('09/01/24', '09:06:00', 'Juan Manuel 😎', 'This is a message')
    # (('09/01/24', '09:06:00', 'Juan Manuel 😎', 'This is a message'), is_24_hours_format: bool)

    pattern = r'\[?(\d{1,2})[/](\d{1,2})[/](\d{2,4}),?\s*(\d{1,2}:\d{2}(?::\d{2})?)(?:\s?(am|pm))?[\s-]*\]?[ ]*(.+?): (.+)'
    match = re.match(pattern, message)

    if match:
        day, month, year, time, format, username, text = match.groups()

        # Set the year to YY format
        year = year[-2:]

        # Set the day and month to two-digit format
        day = day.zfill(2)
        month = month.zfill(2)

        # Reconstruct the date in the desired format
        date = f"{day}/{month}/{year}"

        # Check if the time format is present
        is_24h_format = format is None

        # Return processed data
        return ((date, time, username, text), is_24h_format)
    else:
        return((None, None, None, None), None)

def is_date_between(target_date_str: str, start_date_str: str, end_date_str: str) -> bool:
    """
    Check if message is into range of dates
    Args:
        target_date_str (str): Current date
        start_date_str (str): Start date range, '01/01/23'
        end_date_str (str): End date range, '31/12/23'

    Returns:
        is_in_range (bool):
    """
    target_date = datetime.strptime(target_date_str, "%d/%m/%y")
    start_date = datetime.strptime(start_date_str, "%d/%m/%y")
    end_date = datetime.strptime(end_date_str, "%d/%m/%y")

    return start_date <= target_date <= end_date

# TODO
def get_save_path(folder_path: str, file_name: str):
    """
    Create a folder where files are saved
    """
    full_path = os.path.join(folder_path, file_name)
    os.makedirs(folder_path, exist_ok=True)

    return full_path


## Get _chat.txt file

In [None]:
format_file = chat_path[-4:]

if format_file == '.zip':
    extrac_zip(chat_path)
    chat_path = f'input/{chat_name}/_chat.txt'
elif format_file == '.txt':
    chat_path = create_input_folder(chat_name, chat_file_name)

print(f"El archivo se encuentra en: {chat_path}")

### Create dataframe
With (Fecha, Hora, Miembro, Mensaje) [(Date, Time, Member, Message)] values.

In [7]:
# List to store the data 
# (Fecha, Hora, Miembro, Mensaje) 
# [(Date, Time, Member, Message)] 
# of each line of the txt
data_list = []

with open(chat_path, encoding="utf-8") as chat_file:
    chat_file.readline() # Ignore first row related to end-to-end encryption

    #Initialize values
    date, time, username, message_text = None, None, None, None
    while True:

        linea = chat_file.readline()

        if not linea:
            break
        linea = linea.strip()
        
        proccesed_message = procces_message(linea)
        
        date, time, username, message_text = proccesed_message[0]
        format_24_hs = proccesed_message[1]

        if date: # Is valid data
            if not format_24_hs:
                time = convert_to_24_format(time)
            
            if not has_range_time:
                data_list.append([date, time, username, message_text])
            
            else:
                if is_date_between(date, start_time, end_time):
                    data_list.append([date, time, username, message_text])

# For emojis
pd.set_option('display.unicode.ambiguous_as_wide', True)
pd.set_option('display.unicode.east_asian_width', True)

# Convert the list with data to dataframe
df = pd.DataFrame(data_list, columns=['Fecha', 'Hora', 'Miembro', 'Mensaje'])

# Delete null values
df = df.dropna()

# Reset index
df.reset_index(drop=True, inplace=True)

df

Unnamed: 0,Fecha,Hora,Miembro,Mensaje
0,25/09/24,8:01,p. m. - Anabel 🌷,Holiss
1,25/09/24,8:25,p. m. - Juanma Romano,Holis
2,25/09/24,8:25,p. m. - Juanma Romano,Cómo estás bestie?
3,25/09/24,8:25,p. m. - Juanma Romano,Cuando vamos por un helado?
4,25/09/24,8:30,p. m. - Anabel 🌷,<Multimedia omitido>
...,...,...,...,...
2425,27/10/24,7:44,p. m. - Juanma Romano,Se está viendo
2426,27/10/24,7:45,p. m. - Juanma Romano,WHAT
2427,27/10/24,7:45,p. m. - Juanma Romano,Diomio
2428,27/10/24,7:45,p. m. - Juanma Romano,Que horror


### Add the "message types" data based on the message pattern.

In [8]:
# Total messages
total_messages = df.shape[0]

# Type messages
# Get values according to the type of message
"""
This pattern can get count of: 
- images
- videos
- audio
- stickers
- contacts shared
- deleted mesagges
- gif's, 
- locations shared
- emojis
- url's shared
- pools
"""

# Images
images_send = df[
        (df['Mensaje'] == 'imagen omitida')
        ].shape[0]

# Videos
videos_sends = df[
        (df['Mensaje'] == 'Video omitido')
        ].shape[0]

# Audio
audio_send = df[(
        df['Mensaje'] == 'audio omitido')
        ].shape[0]

# Stickers
stickers_send = df[(
        df['Mensaje'] == 'sticker omitido')
        ].shape[0]

# Contacts shared
contacts_sends = df[(
        df['Mensaje'] == 'Tarjeta de contacto omitida')
        ].shape[0]

# Deleted messages
deleted_msgs = df[(
        df['Mensaje'] == 'Se eliminó este mensaje.')
        ].shape[0]

# Gif's
gif_send = df[(
        df['Mensaje'] == 'GIF omitido')
        ].shape[0]

# Locates shared
locate_pattern = r"Ubicación: https://maps\."
locate_sends = df[
        df['Mensaje'].str.contains(locate_pattern, flags=re.IGNORECASE, na=False)
        ].shape[0]

# Pools quantity
pools = df[
        (df['Mensaje'].str.contains('ENCUESTA:')) |
        (df['Mensaje'].str.contains('POOL:'))
        ].shape[0]

# Multimedia messages (images, video, etc)
multimedia_sends = df[
    df['Mensaje'].str.contains('<Media omitted>') |
    df['Mensaje'].str.contains('<Multimedia omitido>')
    ].shape[0]

# Edited messages
editd_msgs = df[
    (df['Mensaje'].str.contains('<This message was edited>')) |
    (df['Mensaje'].str.contains('<Este mensaje fue editado>')) 
    ].shape[0]

# Once voice messages
once_voice_sends = df[
    (df['Mensaje'].str.contains('<View once voice message omitted>')) |
    (df['Mensaje'].str.contains('<Ver una vez omitido el mensaje de voz>'))
    ].shape[0]

# Deleted messages
me_name = 'ME'
you_name = 'YOU/US' #can be a group

me_deleted = df[
    (df['Mensaje'].str.contains('You deleted this message')) |
    (df['Mensaje'].str.contains('Eliminaste este mensaje.'))
    ].shape[0]

you_deleted = df[
    (df['Mensaje'].str.contains('This message was deleted')) |
    (df['Mensaje'].str.contains('Se eliminó este mensaje.'))
    ].shape[0]

# Deleted messages
me_name = 'ME'
you_name = 'YOU/US' #can be a group

# Emojies sends
df['Emojis'] = df['Mensaje'].apply(count_emojis) # Add 'Emoji' column
emojis = sum(df['Emojis'].str.len())

# Links shared
url_pattern = r'(https?://\S+)'
df['URLs'] = df.Mensaje.apply(lambda x: len(re.findall(url_pattern, x))) # Add 'URLs' column
links = sum(df['URLs'])

# Stats dict
stats_dict = {
        'Tipo': ['Mensajes', 
                 'Imagenes', 
                 'Videos', 
                 'Gifs', 
                 'Audios', 
                 'Emojis', 
                 'Stickers', 
                 'Ubicaciones', 
                 'Contactos',
                 'Links', 
                 'Mensajes eliminados', 
                 'Encuestas',
                 "Multimedia",
                 "Editados",
                 "Audios efimeros",
                 f"Borrado por {me_name}",
                 f"Borrado por {you_name}"
                 ],
        'Cantidad': [total_messages, 
                     images_send, 
                     videos_sends, 
                     gif_send, 
                     audio_send, 
                     emojis, 
                     stickers_send, 
                     locate_sends, 
                     contacts_sends, 
                     links, 
                     deleted_msgs, 
                     pools,
                     multimedia_sends,
                     editd_msgs,
                     once_voice_sends,
                     me_deleted,
                     you_deleted
                     ]
}


# Dict to dataframe
stats_df = pd.DataFrame(stats_dict, columns = ['Tipo', 'Cantidad'])

a = stats_dict['Cantidad'][0]
b = sum(stats_dict['Cantidad'][1:])

print(f"Tienes {b} de {a} mensajes en la tabla")

# Set Type column as index
stats_df = stats_df.set_index('Tipo')
stats_df




Tienes 713 de 2430 mensajes en la tabla


Unnamed: 0_level_0,Cantidad
Tipo,Unnamed: 1_level_1
Mensajes,2430
Imagenes,0
Videos,0
Gifs,0
Audios,0
Emojis,91
Stickers,0
Ubicaciones,0
Contactos,0
Links,11


### Messages type
Shows a pie chart with message types

In [9]:
fig_type_msg = px.pie(stats_df, values='Cantidad', names=stats_df.index, title='Tipos de mensajes')
fig_type_msg.update_traces(textposition='inside', textinfo='percent+label')

fig_type_msg.write_image(get_save_path(save_path, f'{chat_name}-type-message2.png'))
fig_type_msg.show()

### Emoji quantity
Show the total number of emojis and unique emojis

In [10]:
emoji_list = list([a for b in df.Emojis for a in b])
emoji_dict = dict(Counter(emoji_list))
emoji_dict = sorted(emoji_dict.items(), key=lambda x: x[1], reverse=True)

emoji_df = pd.DataFrame(emoji_dict, columns=['Emoji', 'Cantidad'])

emoji_df = emoji_df.set_index('Emoji')

print('Unique emojis: ', len(emoji_df), '\n')
emoji_df.head(10)

Unique emojis:  29 



Unnamed: 0_level_0,Cantidad
Emoji,Unnamed: 1_level_1
🤣,26
🤔,11
👍🏽,10
👀,5
🙃,4
😞,4
🫦,3
😅,3
😌,2
😘,2


### Most used emojis
Show pie chart of most used emojis

In [11]:
fig = px.pie(emoji_df, values='Cantidad', names=emoji_df.index, title='Pie de emojis')
fig.update_traces(textposition='inside', textinfo='percent+label')

# Uses font compatible with emojies
fig.update_layout(
    font=dict(
        family="Segoe UI Emoji",
        size=14,
        color="black" # font color
    )
)

fig.write_image(get_save_path(save_path, f'{chat_name}-emojies.png'))
fig.show()

### Determine the most active member(s) of the chat
Based on the messages quantity

In [12]:
df_active_members = df.groupby('Miembro')['Mensaje'].count().sort_values(ascending=False).to_frame()
df_active_members.reset_index(inplace=True)
df_active_members.index = np.arange(1, len(df_active_members)+1)
df_active_members

Unnamed: 0,Miembro,Mensaje
1,p. m. - Juanma Romano,846
2,p. m. - Anabel 🌷,682
3,a. m. - Anabel 🌷,452
4,a. m. - Juanma Romano,450


### Show most talk member(s)
Based on the messages quantity

In [13]:
fig_users = px.pie(df_active_members, values='Mensaje', names=df_active_members['Miembro'], title='Habladores')
fig_users.update_traces(textposition='inside', textinfo='percent+label')

fig_users.write_image(get_save_path(save_path, f'{chat_name}-most-talk.png'))
fig_users.show()

###  Show the number of words and letters per message for each member

In [14]:
# Separate messages (without multimedia) and multimedia (stickers, photos, videos)
options = [
    'imagen omitida', 
    'Video omitido', 
    'audio omitido', 
    'sticker omitido', 
    'Tarjeta de contacto omitida', 
    'Se eliminó este mensaje.', 
    'GIF omitido', 
    'Ubicación: https://maps.', 
    'ENCUESTA:', 
    'POOL:', 
    '<Media omitted>', 
    '<Multimedia omitido>', 
    '<This message was edited>', 
    '<Este mensaje fue editado>', 
    '<View once voice message omitted>', 
    '<Ver una vez omitido el mensaje de voz>', 
    'You deleted this message', 
    'Eliminaste este mensaje.', 
    'This message was deleted', 
    'Se eliminó este mensaje.'
]


multimedia_df = df[df['Mensaje'].isin(options)]
messages_df = df.drop(multimedia_df.index)

# Count the number of words and letters per message
messages_df['Letras'] = messages_df['Mensaje'].apply(lambda s : len(s))
messages_df['Palabras'] = messages_df['Mensaje'].apply(lambda s : len(s.split(' ')))
messages_df

fig_messages_data = px.scatter(
    messages_df, 
    x='Palabras', 
    y='Miembro', 
    size='Letras', 
    color='Miembro',  # Optional, to differentiate members by color
    hover_name='Miembro',  # Optional, to display the member's name when hovering over the bubble
    title='Relación entre cantidad de palabras y longitud de palabras por miembro'
)

fig_messages_data.write_image(get_save_path(save_path, f'{chat_name}-word-frecuency.png'))
fig_messages_data.show()





### Create a JSON objet with the members data
{
    member: [
        messages,
        words,
        multimedia,
        emojies,
        links
    ]
}

In [15]:
miembros = messages_df.Miembro.unique()

dictionary = {}

for i in range(len(miembros)):
    list_counts = []

    # Filter messages from a specific member
    member_df= messages_df[messages_df['Miembro'] == miembros[i]]

    # Add messages count
    list_counts.append(member_df.shape[0])
    
    # Words per message
    words_per_message = (np.sum(member_df['Palabras']))/member_df.shape[0]
    list_counts.append(words_per_message)

    # Multimedia messages
    multimedia = multimedia_df[multimedia_df['Miembro'] == miembros[i]].shape[0]
    list_counts.append(multimedia)

    # Count emojies
    emojis = sum(member_df['Emojis'].str.len())
    list_counts.append(emojis)

    # Count links
    links = sum(member_df['URLs'])
    list_counts.append(links)

    # Create the key/pair values for the member on the dictionary.
    dictionary[miembros[i]] = list_counts
    
print(dictionary)

{'p.\u202fm. - Anabel 🌷': [465, 5.318279569892473, 217, 3, 2], 'p.\u202fm. - Juanma Romano': [661, 6.178517397881997, 185, 54, 6], 'a.\u202fm. - Juanma Romano': [358, 5.4860335195530725, 92, 31, 1], 'a.\u202fm. - Anabel 🌷': [337, 4.732937685459941, 115, 3, 2]}


###  Show the number of words and letters per message for each member (includes multimedia messages)

In [16]:
# Convert from dictionary to dataframe
member_stats = pd.DataFrame.from_dict(dictionary)

# Change the index for the aggregate column 'Statistics'
stats = ['Mensajes', 'Palabras por mensaje', 'Multimedia', 'Emojis', 'Links']
member_stats['Estadísticas'] = stats
member_stats.set_index('Estadísticas', inplace=True)

# Transpose the dataframe
member_stats = member_stats.T

# Reset index to Member
member_stats.reset_index(inplace=True)
member_stats.rename(columns={'index': 'Miembro'}, inplace=True)

# Convert to integrate the Messages, Multimedia Emojis and Links columns
member_stats['Mensajes'] = member_stats['Mensajes'].apply(int)
member_stats['Multimedia'] = member_stats['Multimedia'].apply(int)
member_stats['Emojis'] = member_stats['Emojis'].apply(int)
member_stats['Links'] = member_stats['Links'].apply(int)
member_stats.sort_values(by=['Mensajes'], ascending=False)

fig_messages_data = px.scatter(
    member_stats, 
    x='Palabras por mensaje', 
    y='Mensajes', 
    size='Emojis', 
    color='Multimedia',  # Optional, to differentiate members by color
    hover_name='Mensajes',  # Optional, to display the member's name when hovering over the bubble
    title='Frecuencia y tipos de mensajes (con tamaño en emojies)',
    text='Miembro'
)


fig_messages_data.write_image(get_save_path(save_path, f'{chat_name}-total-word-frecuency.png'))
fig_messages_data.show()

### Generates a range_per_hour column

In [17]:
df['rangoHora'] = pd.to_datetime(df['Hora'], format='%H:%M:%S') # TODO check

# Define a function to create the "Range Hour" column
def create_range_hour(hour):
    start_hour = hour.hour
    end_hour = (hour + pd.Timedelta(hours=1)).hour
    return f'{start_hour:02d} - {end_hour:02d} h'

# # Apply the function to create the "Range Hour" column
df['rangoHora'] = df['rangoHora'].apply(create_range_hour)
df

ValueError: time data "8:01" doesn't match format "%H:%M:%S", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

### Plot the "range per hour" messages

In [18]:
# Group by 'rangoHora' and count occurrences
date_df = df.groupby('rangoHora').size().reset_index(name='# Mensajes por hora')

# Plot the number of messages against time
fig = px.line(date_df, x='rangoHora', y='# Mensajes por hora')


fig.update_layout(
    title={'text': 'Cantidad de mensajes durante del dia',
           'y':0.96,
           'x':0.5,
           'xanchor': 'center'},
    font=dict(size=12))
fig.update_xaxes(title_text='Rango de hora', tickangle=30)
fig.update_yaxes(title_text='Mensajes')

fig.write_image(get_save_path(save_path, f'{chat_name}-range-per-hour.png'))
fig.show()



KeyError: 'rangoHora'

### Plot messages frecuency for the time in days

In [None]:
# Group by 'Fecha' and count occurrences
date_df = df.groupby('Fecha').size().reset_index(name='# Mensajes por día')

fig = px.line(date_df, x='Fecha', y='# Mensajes por día')

fig.update_layout(
    title={'text': 'Cantidad de mensajes vs Tiempo',
           'y':0.96,
           'x':0.5,
           'xanchor': 'center'},
    font=dict(size=17))
fig.update_xaxes(title_text='Fecha')
fig.update_yaxes(title_text='# Mensajes')

fig.write_image(get_save_path(save_path, f'{chat_name}-time-frecuency-in-days.png'))
fig.show()

In [None]:
# Create a string that will contain all the words to avoid in the wordcloud
total_words = ' '

stop_words_list = ['que', 'qué', 'con', 'de', 'te', 'en', 'la', 'lo', 'le', 'el', 'las', 'los', 'les', 'por', 'es',
                   'son', 'se', 'para', 'un', 'una', 'chicos', 'su', 'si', 'chic','nos', 'ya', 'hay', 'esta',
                   'pero', 'del', 'mas', 'más', 'eso', 'este', 'como', 'así', 'todo', 'https','Media','omitted',
                   'y', 'mi', 'o', 'q', 'yo', 'al', 'omitido', 'jajaja', 'jajajaja', 'jajajajaja', 'jajajajajaja', 
                   'jajajajajajaja', 'añadió', 'eliminó', 'cambió', 'gif', 'vm', 'multimedia', 'null', 'sticker',
                   "ese", "esa", ""
                   ]

# Add all members to the list.
members_names = df['Miembro'].unique().tolist()
stop_words_list += members_names

stopwords = STOPWORDS.update(stop_words_list)

# Get and accumulate all the words of each message
for message in messages_df['Mensaje'].values:
    words = str(message).lower().split() # Get the words of each line of the txt
    for word in words:
        total_words = total_words + word + ' ' # Accumulate all the words

wordcloud = WordCloud(width = 1000, height = 700,
                background_color ='white',
                stopwords = stopwords,
                min_font_size = 10).generate(total_words)

# Plot the cloud of most used words
image = wordcloud.to_image()
image.save(get_save_path(save_path, f'{chat_name}-wordcloud.png'))
image

## San Valentin
Create a word cloud with love words, additional words, and custom words. The word cloud uses a custom color palette and you can use a custom font.

In [None]:

font_path = ''
font_path = 'fonts/Pacifico-Regular.ttf' #2: G

# Target words
word_target_list = [
    "Amor", "Te amo", "Amorcito", "Corazon", "Corazón", "Mi vida",
    "Te quiero", "Querida", "Querido",
    "Besos", "Besitos", "Beshitos", "Beshos", "Besos",
    "Bonito", "Bonita", "Mi cielo", "Preciosa", "Precioso", "Ángel", "Angel"
    "Gorda", "Gordo", "Chiquito", "Chiquita", "Niño", "Niña", "Muñeca", "Muñeco", "Bebé", "Bebe"
    "Princesa", "Príncipe", "Principe", "Hermosa", "Hermoso", "Mi rey", "Mi reina",
    "Cariño", "Tesoro", "Bombón", "Dulzura",
    "Preciosa", "Precioso",
    "Mi sol", "Mi rey", "Mi reina",
]

# Extra words (more hot)
word_extra = ["Sexy", "Mami", "Papi",]
word_target_list += word_extra

# Custom words for romantic nickname
custom_words = ['bby']
word_target_list += custom_words


total_words = ''
word_target_list = [word.lower() for word in word_target_list]

# Check words in dataframe (this must be initialized)
for message in df['Mensaje']:
    words = str(message).lower().split()
    for word in words:
        if word in word_target_list:
            total_words = total_words + word + ' '

# Show wordcloud
if font_path:
    wordcloud = WordCloud(width = 1000, height = 700,
                        font_path=font_path,
                        background_color = color_palette[0],
                        stopwords = stopwords,
                        color_func=custom_color,
                        min_font_size = 10).generate(total_words)
else:
    wordcloud = WordCloud(width = 1000, height = 700,
                    background_color = color_palette[0],
                    stopwords = stopwords,
                    color_func=custom_color,
                    min_font_size = 10).generate(total_words)

image = wordcloud.to_image()

# Save as a new image
image.save(get_save_path(save_path, f'{chat_name}-wordcloud-love.png'))

image