### Telegram chat analysis
#### author: Luis Durazo (github.com/ldurazo)

This project analyzes any Telegram chat and gives you some curious, fun, and sometimes meaningful information.

First, load the dataframe from the data folder.

In [None]:
import re
from collections import Counter
import emoji
import numpy
import pandas as pd
from matplotlib import pyplot
from wordcloud import WordCloud, STOPWORDS
# Parse the exported chat; from_id is read as text so that later cells can
# compare it against string ids.
df = pd.read_json(
    '../data/telegram.json',
    dtype={'from_id': str},
)
df.info()

Get all unique users.

In [None]:
# One row per (from_id, from) pair with its number of messages. This replaces
# groupby(...).apply(print), which printed every group in full and left a table
# of None values as the cell output.
# Missing names are filled with the placeholder 'NaN' first so that those rows
# are not silently dropped by groupby.
(
    df[['from_id', 'from']]
    .fillna('NaN')
    .groupby(['from_id', 'from'])
    .size()
)

TODO ldurazo: need to find a way to hide this away more effectively.
2020/10/2 Observations with Telegram group:
- id: x (Ma) - 29298
- id: x (Unknown) - 4
- id: x (A) - 34283
- id: x (C) - 6201
- id: x (D) - 10191
- id: x (R) - 17116
- id: x (I) - 32889
- id: x (Mi) - 18727
- id: x (Unknown) - 5817
- id: x (L) - 15242
- id: x (X) - 3771
- id: nan (NaN) - 70

With the data above, it is time to uncover the unknowns so that we can keep only the name and drop the id.
By looking at a few data samples, the message sources were easy to determine:

In [None]:
# (from_id, display name) pairs for senders whose name had to be worked out by
# hand. They are applied in order, so a later pair overrides an earlier one
# that targets the same id.
# NOTE(review): the ids carry a '.0' suffix, presumably because they were
# numeric before being read as strings - confirm against the export.
manual_names = (
    ('x.0', "L"),
    ('x.0', "W"),
    ('x.0', "X"),
)
for sender_id, sender_name in manual_names:
    df.loc[df.from_id == sender_id, 'from'] = sender_name

# Rows without a sender id come from a Telegram service (likely updates), so
# they are left out.
df = df.loc[df.from_id.ne('nan')]

Now remove the from_id column, as we have the names:

In [None]:
# The sender name is all we need from here on, so the id column goes away.
df = df.drop(columns=['from_id'])
df.info()

Let's take a look at the data again, and see who has the most messages

In [None]:
# Rank senders by their number of non-null `type` entries, busiest first.
(
    df[['type', 'from']]
    .groupby(['from'])
    .count()
    .sort_values(by=['type'], ascending=False)
)

What are the message types and their counts?

In [None]:
# Number of messages (non-null ids) recorded for each media type.
(
    df[['media_type', 'id']]
    .groupby('media_type', as_index=False)
    .agg('count')
)

By message type, let's see some charts.

In [None]:
import plotly.express as px


def count_media_per_person(messages, media_type):
    """Count the messages of one media type sent by each person.

    Parameters:
        messages: the chat DataFrame; must provide the 'media_type', 'from'
            and 'id' columns.
        media_type: value of the 'media_type' column to keep,
            e.g. 'voice_message' or 'sticker'.

    Returns:
        A DataFrame with one row per sender ('from') and the number of
        matching messages in 'id', sorted from most to fewest.
    """
    return (
        messages.loc[messages['media_type'] == media_type, ['from', 'id']]
        .groupby(['from'], as_index=False)
        .agg('count')
        .sort_values(['id'], ascending=False)
    )


def plot_media_pie(counts, title):
    """Build a pie chart from the per-person counts of count_media_per_person.

    Parameters:
        counts: DataFrame with the sender in 'from' and the count in 'id'.
        title: chart title.

    Returns:
        The plotly figure; the caller decides when to show it.
    """
    pie = px.pie(counts, values='id', names='from', title=title)
    pie.update_traces(textposition='inside', textinfo='value+label+percent')
    return pie


voice_df = count_media_per_person(df, 'voice_message')
fig = plot_media_pie(voice_df, 'Voice messages per person')
fig.show()

sticker_df = count_media_per_person(df, 'sticker')
fig = plot_media_pie(sticker_df, 'Stickers sent')
fig.show()


Now we are going to see information about emojis and words used.

In [None]:
def _known_emojis():
    """Return a container of emoji characters for the installed emoji package.

    The package has exposed its table under different names and shapes across
    releases, so the lookup is resolved here rather than assuming one layout.
    """
    # Newer releases publish the table as EMOJI_DATA and no longer provide
    # UNICODE_EMOJI.
    table = getattr(emoji, "EMOJI_DATA", None)
    if table is not None:
        return table
    table = emoji.UNICODE_EMOJI
    # Some releases nest UNICODE_EMOJI per language ({'en': {...}, ...}); older
    # ones are a flat emoji -> name mapping. A plain `char in UNICODE_EMOJI`
    # test on the nested layout would silently match nothing.
    return table.get("en", table)


def get_emojis_in_message(row):
    """Extract the emoji characters of a message row.

    Parameters:
        row: an object exposing the message as ``row.text``.

    Returns:
        A string with every emoji character of the message, in order of
        appearance ('' when there is none), or None when the message is not a
        plain string. Characters are tested one at a time, so an emoji made of
        several code points is only picked up through the individual
        characters that the table knows about.
    """
    message = row.text
    # Telegram may save some messages as json
    if not isinstance(message, str):
        return None
    known = _known_emojis()
    return "".join(char for char in message if char in known)

def get_words_count(row):
    """Count the words of a message row.

    A word is a maximal run of word characters (``\\w``); everything else acts
    as a separator.

    Parameters:
        row: an object exposing the message as ``row.text``.

    Returns:
        The number of words as an int, or None when the message is not a plain
        string.
    """
    message = row.text
    # Telegram may save some messages as json
    if not isinstance(message, str):
        return None
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    return len(re.findall(r"\w+", message))

# Both helpers read `row.text`, so they are applied row by row to a frame that
# only carries the text column.
text_only = df[["text"]]
df["emojis"] = text_only.apply(get_emojis_in_message, axis=1)
df["word_count"] = text_only.apply(get_words_count, axis=1)
# Uncomment to inspect the derived columns:
# df[["text", "from", "id", "emojis", "word_count"]].sort_values(['emojis'], ascending=False)

Let's get some data about our friends:

In [None]:
people = df['from'].unique()

# For every sender: total words written and the average over all of their
# messages (messages without a word count still count as messages).
for name in people:
    user_df = df.loc[df["from"] == name]
    message_count = len(user_df)
    total_words = user_df['word_count'].sum()
    print('stats for ', name)
    print(name,' sent  ', int(total_words), ' words, average ', total_words/message_count, ' per message')

In [None]:
import plotly.express as px

# Count each message's emoji string as one entry (a message with several
# emojis is a single key). Messages without emojis are left out explicitly:
# the column holds None for non-text messages and '' for text without emojis.
# The earlier replace(...).dropna() calls discarded their result, so the chart
# depended on skipping the first two rows by position instead.
emoji_counts = Counter(entry for entry in df.emojis.dropna() if entry)

# most_common() yields (emoji, count) pairs from most to least frequent.
emoji_df = pd.DataFrame(emoji_counts.most_common(), columns=['emoji', 'count'])

fig = px.pie(emoji_df.head(60), values='count', names='emoji',
             title='Emoji Distribution')
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

Let's find out about the per user emoji usage

In [None]:
# Imported once, rather than on every loop iteration.
import plotly.express as px

# Missing sender names are skipped: they have no messages to chart and cannot
# be concatenated into the chart title.
people = df['from'].dropna().unique()

for name in people:
    user_df = df[df["from"] == name]

    # Count each message's emoji string as one entry, leaving out messages
    # without emojis (None for non-text messages, '' for text without emojis).
    # Filtering by value matters here: skipping the first two rows by position
    # would drop a real emoji for any user whose top rows are not None and ''.
    emoji_counts = Counter(entry for entry in user_df.emojis.dropna() if entry)

    # most_common() yields (emoji, count) pairs from most to least frequent.
    emoji_df = pd.DataFrame(emoji_counts.most_common(), columns=['emoji', 'count'])

    fig = px.pie(emoji_df.head(60), values='count', names='emoji',title='Emoji Distribution for ' + name)
    fig.update_traces(textposition='inside', textinfo='percent+label')
    fig.show()

Let's now check for a word cloud of all of our messages.

In [None]:
# Join every plain-string message into one corpus; entries that are not
# strings (Telegram may save some messages as json) are skipped.
text = " ".join(message for message in df.text.dropna() if isinstance(message, str))
# Count whitespace-separated words; len(text) would report characters instead.
print ("There are {} words in all the messages.".format(len(text.split())))

stopwords = set(STOPWORDS)
# Additionally, there is a file in this repo that contains the spanish stopwords that can be appended to the wordcloud library
stopwords.update([
    "pero", "en", "que", "lo", "de", "si", "con", "jajaja", "Jajajaja", "se",
    "tengo", "por", "la", "el", "ya", "los", "es", "tiene", "como", "mi",
    "te", "un", "esta", "del", "tu", "Yo", "eso", "pue", "para", "las",
    "porque", "al", "Jajajajaja", "bueno", "al", "donde", "ese", "son", "una",
    "jaja", "ese", "sí", "son", "le", "está", "estaba", "dice", "creo",
])
# Generate a word cloud image
wordcloud = WordCloud(stopwords=stopwords, background_color="white").generate(text)
# Display the generated image:
# the matplotlib way:

pyplot.figure( figsize=(10,5))
pyplot.imshow(wordcloud, interpolation='bilinear')
pyplot.axis("off")
pyplot.show()