In [1]:
import plotly.express as px
import os
import pandas as pd
import re
import datetime as time
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import emoji
import json
from collections import Counter
from wordcloud import WordCloud, STOPWORDS

In [2]:
def _fix_mojibake(text):
    """Repair one string from a Meta export, returning it unchanged if it cannot be repaired.

    Meta writes UTF-8 bytes as if they were latin-1 code points, so the text has
    to be re-encoded as latin-1 and decoded again as UTF-8.
    """
    try:
        return text.encode('latin_1').decode('utf-8')
    except (UnicodeEncodeError, UnicodeDecodeError):
        # The string is not double-encoded (e.g. it was already repaired);
        # keep it instead of aborting the whole JSON load.
        return text


def parse_obj(obj):
    """``object_hook`` for ``json.load`` that fixes the text encoding of a Meta export.

    Every string value of ``obj``, and every string inside a list value, is
    repaired; values of any other type are left untouched.

    Parameters
    ----------
    obj : dict
        A JSON object as decoded by the ``json`` module. It is modified in place.

    Returns
    -------
    dict
        The same ``obj`` instance, with its strings repaired.
    """
    for key, value in obj.items():
        if isinstance(value, str):
            obj[key] = _fix_mojibake(value)
        elif isinstance(value, list):
            obj[key] = [
                _fix_mojibake(item) if isinstance(item, str) else item
                for item in value
            ]
    return obj

In [3]:
def load_messages(path):
    """Load the messages of one Messenger conversation export into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains ``message_1.json``. A trailing separator is
        optional.

    Returns
    -------
    pandas.DataFrame
        The export's ``messages`` list flattened with ``pd.json_normalize``.
    """
    # The context manager closes the handle even when parsing fails.
    with open(os.path.join(path, 'message_1.json'), encoding='utf-8') as file:
        # parse_obj repairs Meta's text encoding while the JSON is decoded.
        data = json.load(file, object_hook=parse_obj)
    return pd.json_normalize(data['messages'])


In [4]:
#Now, we need to clean the data

def clean_data(df):
    """Return a cleaned copy of the raw message frame, keeping only text-related data.

    The input frame is not modified, so the function can be re-run on the same
    raw data.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw messages; must contain ``timestamp_ms`` and ``sender_name``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with a ``date_time`` column derived from ``timestamp_ms``,
        ``year`` / ``hour`` / ``weekday`` columns derived from it, the non-text
        columns removed, rows with an empty ``sender_name`` removed and missing
        ``content`` replaced by ``''``. The original index labels are kept.
    """
    # Columns that are not used by the text analysis.
    unused_columns = ['timestamp_ms','is_unsent','photos','type','videos','audio_files','sticker.uri',
                      'call_duration','share.link','is_taken_down','bumped_message_metadata.bumped_message','bumped_message_metadata.is_bumped']

    cleaned = df.copy()

    # A usable timestamp instead of raw milliseconds.
    cleaned['date_time'] = pd.to_datetime(cleaned['timestamp_ms'], unit='ms')

    # Not every conversation contains every kind of attachment, so a missing
    # column must not abort the cleaning.
    cleaned = cleaned.drop(columns=unused_columns, errors='ignore')

    cleaned['year'] = cleaned['date_time'].dt.year
    cleaned['hour'] = cleaned['date_time'].dt.hour
    cleaned['weekday'] = cleaned['date_time'].dt.weekday

    # Exclude non-participating senders; .copy() makes the result an independent
    # frame so the assignment below does not hit a view of `cleaned`.
    cleaned = cleaned[~cleaned['sender_name'].isin([''])].copy()

    if 'content' in cleaned.columns:
        cleaned['content'] = cleaned['content'].fillna('')
    else:
        # Conversation without any text message.
        cleaned['content'] = ''

    return cleaned



In [5]:
# Read the raw export, clean it, and preview the first rows.
path = "../input/message-1/"
df = clean_data(load_messages(path))
df.head()

In [None]:
# Print a summary of the frame returned by clean_data.
df.info()

# **Numerical message analysis**

In [7]:
#Number of messages by sender

#Tag every message with a 1 on a copy of the frame, then count the tags per
#sender and order the senders from most to least active
df1 = (
    df.assign(Number_of_messages=1)
      .drop(columns='date_time')
      .groupby('sender_name')['Number_of_messages']
      .count()
      .sort_values(ascending=False)
      .reset_index()
)
df1

In [28]:
#Formatting
sns.set_style("darkgrid")

#Figure defaults: font size, figure size and a fully transparent background
matplotlib.rcParams['font.size'] = 12
matplotlib.rcParams['figure.figsize'] = (12, 9)
matplotlib.rcParams['figure.facecolor'] = '#00000000'
fig, ax = plt.subplots()

#Bar chart of the message count per sender.
#x and y are given by column name: seaborn 0.12+ no longer accepts them as
#positional arguments, and this form also works on older releases.
sns.barplot(x='sender_name', y='Number_of_messages', hue='sender_name', data=df1,
            dodge=False, palette="husl", ax=ax)
ax.set_title("Number of messages")

def change_width(ax, new_value):
    """Set every bar of ``ax`` to ``new_value`` wide while keeping it centred.

    Each patch in ``ax.patches`` is resized, then shifted right by half of the
    width it lost so that its centre stays where it was.
    """
    for bar in ax.patches:
        shrink = bar.get_width() - new_value

        # resize first, then move the left edge to re-centre the bar
        bar.set_width(new_value)
        bar.set_x(bar.get_x() + shrink * .5)

# Narrow every bar of the chart to a width of 0.35, then render the figure.
change_width(ax, .35)
plt.show()

# **Most used emojis**

In [17]:
import regex
def split_count(text):
    """Return the grapheme clusters of ``text`` that contain an emoji code point.

    ``text`` is split with the ``regex`` module's ``\\X`` pattern (one match per
    extended grapheme cluster). A cluster is kept when at least one of its
    characters is a key of the ``emoji`` package's lookup table.

    Parameters
    ----------
    text : str
        Text to scan.

    Returns
    -------
    list of str
        The matching clusters, in order of appearance (duplicates included).
    """
    # The layout of the lookup table depends on the installed `emoji` release:
    # newer releases expose EMOJI_DATA, older ones UNICODE_EMOJI, which is
    # either flat or keyed by language ('en', ...).
    emoji_table = getattr(emoji, 'EMOJI_DATA', None)
    if emoji_table is None:
        emoji_table = emoji.UNICODE_EMOJI
        if 'en' in emoji_table:
            emoji_table = emoji_table['en']

    return [
        cluster
        for cluster in regex.findall(r'\X', text)
        if any(char in emoji_table for char in cluster)
    ]

In [20]:
#Number of emojis

#Copying the dataset
df2 = df.copy()

#Resolve the emoji lookup table. Its layout depends on the installed `emoji`
#release: newer releases expose EMOJI_DATA, older ones UNICODE_EMOJI, which is
#either flat or keyed by language. Taking .keys() of the language-keyed table
#would yield 'en', 'es', ... and the pattern would count letters, not emojis.
emoji_table = getattr(emoji, 'EMOJI_DATA', None)
if emoji_table is None:
    emoji_table = emoji.UNICODE_EMOJI
    if 'en' in emoji_table:
        emoji_table = emoji_table['en']

#Regex alternation takes the first alternative that matches, so the longest
#sequences go first; otherwise a multi-code-point emoji is counted as its prefix.
emojis_list = sorted(emoji_table, key=len, reverse=True)
r = re.compile('|'.join(re.escape(p) for p in emojis_list))

#Count every emoji occurrence over all messages
emoji_ctr = Counter()
for content in df2['content']:
    emoji_ctr.update(r.findall(content))

#Ten most used emojis, most frequent first (fewer rows if fewer were used)
emojis_df = pd.DataFrame(emoji_ctr.most_common(10), columns=['Emoji', 'Number_of_Emojis'])

emojis_df

In [26]:

#Share of each of the most used emojis, labelled inside the slices
fig = px.pie(
    emojis_df,
    values='Number_of_Emojis',
    names='Emoji',
    title='Emoji percentage used in chat group',
)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

# **Most active hours**

In [22]:
#Most active hour in messenger
df3 = df.copy()

#One unit per message, plus the hour of day the message was sent
df3['Number_of_messages'] = 1
df3['hours'] = df3['date_time'].map(lambda stamp: stamp.hour)

#Non-null count of every column per hour, ordered by hour
time_df = (
    df3.groupby('hours')
       .count()
       .reset_index()
       .sort_values(by='hours')
)

time_df

In [23]:
#Create the formatting of the graph
matplotlib.rcParams['font.size'] = 20
matplotlib.rcParams['figure.figsize'] = (20, 8)

sns.set_style("darkgrid")

plt.title('Most active hours in Messenger')
#x and y are given by column name: seaborn 0.12+ no longer accepts them as
#positional arguments, and this form also works on older releases.
sns.barplot(x='hours', y='Number_of_messages', data=time_df, dodge=False)
plt.show()

# **Most used words**

In [24]:
df5 = df.copy()

#All messages joined into one space-separated text
word = " ".join(df5.content)

#Default stop words plus chat-specific words/text that are commonly used
#(eg. the, yes, no, bye, or and is) and should not appear in the cloud
stopwords = set(STOPWORDS) | {
    "Sbe7", "Ey", "l5ir", "sa7a", "chribetkom", "Eyh", "W", "fi", "sbe7 l5ir",
    "bonjour", "réagi", "votre", "à", "ama", "kol", "bch", "Inchala", "Ena",
}

#Creating a word cloud
wordcloud = WordCloud(
    width=500,
    height=500,
    stopwords=stopwords,
    background_color="pink",
    min_font_size=10,
).generate(word)

#Show the cloud without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()


In [25]:
# Display the word cloud generated above via its to_image() result.
wordcloud.to_image()