## WhatsApp data analysis
The first cell below simply loads some helper functions and reads the exported chat.

In [None]:
import os
import re
import json
from datetime import datetime

date_format = "%d/%m/%Y, %H:%M:%S"

def list_txt_files(directory_path):
    """Return the names of the ``.txt`` files located directly in a directory.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        A sorted list of file names (not full paths) that end in ``.txt``.
        Directories whose name happens to end in ``.txt`` are left out.
    """
    # os.listdir yields entries in arbitrary order; sorting makes the order in
    # which callers process the files reproducible. The isfile check keeps
    # directories out, since callers pass these names straight to open().
    return sorted(
        entry
        for entry in os.listdir(directory_path)
        if entry.endswith(".txt")
        and os.path.isfile(os.path.join(directory_path, entry))
    )

def read_txt_file(file_path):
    """Return the full text of the UTF-8 encoded file at ``file_path``."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

def extract_data_from_txt(file_path):
    """Parse a bracket-style chat export into a list of message records.

    A message line is expected to look like ``[<date>] <sender>: <message>``.

    Args:
        file_path: Path of the UTF-8 encoded export file.

    Returns:
        A list of dicts with the keys ``"date"``, ``"sender"`` and
        ``"message"`` (whitespace-stripped strings), in file order. Lines
        that do not begin with a ``[<date>] <sender>: `` header (for example
        the continuation lines of a multi-line message) are skipped.
    """
    # One anchored pattern instead of three independent searches: the date
    # must be the bracket at the start of the line, the sender is everything
    # up to the first colon after it, and the message is the remainder.
    # This stops a stray line such as "see [link] here: ok" from being
    # recorded as a message whose date is "link".
    # A leading BOM or direction mark before the bracket is tolerated.
    line_pattern = re.compile(
        r'^[\s\u200e\u200f\ufeff]*\[([^\]]*)\]([^:]*): \s*(.*)'
    )

    messages = []
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            match = line_pattern.match(line)
            if match is None:
                continue
            date, sender, message = match.groups()
            messages.append({
                "date": date.strip(),
                "sender": sender.strip(),
                "message": message.strip(),
            })
    return messages

# The exports are looked up in the current working directory. This is the same
# location the previous dirname(abspath("__file__")) expression resolved to,
# because "__file__" was a plain string joined onto the working directory.
notebook_directory = os.getcwd()

# The .txt exports are expected to sit next to the notebook; joining with ""
# only appends a trailing path separator.
data_directory = os.path.join(notebook_directory, "")
available_txt_files = list_txt_files(data_directory)

print("Available .txt files in the /data directory:")
jsondata = []  # never populated; kept so the name stays defined
data = []      # messages from every export, so later cells always find `data`
for file in available_txt_files:
    print(file)
    file_path = os.path.join(data_directory, file)
    # Read through the full path and accumulate, so that every listed file
    # contributes instead of only the last one surviving the loop.
    data.extend(extract_data_from_txt(file_path))

# JSON rendering of all parsed messages.
json_data = json.dumps(data, indent=2)


#### Number of messages sent

In [None]:
# Report how many messages were parsed from the export.
message_total = len(data)
print(f'There are  {message_total} messages sent')

#### Converting the data into a dataframe for better analysis

In [None]:
import pandas as pd

# Build the table of messages, one row per parsed record.
df = pd.DataFrame(data)

# Parse the textual timestamps; values that do not follow the expected
# day/month/year layout become NaT instead of raising.
parsed_dates = pd.to_datetime(
    df['date'],
    format='%d/%m/%Y, %H:%M:%S',
    errors='coerce',
)

# Use the parsed timestamps as the index so time-based grouping works.
df = df.assign(date=parsed_dates).set_index('date')

#### Simply view the dataframe to ensure everything is ok

In [None]:
# Bare expression: a notebook renders the value of a cell's last expression,
# so this displays the DataFrame for a visual check.
df

#### Get the most active WhatsApp users by number of messages

In [None]:
# Count the rows per sender, then show the ten senders with the most messages.
rows_per_sender = df.groupby(['sender']).count()
rows_per_sender.sort_values(by='message', ascending=False).head(10)

#### Get the least active WhatsApp users by number of messages

In [None]:
# Count the rows per sender, order from most to fewest messages, and show the
# last ten entries, i.e. the ten senders with the fewest messages.
rows_per_sender = df.groupby(['sender']).count()
rows_per_sender.sort_values(by='message', ascending=False).tail(10)

#### Get the number of messages sent every month for the period of the exported chat

In [None]:
# Bucket the rows by calendar month using the timestamp index, then count the
# rows in each bucket.
monthly_bins = df.resample('M')
messages_per_month = monthly_bins.size()

# Show the monthly totals.
print(messages_per_month)

#### Visualizing the messages sent per month

In [None]:
import matplotlib.pyplot as plt
# Draw the monthly totals as a line chart.
plt.figure(figsize=(10, 6))
messages_per_month.plot.line(color='skyblue')

# Titles, axis labels and cosmetics.
plt.title('Number of Messages Sent per Month')
plt.xlabel('Month')
plt.ylabel('Number of Messages')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure.
plt.show()

#### Calculate the month-over-month percentage change (negative values indicate a decrease)

In [None]:
# Relative change of each month's total versus the previous month. The first
# month has no predecessor and is dropped; the result is scaled to percent.
monthly_change = messages_per_month.pct_change()
rate_of_decrease = monthly_change.dropna() * 100
rate_of_decrease

#### Visualize the number of messages sent individually by members of the chat

In [None]:
# Number of messages written by each sender.
sender_message_counts = df.groupby('sender').size()

# Keep the ten most active senders (change the argument to show more or fewer).
top_senders = sender_message_counts.nlargest(10)

# Restrict the data to messages written by those senders.
is_top_sender = df['sender'].isin(top_senders.index)
df_top_senders = df[is_top_sender]

# Monthly message totals for every one of the selected senders.
monthly_grouper = pd.Grouper(freq='M')
messages_per_month_sender = df_top_senders.groupby(['sender', monthly_grouper]).size()

# One line per sender on a shared figure.
plt.figure(figsize=(10, 6))

for sender in top_senders.index:
    messages_per_sender = messages_per_month_sender[sender]
    plt.plot(
        messages_per_sender.index,
        messages_per_sender.values,
        marker='o',
        label=sender,
    )

plt.title('Number of Messages Sent by Individual Senders per Month')
plt.xlabel('Month')
plt.ylabel('Number of Messages')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.legend(loc='upper left')
plt.tight_layout()

# Render the figure.
plt.show()

#### Visualize the messages sent per hour in the chat

In [None]:
# Hour of day taken from each timestamp in the index, ignoring the date.
hour_of_day = df.index.hour

# Number of messages that fall into each hour.
messages_per_hour = df.groupby(hour_of_day).size()

print(messages_per_hour)

#### Visualize the data

In [None]:
# A bar plot places its bars at positions 0..n-1, whatever the index values
# are. Filling in hours without messages as 0 guarantees 24 bars, so position
# and hour coincide and the ticks and trend line below line up with the bars.
hourly_counts = messages_per_hour.reindex(range(24), fill_value=0)

# Create a figure and axis object
fig, ax = plt.subplots(figsize=(12, 6))

# Create a bar plot for the number of messages sent per hour
hourly_counts.plot(kind='bar', color='skyblue', ax=ax)
ax.set_xlabel('Hour')
ax.set_ylabel('Number of Messages')
ax.set_title('Number of Messages Sent per Hour (Across All Days)')
ax.set_xticks(range(24))
ax.grid(axis='y', linestyle='--', alpha=0.7)

# Overlay a dashed line through the same counts to show the trend; x uses the
# bar positions so each marker sits on its bar.
ax.plot(range(24), hourly_counts.values, marker='o', color='orange', linestyle='dashed', label='Trend')
ax.legend(loc='upper right')

# Display the plot
plt.tight_layout()
plt.show()

#### Get the number of messages sent on each day of the week

In [None]:
# Group the DataFrame by weekday number (0 = Monday ... 6 = Sunday) and count
# the number of messages sent on each.
messages_per_weekday = df.groupby(df.index.weekday).size()

# Days on which nobody wrote are missing from the grouping. Insert them with a
# count of 0 so there are always seven entries in Monday-to-Sunday order;
# otherwise assigning the seven names below raises a length-mismatch error.
messages_per_weekday = messages_per_weekday.reindex(range(7), fill_value=0)

# Rename the weekdays to their respective names
weekdays_names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
messages_per_weekday.index = weekdays_names


print(messages_per_weekday)

#### Visualize the data

In [None]:
# Draw one bar per weekday.
plt.figure(figsize=(10, 6))
messages_per_weekday.plot.bar(color='skyblue')

# Titles, axis labels and cosmetics.
plt.title('Number of Messages Sent on Weekdays')
plt.xlabel('Weekday')
plt.ylabel('Number of Messages')
plt.grid(axis='y', linestyle='--', alpha=0.7)
plt.xticks(rotation=45)
plt.tight_layout()

# Render the figure.
plt.show()

#### Download the NLTK VADER lexicon for simple sentiment analysis

In [None]:
import nltk

# Fetch the lexicon resource used by the sentiment analysis cell below.
vader_resource = 'vader_lexicon'
nltk.download(vader_resource)



#### Run sentiment analysis on the messages

In [None]:
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Sentiment Analysis function
# Shared analyzer instance, created on first use.
_sentiment_analyzer = None


def perform_sentiment_analysis(text):
    """Return the VADER polarity scores for ``text``.

    Args:
        text: The message to score.

    Returns:
        Whatever ``SentimentIntensityAnalyzer.polarity_scores`` returns for
        ``text``; callers in this notebook read its ``'compound'`` entry.
    """
    global _sentiment_analyzer
    # Build the analyzer once and reuse it, instead of constructing a new one
    # for every message when this function is applied to a whole column.
    if _sentiment_analyzer is None:
        _sentiment_analyzer = SentimentIntensityAnalyzer()
    return _sentiment_analyzer.polarity_scores(text)

# Score every message.
df['Sentiment_Scores'] = df['message'].apply(perform_sentiment_analysis)


def _label_sentiment(scores):
    """Map a score dict to 'Positive' (compound >= 0) or 'Negative'."""
    if scores['compound'] >= 0:
        return 'Positive'
    return 'Negative'


# Turn each score dict into a two-way label based on its compound value.
df['Sentiment'] = df['Sentiment_Scores'].apply(_label_sentiment)

# Show each message next to its label.
print(df[['message', 'Sentiment']])

In [None]:
# Bare expression: a notebook renders the value of a cell's last expression,
# so this displays the DataFrame including the new sentiment columns.
df

#### View the distribution of the sentiment

In [None]:
# Number of messages carrying each sentiment label.
rows_by_sentiment = df.groupby('Sentiment')
sentiment_counts = rows_by_sentiment.size()

# Show the totals per label.
print(sentiment_counts)

##### Visualize the data

In [None]:
# Number of messages carrying each sentiment label.
sentiment_counts = df.groupby('Sentiment').size()

# Pie chart of the label shares, annotated with percentages.
slice_colors = ['skyblue', 'lightcoral']
plt.figure(figsize=(6, 6))
plt.pie(
    sentiment_counts,
    labels=sentiment_counts.index,
    autopct='%1.1f%%',
    colors=slice_colors,
    startangle=140,
)
plt.title('Sentiment Distribution')
plt.axis('equal')  # equal aspect ratio so the pie is drawn as a circle

# Render the figure.
plt.show()

In [None]:
# Bare expression: a notebook renders the value of a cell's last expression,
# so this displays the final DataFrame.
df