In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
# Load WhatsApp chat exported as .txt
def load_chat(file_path):
    with open(file_path, encoding='utf-8') as f:
        lines = f.readlines()
    return lines

# Parse WhatsApp chat lines
def parse_chat(chat_lines):
    # Regex pattern to detect new message lines
    pattern = r'^(\d{1,2}/\d{1,2}/\d{2,4}), (\d{1,2}:\d{2}(?: ?[APMapm]{2})?) - (.*?): (.*)'

    messages = []
    current_msg = None

    for line in chat_lines:
        line = line.strip()

        # Check if line contains multiple timestamps (embedded message)
        embedded_matches = re.findall(pattern, line)
        if len(embedded_matches) > 1:
            # Split and process each embedded message separately
            for match in embedded_matches:
                date, time, sender, message = match
                messages.append({
                    'Date': date,
                    'Time': time,
                    'Sender': sender,
                    'Message': message
                })
            current_msg = None  # Reset
            continue

        # Normal case: single timestamp
        match = re.match(pattern, line)
        if match:
            # Save previous message
            if current_msg:
                messages.append(current_msg)
            date, time, sender, message = match.groups()
            current_msg = {
                'Date': date,
                'Time': time,
                'Sender': sender,
                'Message': message
            }
        else:
            # Continuation of previous message
            if current_msg:
                current_msg['Message'] += ' ' + line

    # Append last message
    if current_msg:
        messages.append(current_msg)

    return pd.DataFrame(messages)

In [12]:
chat_lines = load_chat("PandiBeulahWhatsup.txt")
chat_df = parse_chat(chat_lines)

In [14]:
chat_df

Unnamed: 0,Date,Time,Sender,Message
0,9/9/21,9:40 PM,Pandi,Hi Beulah
1,9/9/21,9:41 PM,Pandi,This is Karuppasamy Pandiyan
2,9/12/21,9:12 PM,Pandi,"Hi, I spoke with Pastor"
3,9/12/21,9:15 PM,Beulah,Ok
4,9/12/21,9:15 PM,Beulah,"If u r free, call me"
...,...,...,...,...
30859,9/19/25,10:21 AM,Beulah,Reached here
30860,9/19/25,10:21 AM,Pandi,Ok
30861,9/19/25,6:08 PM,Pandi,Raining here
30862,9/19/25,6:10 PM,Beulah,Super


In [16]:
# Label flirt messages manually or load pre-labeled dataset
def label_flirt_messages(df):
    flirt_keywords = ['baby', 'sweetheart', 'miss you', 'love', 'kiss', 'hot', 'cute', 'handsome','kiss','hug','date', 'cute',
                       'beautiful', 'sexy', 'hot','adorable','uma', 'darling',
                       'fuck','porn', 'x', 'sex', 'matter', 'nipple', 'virgin', 'sperm',
                       'seduce', 'condom','kk']
    df['Flirt'] = df['Message'].apply(lambda x: 1 if any(word in x.lower() for word in flirt_keywords) else 0)
    return df

# Train flirt prediction model
def train_flirt_model(df):
    vectorizer = TfidfVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['Message'])
    y = df['Flirt']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model = LogisticRegression()
    model.fit(X_train, y_train)
    print("Model Performance:\n", classification_report(y_test, model.predict(X_test)))
    return model, vectorizer

# Predict flirt messages
def predict_flirt(df, model, vectorizer):
    X = vectorizer.transform(df['Message'])
    df['Flirt_Predicted'] = model.predict(X)
    return df

In [18]:
chat_df = label_flirt_messages(chat_df)
model, vectorizer = train_flirt_model(chat_df)
chat_df = predict_flirt(chat_df, model, vectorizer)

Model Performance:
               precision    recall  f1-score   support

           0       0.97      1.00      0.98      5850
           1       1.00      0.40      0.57       323

    accuracy                           0.97      6173
   macro avg       0.98      0.70      0.78      6173
weighted avg       0.97      0.97      0.96      6173



In [20]:
# Talkative vs Less Talkative
def talkativeness(df):
    talk_counts = df['Sender'].value_counts()
    print("\nTalkativeness:\n", talk_counts)
    return talk_counts

In [22]:
 talkativeness(chat_df)


Talkativeness:
 Sender
Beulah    17911
Pandi     12953
Name: count, dtype: int64


Sender
Beulah    17911
Pandi     12953
Name: count, dtype: int64

In [24]:
# Most Active Day and Time
def activity_analysis(df):
    # Clean and parse date
    df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=False)

    # Normalize time strings (remove non-breaking spaces and fix AM/PM)
    df['Time'] = df['Time'].str.replace('\u202F', ' ', regex=True).str.replace('\xa0', ' ', regex=True)
    df['Hour'] = pd.to_datetime(df['Time'], format='%I:%M %p', errors='coerce').dt.hour

    # Drop rows with invalid dates or hours
    df = df.dropna(subset=['Date', 'Hour'])

    # Compute most active day and hour
    active_day = df['Date'].dt.day_name().value_counts().idxmax()
    active_hour = df['Hour'].value_counts().idxmax()

    print(f"\nMost Active Day: {active_day}")
    print(f"Most Active Hour: {active_hour}")
    return active_day, active_hour

In [26]:
activity_analysis(chat_df)


Most Active Day: Tuesday
Most Active Hour: 23


  df['Date'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=False)


('Tuesday', np.int32(23))

In [28]:
# Media Count
def media_count(df):
    media_tag = '<Media omitted>'
    media_stats = df[df['Message'] == media_tag]['Sender'].value_counts()
    print("\nMedia Count:\n", media_stats)
    return media_stats

In [30]:
 media_count(chat_df)


Media Count:
 Sender
Pandi     3307
Beulah    1723
Name: count, dtype: int64


Sender
Pandi     3307
Beulah    1723
Name: count, dtype: int64

In [32]:
# Missed Calls
def missed_calls(df):
    missed_call_keywords = ['missed voice call', 'missed video call']
    missed = df[df['Message'].str.lower().str.contains('|'.join(missed_call_keywords))]
    print("\nMissed Calls:\n", missed[['Date', 'Time', 'Sender', 'Message']])
    return missed

In [34]:
missed_calls(chat_df)


Missed Calls:
             Date      Time  Sender            Message
655   2021-10-01   3:02 PM  Beulah  Missed voice call
1673  2021-10-19  10:10 PM  Beulah  Missed voice call
1674  2021-10-19  10:11 PM  Beulah  Missed voice call
1675  2021-10-19  10:11 PM  Beulah  Missed voice call
1676  2021-10-19  11:18 PM  Beulah  Missed voice call
...          ...       ...     ...                ...
24077 2023-09-21   7:31 AM  Beulah  Missed video call
24102 2023-09-22   9:15 AM  Beulah  Missed video call
24126 2023-09-23   8:48 AM  Beulah  Missed video call
24262 2023-10-20  12:28 PM  Beulah  Missed video call
24307 2023-10-21   8:24 AM  Beulah  Missed video call

[299 rows x 4 columns]


Unnamed: 0,Date,Time,Sender,Message,Flirt,Flirt_Predicted,Hour
655,2021-10-01,3:02 PM,Beulah,Missed voice call,0,0,15
1673,2021-10-19,10:10 PM,Beulah,Missed voice call,0,0,22
1674,2021-10-19,10:11 PM,Beulah,Missed voice call,0,0,22
1675,2021-10-19,10:11 PM,Beulah,Missed voice call,0,0,22
1676,2021-10-19,11:18 PM,Beulah,Missed voice call,0,0,23
...,...,...,...,...,...,...,...
24077,2023-09-21,7:31 AM,Beulah,Missed video call,0,0,7
24102,2023-09-22,9:15 AM,Beulah,Missed video call,0,0,9
24126,2023-09-23,8:48 AM,Beulah,Missed video call,0,0,8
24262,2023-10-20,12:28 PM,Beulah,Missed video call,0,0,12
