## **WhatsApp Web Scraping using [simon](https://pypi.org/project/whatsapp-web/)**

- possibly helpful: https://geeknizer.com/read-extract-whatsapp-messages-android-iphone-blackberry/
- infinite scrolling: https://pyautogui.readthedocs.io/en/latest/mouse.html#mouse-scrolling

In [26]:
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import StaleElementReferenceException

from simon.accounts.pages import LoginPage
from simon.header.pages import HeaderPage
from simon.chat.pages import ChatPage
from simon.chats.pages import PanePage

from bs4 import BeautifulSoup
import pprint
import os, sys
import smtplib
import pandas as pd
import datetime

# Directory prefix (relative to the notebook) for the CSV files used below,
# e.g. PATH+"mgl_list.csv" and PATH+"sitzungen.csv".
PATH = "data/"

def login(executable_path=r"C:\\Users\\jakob\\chromedriver.exe"):
    """Start a Chrome session, load the simon ``LoginPage`` and wait for the QR scan.

    Parameters
    ----------
    executable_path : str, optional
        Location of the chromedriver binary. The default is the path that used
        to be hard-coded here, so a plain ``login()`` call behaves as before;
        pass a different path to run the notebook on another machine.

    Returns
    -------
    selenium.webdriver.Chrome
        The driver instance that was handed to ``LoginPage``.
    """
    driver = webdriver.Chrome(executable_path=executable_path)
    time.sleep(2)  # Allow 2 seconds for the browser window to open

    # LoginPage.load() presumably navigates to https://web.whatsapp.com/
    # (no explicit driver.get() is issued here) -- confirm against simon.
    login_page = LoginPage(driver)
    login_page.load()

    # Block until the user confirms in the notebook that the QR code was scanned.
    input("Please use the QR-Code")
    return driver

def get_delta_chat(driver, chat_name="DELTA"):
    """Find the first chat-list entry whose text contains ``chat_name`` and click it.

    Parameters
    ----------
    driver : selenium webdriver
        Driver showing the WhatsApp Web chat list.
    chat_name : str, optional
        Substring to look for in each entry's visible text. Defaults to
        ``"DELTA"``, the value that used to be hard-coded.

    Returns
    -------
    WebElement
        The matching chat-list element, after it was scrolled into view and
        clicked.

    Raises
    ------
    LookupError
        If no entry contains ``chat_name``. Previously this case failed with an
        ``UnboundLocalError`` on the click.
    """
    # "_210SC" is the class name the chat-list entries are selected by; it is
    # presumably generated by WhatsApp Web and may change -- verify when the
    # lookup starts returning nothing.
    all_chats = driver.find_elements_by_class_name("_210SC")

    for chat in all_chats:
        # WebElement.text is already the rendered text, so it is searched
        # directly instead of being passed through an HTML parser first.
        if chat_name in chat.text:
            print(f"Found {chat_name}")
            driver.execute_script("arguments[0].scrollIntoView();", chat)
            # click to focus
            chat.click()
            return chat

    raise LookupError(f"No chat containing {chat_name!r} found in the chat list")

def scroll(n_scrolls):
    """Send a burst of five UP key presses ``n_scrolls`` times, sleeping 1 s after each.

    The key presses are issued through an ``ActionChains`` built on the
    notebook-level ``driver`` variable (it is not a parameter of this function).
    """
    for _ in range(n_scrolls):
        ActionChains(driver).send_keys(Keys.UP*5).perform()
        time.sleep(1)
    
def create_datetime_from_msg(msg):
    """Parse the timestamp out of a message's raw "datetime and contact" header.

    The header returned by ``msg.raw_datetime_and_contact()`` is parsed as
    ``"[HH:MM, DD.MM.YYYY] <contact>"``.

    Parameters
    ----------
    msg : object
        Anything exposing ``raw_datetime_and_contact()``.

    Returns
    -------
    datetime.datetime
        The parsed time and date.

    Raises
    ------
    AttributeError
        If ``raw_datetime_and_contact()`` does not return a string (callers in
        this notebook catch this to skip such messages).
    ValueError
        If the header does not match the expected layout.
    """
    time_and_autor = msg.raw_datetime_and_contact()

    # Split only at the FIRST "] ": the contact part may itself contain "] ",
    # which previously raised "too many values to unpack".
    t, _contact = time_and_autor.split("] ", 1)
    # t[1:] drops the leading "[" before separating time and date.
    time_, date = t[1:].split(", ", 1)

    return datetime.datetime.strptime(time_+","+date, "%H:%M,%d.%m.%Y")
        
def check_if_msg_already_in_df(msgs, dt_last_in_df):
    """Scan ``msgs`` for a message whose timestamp equals ``dt_last_in_df``.

    Parameters
    ----------
    msgs : iterable
        Messages accepted by ``create_datetime_from_msg``.
    dt_last_in_df : datetime.datetime
        Timestamp of the last message already stored.

    Returns
    -------
    (bool, int)
        ``stop`` is True when a message with that timestamp was found.
        ``counter`` is the number of messages with a parsable timestamp that
        were seen before the match (or all of them when there is no match).
        For empty input, or when every message is skipped, the result is
        ``(False, 0)``; previously this raised ``UnboundLocalError``.
    """
    counter = 0
    for msg in msgs:
        try:
            dt = create_datetime_from_msg(msg)
        except AttributeError:
            # No parsable timestamp header on this message: skip it.
            continue

        if dt == dt_last_in_df:
            print(f"Message on {dt.strftime('%H:%M, %d:%m:%y')} already in database -> break!")
            return True, counter
        counter += 1
    return False, counter

def merge_dataframes(msgs, chat_df, dt_last_in_df, save_=True):
    """Append the messages that precede the already-stored one to ``chat_df``.

    ``msgs`` is walked in order until a message with timestamp
    ``dt_last_in_df`` is reached; every message before it becomes one row with
    the columns ``Time``, ``Date``, ``Author``, ``Text`` and ``Text_lower``.
    Messages whose timestamp or content cannot be read are skipped (before,
    such a message re-appended the previous row, or raised ``NameError`` when
    it was the first one).

    Parameters
    ----------
    msgs : iterable
        Messages exposing ``text`` and ``contact`` and accepted by
        ``create_datetime_from_msg``.
    chat_df : pandas.DataFrame
        Already stored chat rows.
    dt_last_in_df : datetime.datetime
        Timestamp of the last stored message; iteration stops there.
    save_ : bool, optional
        When True, write the merged frame to ``PATH + "chat_<dd_mm_yy>.csv"``.

    Returns
    -------
    pandas.DataFrame
        ``chat_df`` followed by the new rows in reversed collection order.
    """
    rows = []
    for msg in msgs:
        try:
            dt = create_datetime_from_msg(msg)
        except Exception:
            # Best effort, as before: messages without a readable timestamp
            # are ignored instead of aborting the whole merge.
            continue

        if dt == dt_last_in_df:
            print(f"Message on {dt.strftime('%H:%M, %d:%m:%y')} already in database -> break!")
            break

        try:
            text = msg.text
            if text is None:
                # Messages without text are stored with a placeholder.
                text = "pic-or-gif"
            row = dict(Time=dt.strftime("%H:%M"), Date=dt.strftime("%d.%m.%y"), Author=msg.contact, Text=text, Text_lower=text.lower())
        except Exception:
            continue
        rows.append(row)

    df_ = pd.DataFrame(rows)
    # The collected rows are reversed before appending; this assumes msgs is
    # ordered newest-first so the result ends up chronological -- TODO confirm.
    df_c = pd.concat([chat_df, df_.sort_index(ascending=False)], ignore_index=True)
    if save_:
        fname = f"chat_{datetime.datetime.now().strftime('%d_%m_%y')}.csv"
        target = PATH+fname
        print("Save Chat to ", target)
        # Write to the location that is announced (previously the file was
        # written to the working directory while the log showed PATH+fname).
        df_c.to_csv(target)
    return df_c

In [15]:
# Load the stored chat and build a datetime from the Time/Date of its last row
chat_df = pd.read_csv("data/chat_04_10_20.csv", index_col="idx")
col_names = chat_df.columns
last_message_time_and_date = tuple(chat_df.iloc[-1][["Time", "Date"]])
datetime_lm = datetime.datetime.strptime(",".join(last_message_time_and_date), "%H:%M,%d.%m.%y")

In [16]:
# Log in to WhatsApp Web; login() blocks on an input() prompt until the
# QR-code scan is confirmed in the notebook.
driver = login()
time.sleep(1)  # short pause before the next cell uses the page

Please use the QR-Code


In [17]:
# Locate the chat-list entry containing "DELTA" and click it.
delta_chat = get_delta_chat(driver=driver)
time.sleep(0.1)

Found DELTA


In [18]:
def parse_chat(driver, datetime_lm, chat_df):
    """Scroll the open chat until the last stored message shows up, then merge and save.

    Parameters
    ----------
    driver : selenium webdriver
        Driver with the target chat already opened.
    datetime_lm : datetime.datetime
        Timestamp of the last message already present in ``chat_df``.
    chat_df : pandas.DataFrame
        Previously stored chat rows.

    Returns
    -------
    pandas.DataFrame
        Result of ``merge_dataframes`` (which is called with ``save_=True``).

    Notes
    -----
    ``scroll()`` acts on the notebook-level ``driver`` variable, so that
    variable must refer to the same browser as the ``driver`` argument.
    The loop has no upper bound: it keeps scrolling until a message with
    timestamp ``datetime_lm`` is found.
    """
    # find any message and click it -- presumably to move keyboard focus into
    # the conversation so the UP keys sent by scroll() act on it.
    # NOTE(review): the absolute XPath is tied to the page layout at the time
    # of writing and will break when the DOM changes.
    msg = driver.find_element_by_xpath("/html/body/div[1]/div/div/div[4]/div/div[3]/div/div/div[3]/div[13]/div")
    msg.click()
    time.sleep(1)

    # scroll through the chat and check if the messages are already in the dataframe
    stop_scrolling = False
    while not stop_scrolling:
        scroll(n_scrolls=5)
        # use simon to extract all messages
        chatPage = ChatPage(driver)
        msgs = chatPage.messages.all()

        stop_scrolling, counter = check_if_msg_already_in_df(msgs, dt_last_in_df=datetime_lm)
        print("MESSAGE COUNTER AT ".ljust(50, "="), counter)
    time.sleep(1)

    # generate the new dataframe and save it
    df = merge_dataframes(msgs=msgs, chat_df=chat_df, dt_last_in_df=datetime_lm, save_=True)
    return df

Message on 10:40, 04:10:20 already in database -> break!
Message on 10:40, 04:10:20 already in database -> break!
Save Chat to  chat_29_10_20.csv


In [27]:
# Load the list whose "Email" column is used as the recipient list further below.
mgl = pd.read_csv(PATH+"mgl_list.csv")


In [29]:
# Print every address in the "Email" column.
# NOTE(review): the output of this cell contains personal e-mail addresses;
# clear it before sharing the notebook.
for email in mgl["Email"]:
    print(email)

patzegg@gmail.com
cooper_@gmx.at
p.gerges21@gmail.com
stefannego93@gmail.com
weberjakob64@gmail.com


In [43]:
# Scratch version of check_sitzungen() with a fixed reference date (2020-12-02)
df_s = pd.read_csv(PATH+"sitzungen.csv")
df_s["Date"] = pd.to_datetime(df_s["Date"], format="%d.%m.%y")
date = datetime.datetime(2020, 12, 2)
new_dates = df_s.loc[df_s["Date"] > date]

In [52]:
# Show the contents of sitzungen.csv with the "Date" column parsed to datetimes.
df_s

Unnamed: 0,Date,Type,Chair
0,2019-06-17,Rat,Jakob
1,2019-09-11,Rat,Peter
2,2019-11-22,Rat,Andre
3,2020-03-13,Rat,Nego
4,2020-06-06,Rat,Jakob
5,2020-09-16,Rat,Peter
6,2020-11-04,Budget,Andre


In [59]:
import time
import datetime

def check_sitzungen():
    """Return the rows of ``PATH + "sitzungen.csv"`` whose date is later than now.

    The ``Date`` column is parsed with the format ``%d.%m.%y``. When at least
    one such row exists, a short notice is printed.

    Returns
    -------
    pandas.DataFrame
        The rows with ``Date`` after the current time (possibly empty).
    """
    sessions = pd.read_csv(PATH+"sitzungen.csv")
    sessions["Date"] = pd.to_datetime(sessions["Date"], format="%d.%m.%y")

    now = datetime.datetime.now()
    upcoming = sessions.loc[sessions["Date"] > now]

    if len(upcoming) > 0:
        print("We have planned meetings!")
    return upcoming
    
def send_mails(dates, send_to=False):
    """Send a reminder mail about the meeting in the last row of ``dates``.

    Parameters
    ----------
    dates : pandas.DataFrame
        Rows with at least a ``Type`` column and a datetime ``Date`` column
        (e.g. the return value of ``check_sitzungen()``). The LAST row is the
        one announced in the mail.
    send_to : str or False, optional
        A single recipient address. When falsy, the mail goes to every address
        in the ``Email`` column of ``PATH + "mgl_list.csv"``.

    Notes
    -----
    * The mail text is built from the ``dates`` argument; it used to read the
      notebook-level variable ``new`` and ignored the argument.
    * The password is read with ``getpass`` so it is not echoed into the
      notebook output.
    * Nothing is sent (and no password is requested) when ``dates`` is empty.
    """
    # Local import keeps this cell self-contained; getpass is standard library.
    import getpass

    if dates.empty:
        print("No planned meetings - nothing to send.")
        return

    sender = "weberjakob64@gmail.com"

    # NOTE(review): iloc[-1] announces the last row; with several upcoming
    # meetings this may not be the nearest one -- confirm the intended row.
    meeting = dates.iloc[-1]
    body = f"Subject: Reminder: Naechste Sitzung. Die naechste Sitzung ist von Type {meeting['Type']} am {meeting['Date'].strftime('%d.%m.%y')}"

    if send_to:
        recipients = [send_to]
    else:
        recipients = list(pd.read_csv(PATH+"mgl_list.csv")["Email"])

    # The context manager closes the connection even if login or sending fails.
    with smtplib.SMTP("smtp.gmail.com", 587) as smtpObj:
        smtpObj.ehlo()
        smtpObj.starttls()
        pw = getpass.getpass("Insert Password")
        smtpObj.login(sender, password=pw)

        for email in recipients:
            print("Sending email to %s..."%email)
            sendmailStatus = smtpObj.sendmail(sender, email, body.encode("utf-8"))
            # sendmail() returns a dict of refused recipients; empty means success.
            if sendmailStatus != {}:
                print(f"There was a problem sending mail to {email}:{sendmailStatus}")
    print("Finished")
    

In [60]:
# Rows of sitzungen.csv dated after the current time.
new = check_sitzungen()

We have planned meetings!


In [61]:
# Show the rows returned by check_sitzungen().
new

Unnamed: 0,Date,Type,Chair
6,2020-11-04,Budget,Andre


In [62]:
# Passing send_to sends the reminder to this single address only; the
# addresses in mgl_list.csv are not contacted.
send_mails(dates=new, send_to="jakobweber@hotmail.com")

Insert Password[REDACTED]
Sending email to jakobweber@hotmail.com...
Finished
