# junk_checker

> This module checks the age of the domain behind an email address, as a heuristic for spotting junk mail.

In [None]:
# Addresses from the last year that I have replied to. They are by definition good.
with open('../good_emails.txt', 'r', encoding='utf-8') as f:
    # Split on commas, trim surrounding whitespace/newlines and drop empty
    # entries (the stored text can start with a separator, which would
    # otherwise produce a '' address).
    good_emails = [addr.strip() for addr in f.read().split(',') if addr.strip()]

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
import whois
from datetime import datetime
import tldextract

In [None]:
#| export
def extract_domain(email):
    """Return the ``domain.suffix`` part of the host in *email*.

    The text following the first ``@`` is passed to ``tldextract.extract``
    and only its ``domain`` and ``suffix`` fields are kept, so subdomain
    labels are dropped.

    Raises:
        IndexError: if *email* contains no ``@``.
    """
    host = email.split('@')[1]
    # Reduce the host to its main domain, discarding any subdomain.
    parts = tldextract.extract(host)
    return ".".join((parts.domain, parts.suffix))

In [None]:
#| export
# Domain-age lookup. Less useful than expected because not every domain
# reveals its creation date via WHOIS. Nevertheless, it can be used to weed
# out addresses whose domain was created very recently.
def get_domain_age(email):
    """Return the age, in years, of the domain behind *email*.

    Args:
        email: Full email address; the domain is extracted internally.

    Returns:
        float: days since the WHOIS creation date, divided by 365.
        None: the WHOIS record carries no usable creation date.
        str: the error message, if the WHOIS lookup itself raised.

    Raises:
        IndexError: propagated from ``extract_domain`` when *email* has no ``@``.
    """
    domain = extract_domain(email)
    try:
        w = whois.whois(domain)
        creation_date = w.creation_date
        # Some records list several timestamps; use the first, and treat an
        # empty list like a missing value instead of hitting IndexError.
        if isinstance(creation_date, list):
            creation_date = creation_date[0] if creation_date else None
        # Missing, or not a datetime (presumably a value the WHOIS parser
        # could not convert - TODO confirm): there is no age to compute.
        if not isinstance(creation_date, datetime):
            return None
        # Use a "now" with the same timezone awareness as the record;
        # subtracting a naive from an aware datetime raises TypeError.
        now = datetime.now(creation_date.tzinfo)
        return (now - creation_date).days / 365
    except Exception as e:
        # Best effort: report the lookup failure as text rather than raising.
        return str(e)

In [None]:
# Example: age in years of the domain of a sample address.
get_domain_age("test@cognition-behaviour.com")

2.213698630136986

In [None]:
#| export
def check_domain_age(email):
    """Return the age in years of the domain behind *email*.

    Thin wrapper around ``get_domain_age``; its result is returned unchanged
    (a float, ``None`` when no creation date is available, or an error string).

    Args:
        email: Full email address (must contain ``@``).
    """
    # get_domain_age extracts the domain itself, so it must receive the full
    # address. Handing it an already-extracted domain makes its
    # split('@')[1] raise IndexError on every call.
    return get_domain_age(email)

In [None]:
# Example with a .de address; the recorded output below is a socket error message.
check_domain_age("hilmar.zech@tu-dresden.de")

tu-dresden.de
Error trying to connect to socket: closing socket - [Errno 8] nodename nor servname provided, or not known


In [None]:
# Raw WHOIS record for a .com domain: creation_date comes back as a list of datetimes.
whois.whois("cognition-behaviour.com")

{'domain_name': ['COGNITION-BEHAVIOUR.COM', 'cognition-behaviour.com'],
 'registrar': 'GoDaddy.com, LLC',
 'whois_server': 'whois.godaddy.com',
 'referral_url': None,
 'updated_date': [datetime.datetime(2023, 7, 3, 5, 8, 52),
  datetime.datetime(2023, 7, 3, 0, 8, 50)],
 'creation_date': [datetime.datetime(2021, 7, 7, 9, 36, 53),
  datetime.datetime(2021, 7, 7, 4, 36, 53)],
 'expiration_date': [datetime.datetime(2024, 7, 7, 9, 36, 53),
  datetime.datetime(2024, 7, 7, 4, 36, 53)],
 'name_servers': ['NS41.DOMAINCONTROL.COM', 'NS42.DOMAINCONTROL.COM'],
 'status': ['clientDeleteProhibited https://icann.org/epp#clientDeleteProhibited',
  'clientRenewProhibited https://icann.org/epp#clientRenewProhibited',
  'clientTransferProhibited https://icann.org/epp#clientTransferProhibited',
  'clientUpdateProhibited https://icann.org/epp#clientUpdateProhibited'],
 'emails': 'abuse@godaddy.com',
 'dnssec': 'unsigned',
 'name': 'Registration Private',
 'org': 'Domains By Proxy, LLC',
 'address': ['Domai

In [None]:
# Raw WHOIS record for a .de domain: the recorded output has updated_date but no creation_date.
whois.whois("ukw.de")

{'domain_name': 'ukw.de',
 'status': 'connect',
 'updated_date': datetime.datetime(2022, 1, 25, 14, 26, 42),
 'name': None,
 'org': None,
 'address': None,
 'registrant_postal_code': None,
 'city': None,
 'country_code': None,
 'phone': None,
 'fax': None,
 'name_servers': ['dns-1.dfn.de', 'dns-3.dfn.de'],
 'emails': None}

In [None]:
# Another .de domain: again the recorded output has no creation_date field.
whois.whois("tu-dresden.de")

{'domain_name': 'tu-dresden.de',
 'status': 'connect',
 'updated_date': datetime.datetime(2018, 1, 15, 17, 13, 51),
 'name': None,
 'org': None,
 'address': None,
 'registrant_postal_code': None,
 'city': None,
 'country_code': None,
 'phone': None,
 'fax': None,
 'name_servers': ['adns1.zih.tu-dresden.de 141.30.67.53',
  'adns2.zih.tu-dresden.de 141.76.32.53',
  'dns-1.dfn.de',
  'dns-3.dfn.de'],
 'emails': None}

In [None]:
# Query with a subdomain: the recorded record is for leidenuniv.nl and creation_date is a single datetime.
whois.whois("fsw.leidenuniv.nl")

{'domain_name': 'leidenuniv.nl',
 'expiration_date': None,
 'updated_date': datetime.datetime(2023, 6, 29, 0, 0),
 'creation_date': datetime.datetime(1988, 10, 27, 0, 0),
 'status': 'active',
 'registrar': 'team.blue nl B.V.',
 'registrar_address': 'Vondellaan 47',
 'registrar_postal_code': '2332AA',
 'registrar_city': 'Leiden',
 'registrar_country': 'Netherlands',
 'dnssec': 'yes',
 'name_servers': ['ns1.surfnet.nl', 'ns2.surfnet.nl', 'ns3.surfnet.nl']}

In [None]:
#| export
def likely_spam(email, max_age_years=3):
    """Return True when the domain of *email* is younger than *max_age_years*.

    Args:
        email: Full email address.
        max_age_years: Age threshold in years; domains strictly younger than
            this are flagged. Defaults to 3.

    Returns:
        bool: ``True`` only when a numeric domain age is available and below
        the threshold. An unknown age (``None`` or an error string from the
        lookup) is not treated as spam.
    """
    age_in_years = check_domain_age(email)
    # The lookup can hand back None or an error string; comparing those with
    # a number raises TypeError, so only numeric ages are compared.
    if not isinstance(age_in_years, (int, float)):
        return False
    return age_in_years < max_age_years
# NOTE(review): this call sits in a cell marked `#| export`, so it is presumably
# copied into the generated module and would run a WHOIS lookup at import time -
# confirm, and move it to a non-exported cell if so.
likely_spam("jack_nathan@cognition-behaviour.com")

True

In [None]:
# Same call as in the previous cell; the two recorded outputs differ (True vs. False),
# presumably because they were captured at different times or code versions - TODO confirm.
likely_spam("jack_nathan@cognition-behaviour.com")

False

## Testing

In [None]:
#| export
import imaplib
import email
from email.header import decode_header
from datetime import datetime
import os

In [None]:
#| export
class EmailObject:
    """Plain record for one fetched message: address header, subject and IMAP id."""

    def __init__(self, sender, subject, email_id):
        self.sender = sender
        self.subject = subject
        self.email_id = email_id

    def __repr__(self):
        return (
            f"{type(self).__name__}(sender={self.sender!r}, "
            f"subject={self.subject!r}, email_id={self.email_id!r})"
        )


class EmailClient:
    """Minimal IMAP-over-SSL client that lists the messages of one mailbox."""

    # IMAP search dates need English month abbreviations; strftime('%b')
    # follows the process locale, so a fixed table is used instead.
    _IMAP_MONTHS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    def __init__(self, server, port, username, password):
        """Connect to *server*:*port* over SSL and log in."""
        self.mail = imaplib.IMAP4_SSL(server, port)
        self.mail.login(username, password)

    @staticmethod
    def _decode_subject(raw_subject):
        """Return a Subject header value as text; a missing header gives ''."""
        if raw_subject is None:
            return ""
        pieces = []
        # A header may consist of several chunks with different charsets;
        # decode each one and concatenate them.
        for chunk, charset in decode_header(raw_subject):
            if isinstance(chunk, bytes):
                try:
                    chunk = chunk.decode(charset or "utf-8", errors="replace")
                except LookupError:
                    # Charset name unknown to Python: fall back to UTF-8.
                    chunk = chunk.decode("utf-8", errors="replace")
            pieces.append(chunk)
        return "".join(pieces)

    def get_emails(self, since_date=None, mailbox='"Sent Items"'):
        """Return an ``EmailObject`` for every message found in *mailbox*.

        Args:
            since_date: Optional date/datetime; when given, only messages
                matching the IMAP ``SINCE`` criterion are returned.
            mailbox: Mailbox name passed to ``select``. Defaults to the
                quoted ``"Sent Items"`` folder.

        Returns:
            list[EmailObject]: one entry per message that could be fetched.
            ``sender`` holds the raw ``To`` header, ``subject`` the decoded
            subject ('' if absent) and ``email_id`` the id used for the fetch.
        """
        self.mail.select(mailbox)

        if since_date:
            month = self._IMAP_MONTHS[since_date.month - 1]
            criteria = f"SINCE {since_date.day:02d}-{month}-{since_date.year}"
        else:
            criteria = "ALL"
        status, messages = self.mail.search(None, criteria)

        email_objects = []
        for e_id in messages[0].split():
            status, msg_data = self.mail.fetch(e_id, "(RFC822)")
            # The message bytes are the second item of a leading tuple;
            # anything else means there is nothing to parse for this id.
            if not msg_data or not isinstance(msg_data[0], tuple):
                continue
            msg = email.message_from_bytes(msg_data[0][1])

            subject = self._decode_subject(msg["Subject"])

            # Stored under "sender", but read from the "To" header: with the
            # default Sent Items mailbox this is the person written to.
            sender = msg["To"]

            email_objects.append(EmailObject(sender, subject, e_id))

        return email_objects

    def logout(self):
        """Log out from the IMAP server."""
        self.mail.logout()

In [None]:
from email.utils import parseaddr

In [None]:


import ssl
import OpenSSL
def check_new(mail):
    """Return True when no TLS certificate can be fetched for the domain of *mail*.

    The main domain of the address is contacted on port 443. Any failure
    (address without ``@``, DNS error, refused connection, TLS error, ...)
    counts as "new" and yields ``True``; a retrieved certificate yields
    ``False``. The certificate contents are not inspected.

    Args:
        mail: Full email address.
    """
    try:
        # Use the extract_domain helper defined in this file. The name called
        # here before, `extract_main_domain`, is not defined in any visible
        # cell, so the resulting NameError was swallowed and every address
        # was reported as new.
        domain = extract_domain(mail)
        ssl.get_server_certificate((domain, 443))
    except Exception:
        # Deliberate best effort, but no longer a bare `except`, so that
        # KeyboardInterrupt can still stop a long run.
        return True
    return False

In [None]:
# Direct probe of one host whose address check_new printed; the recorded result is a refused connection.
ssl.get_server_certificate(("bham.ac.uk", 443))

ConnectionRefusedError: [Errno 61] Connection refused

In [None]:
# Serialize the address list with the same ', ' separator that the first cell splits on.
', '.join(good_emails)

', justin.hachenberger@uni-bielefeld.de, OWOODSON@PARTNERS.ORG, morgenstern@ift-nord.de, anika.busch@stud-mail.uni-wuerzburg.de, julia.goestl1@tu-dresden.de, scholz_v@ukw.de, a.f.van.meer@fsw.leidenuniv.nl, Aster_H@ukw.de, Gregorova_K@ukw.de, bounce+unsubscribe_63a37d357b3c5879176835@expertsengage.org, Miriam.Sebold@th-ab.de, l.p.v.jacobs.2@umail.leidenuniv.nl, WenzelJ@rki.de, johannes.steffen@tu-dresden.de, Waltmann_M@ukw.de, burhan.ok@ub.ac.id, Tina.Waschke@zi-mannheim.de, XGE.Coverage@apa.org, poststelle-wue@lff.bayern.de, timpm@udel.edu, zech_h@ukw.de, dillenlfvan@fsw.leidenuniv.nl, sophie.richter5@tu-dresden.de, Bernd.Lenz@zi-mannheim.de, hans_benedikt.wolf@tu-dresden.de, CRetzlaff@apa.org, marco.meixner@tu-dresden.de, chris.payne@quantum-soup.com, n.kurtenbach@fsw.leidenuniv.nl, lottevandillen@gmail.com, l.zhang.13@bham.ac.uk, Bjoern.Gerlach@zi-mannheim.de, melanie.klapprott@uni-oldenburg.de, gianna.spitta@charite.de, d.s.veldhuijzen@fsw.leidenuniv.nl, a.skvortsova@fsw.leidenuniv

In [None]:
#| notest

USERNAME = os.environ.get("EXCHANGE_USER")
PASSWORD = os.environ.get("EXCHANGE_PASSWORD")

# Usage example
email_client = EmailClient("msx.tu-dresden.de", 993, USERNAME, PASSWORD)

try:
    # Fetch emails since a specific date
    since_date = datetime(2022, 8, 22)  # Replace with actual date
    emails = email_client.get_emails(since_date=since_date)
finally:
    # Logout to close the connection, even when fetching raised
    email_client.logout()

# Collect the unique addresses. parseaddr gives '' when the header holds no
# parsable address; those entries are dropped. Sorting makes the output stable.
good_emails = sorted(
    {addr for addr in (parseaddr(e.sender)[1] for e in emails) if addr}
)
print(good_emails)

['', 'justin.hachenberger@uni-bielefeld.de', 'OWOODSON@PARTNERS.ORG', 'morgenstern@ift-nord.de', 'anika.busch@stud-mail.uni-wuerzburg.de', 'julia.goestl1@tu-dresden.de', 'scholz_v@ukw.de', 'a.f.van.meer@fsw.leidenuniv.nl', 'Aster_H@ukw.de', 'Gregorova_K@ukw.de', 'bounce+unsubscribe_63a37d357b3c5879176835@expertsengage.org', 'Miriam.Sebold@th-ab.de', 'l.p.v.jacobs.2@umail.leidenuniv.nl', 'WenzelJ@rki.de', 'johannes.steffen@tu-dresden.de', 'Waltmann_M@ukw.de', 'burhan.ok@ub.ac.id', 'Tina.Waschke@zi-mannheim.de', 'XGE.Coverage@apa.org', 'poststelle-wue@lff.bayern.de', 'timpm@udel.edu', 'zech_h@ukw.de', 'dillenlfvan@fsw.leidenuniv.nl', 'sophie.richter5@tu-dresden.de', 'Bernd.Lenz@zi-mannheim.de', 'hans_benedikt.wolf@tu-dresden.de', 'CRetzlaff@apa.org', 'marco.meixner@tu-dresden.de', 'chris.payne@quantum-soup.com', 'n.kurtenbach@fsw.leidenuniv.nl', 'lottevandillen@gmail.com', 'l.zhang.13@bham.ac.uk', 'Bjoern.Gerlach@zi-mannheim.de', 'melanie.klapprott@uni-oldenburg.de', 'gianna.spitta@chari

In [None]:
# NOTE(review): `updated_date` is not defined in any visible cell; this looks like
# a leftover scratch cell - confirm and remove.
updated_date

In [None]:
# Print every known-good address that check_new flags (i.e. its false positives).
for good in good_emails:
    try:
        if check_new(good):
            print(good)
    except Exception:
        # `except Exception` rather than a bare `except`, so Ctrl-C can still
        # interrupt this slow, network-bound loop.
        print("Failed: "+good)


bounce+unsubscribe_63a37d357b3c5879176835@expertsengage.org
chris.payne@quantum-soup.com
l.zhang.13@bham.ac.uk
mareike.roettger@stud.sbg.ac.at
1axcw14lvvo21r4ihvrx6y2e5z9e52qj4mirsk@bf05x.hubspotemail.net
info@kopieteam.de
unsubscribe@sa49.scsend.com
journals@psychonomic.org
f.fedeli8@campus.unimib.it


In [None]:
# Print every known-good address that likely_spam flags (i.e. its false positives).
for good in good_emails:
    try:
        if likely_spam(good):
            print(good)
    except Exception:
        # `except Exception` rather than a bare `except`, so Ctrl-C can still
        # interrupt this slow, network-bound loop.
        print("Failed: "+good)

Failed: 
justin.hachenberger@uni-bielefeld.de
scholz_v@ukw.de
Aster_H@ukw.de
Gregorova_K@ukw.de
bounce+unsubscribe_63a37d357b3c5879176835@expertsengage.org
Miriam.Sebold@th-ab.de
Waltmann_M@ukw.de
Failed: timpm@udel.edu
zech_h@ukw.de
gianna.spitta@charite.de
mareike.roettger@stud.sbg.ac.at
Philipp_C@ukw.de
annidittrich@t-online.de
dominic.reichert@ruhr-uni-bochum.de
irina.baskow@charite.de
Scholz_V@ukw.de
shuyan.liu@charite.de
info@kopieteam.de
Markus.Reichert@ruhr-uni-bochum.de
melissa.halil@charite.de
Failed: eran.eldar@mail.huji.ac.il
sercan.kahveci@plus.ac.at
wiko-psychiatrie-ccm@charite.de
Dominic.Reichert@ruhr-uni-bochum.de
Sauter_C@ukw.de
organisation@pug2023.de
maria.garbusow@charite.de
bounce+unsubscribe_64c0018183cde569558222@renownedspeakers.com
Hoehn_U@ukw.de
info@xon-eeg.com
Failed: alon.erdman@mail.huji.ac.il
Failed: l.p.hilbert@fsw.leidenuniv.nl
yuliya.kovalchuk@charite.de
Failed: MKUHN@MCLEAN.HARVARD.EDU
sabrina.doerr@charite.de
Failed: stenner@nicht.dienstli.ch
Failed:

In [None]:
#| hide
# Run nbdev's export step for this notebook (cells marked `#| export`).
import nbdev; nbdev.nbdev_export()

In [None]:
#| export
class EmailObject:
    """Plain record for one fetched message: address header, subject and IMAP id."""

    def __init__(self, sender, subject, email_id):
        self.sender = sender
        self.subject = subject
        self.email_id = email_id

    def __repr__(self):
        return (
            f"{type(self).__name__}(sender={self.sender!r}, "
            f"subject={self.subject!r}, email_id={self.email_id!r})"
        )


class EmailClient:
    """Minimal IMAP-over-SSL client that lists the messages of one mailbox."""

    # IMAP search dates need English month abbreviations; strftime('%b')
    # follows the process locale, so a fixed table is used instead.
    _IMAP_MONTHS = ("Jan", "Feb", "Mar", "Apr", "May", "Jun",
                    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec")

    def __init__(self, server, port, username, password):
        """Connect to *server*:*port* over SSL and log in."""
        self.mail = imaplib.IMAP4_SSL(server, port)
        self.mail.login(username, password)

    @staticmethod
    def _decode_subject(raw_subject):
        """Return a Subject header value as text; a missing header gives ''."""
        if raw_subject is None:
            return ""
        pieces = []
        # A header may consist of several chunks with different charsets;
        # decode each one and concatenate them.
        for chunk, charset in decode_header(raw_subject):
            if isinstance(chunk, bytes):
                try:
                    chunk = chunk.decode(charset or "utf-8", errors="replace")
                except LookupError:
                    # Charset name unknown to Python: fall back to UTF-8.
                    chunk = chunk.decode("utf-8", errors="replace")
            pieces.append(chunk)
        return "".join(pieces)

    def get_emails(self, since_date=None, mailbox='"Sent Items"'):
        """Return an ``EmailObject`` for every message found in *mailbox*.

        Args:
            since_date: Optional date/datetime; when given, only messages
                matching the IMAP ``SINCE`` criterion are returned.
            mailbox: Mailbox name passed to ``select``. Defaults to the
                quoted ``"Sent Items"`` folder.

        Returns:
            list[EmailObject]: one entry per message that could be fetched.
            ``sender`` holds the raw ``To`` header, ``subject`` the decoded
            subject ('' if absent) and ``email_id`` the id used for the fetch.
        """
        self.mail.select(mailbox)

        if since_date:
            month = self._IMAP_MONTHS[since_date.month - 1]
            criteria = f"SINCE {since_date.day:02d}-{month}-{since_date.year}"
        else:
            criteria = "ALL"
        status, messages = self.mail.search(None, criteria)

        email_objects = []
        for e_id in messages[0].split():
            status, msg_data = self.mail.fetch(e_id, "(RFC822)")
            # The message bytes are the second item of a leading tuple;
            # anything else means there is nothing to parse for this id.
            if not msg_data or not isinstance(msg_data[0], tuple):
                continue
            msg = email.message_from_bytes(msg_data[0][1])

            subject = self._decode_subject(msg["Subject"])

            # Stored under "sender", but read from the "To" header: with the
            # default Sent Items mailbox this is the person written to.
            sender = msg["To"]

            email_objects.append(EmailObject(sender, subject, e_id))

        return email_objects

    def logout(self):
        """Log out from the IMAP server."""
        self.mail.logout()



# NOTE(review): these statements share a cell marked `#| export` with the class
# definitions, so they are presumably copied into the generated module and would
# log in to the mail server at import time - confirm and move to a separate cell.
USERNAME = os.environ.get("EXCHANGE_USER")
PASSWORD = os.environ.get("EXCHANGE_PASSWORD")

# Usage example
email_client = EmailClient("msx.tu-dresden.de", 993, USERNAME, PASSWORD)

try:
    # Fetch emails since a specific date
    since_date = datetime(2022, 8, 22)  # Replace with actual date
    emails = email_client.get_emails(since_date=since_date)
finally:
    # Logout to close the connection, even when fetching raised
    email_client.logout()

# Collect the unique addresses. parseaddr gives '' when the header holds no
# parsable address; those entries are dropped. Sorting makes the output stable.
good_emails = sorted(
    {addr for addr in (parseaddr(e.sender)[1] for e in emails) if addr}
)
print(good_emails)