# Project: Churn Emails Inbox
Problem Statement: We have a text file (mbox-short.txt) that records mail activity from various individuals in an open-source project development team.
Load the dataset, then explore the content and headers of the emails in the dataset (mbox-short.txt).



In [1]:
# Step 1: Explore the content
# Define a function number_of_lines

def number_of_lines(file_path):
    """Return the total number of lines in the file at ``file_path``.

    The file is streamed line by line and counted on the fly, so memory use
    does not grow with the file size (``readlines()`` would first build a
    list holding every line just to measure its length).

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Number of lines in the file; 0 for an empty file. A final line
        without a trailing newline still counts as one line.

    Raises:
        OSError: If the file cannot be opened.
    """
    with open(file_path, 'r') as file:
        return sum(1 for _ in file)


In [2]:
# step 2: Write a function count_number_of_lines which returns the count
# of the number of lines starting with Subject: in the file mbox-short.txt

def count_number_of_lines(file_path):
    """Count the lines of the file that begin with ``Subject:``.

    The match is case-sensitive and anchored at the very first character of
    the line, so indented or lower-case variants are not counted.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        int: Number of lines starting with 'Subject:'.
    """
    with open(file_path, 'r') as handle:
        return sum(1 for row in handle if row.startswith('Subject:'))



In [3]:
# step 3: Define a function average_spam_confidence which calculates the average spam confidence and returns it
def average_spam_confidence(file_path):
    """Average the numeric values found on ``X-DSPAM-Confidence:`` lines.

    For every line starting with 'X-DSPAM-Confidence:' the text between the
    first and second colon (or the rest of the line when there is only one
    colon) is parsed as a float. Lines whose value is not a valid float are
    skipped and do not affect the average.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        The arithmetic mean of the parsed values, or 0 when no line yielded
        a usable value.
    """
    running_sum = 0
    samples = 0
    with open(file_path, 'r') as handle:
        for row in handle:
            if not row.startswith('X-DSPAM-Confidence:'):
                continue
            # Index 1 always exists here because the prefix itself ends in ':'.
            raw_value = row.split(':', 2)[1].strip()
            try:
                score = float(raw_value)
            except ValueError:
                # Header present but the value is not numeric: ignore it.
                continue
            # Accumulate one value at a time, in file order.
            running_sum += score
            samples += 1
    if samples > 0:
        return running_sum / samples
    return 0



In [4]:

# Step 4: Write a function find_email_sent_days which reads the file 'mbox-short.txt' and
# categorizes each mail message by the day of the week on which it was sent.

def find_email_sent_days(file_path):
    """Tally messages by the third whitespace-separated word of 'From ' lines.

    Only lines starting with 'From ' (with the trailing space) are examined.
    The third word of such a line is used as the day key; lines with fewer
    than three words are ignored.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        dict: Maps each day string to the number of 'From ' lines carrying
        it, in order of first appearance. Empty when nothing matched.
    """
    tally = {}
    with open(file_path, 'r') as handle:
        for row in handle:
            if not row.startswith('From '):
                continue
            fields = row.split()
            if len(fields) <= 2:
                continue
            weekday = fields[2]
            if weekday in tally:
                tally[weekday] += 1
            else:
                tally[weekday] = 1
    return tally



In [5]:

# step 5: Write a function count_message_from_email which reads the file mbox-short.txt.
# This function builds a histogram using a dictionary to count how many messages have come
# from each email address and returns the dictionary.

def count_message_from_email(file_path):
    """Build a histogram of the second word on every 'From ' line.

    Only lines starting with 'From ' (with the trailing space) are examined;
    the second whitespace-separated word is used as the sender key. Lines
    that contain nothing after 'From' are ignored.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        dict: Maps each sender string to the number of 'From ' lines carrying
        it, in order of first appearance. Empty when nothing matched.
    """
    histogram = {}
    with open(file_path, 'r') as handle:
        split_rows = (
            row.split() for row in handle if row.startswith('From ')
        )
        for fields in split_rows:
            if len(fields) < 2:
                continue
            sender = fields[1]
            try:
                histogram[sender] += 1
            except KeyError:
                # First time this sender is seen.
                histogram[sender] = 1
    return histogram



In [6]:
# step 6: Write a function count_message_from_domain which reads the file mbox-short.txt.
# This function builds a histogram using a dictionary to count how many messages have come from each domain
# (Instead of from email address), and returns the dictionary.

def count_message_from_domain(file_path):
    """Build a histogram of sender domains taken from 'From ' lines.

    Only lines starting with 'From ' (with the trailing space) are examined.
    The domain is whatever follows the last '@' in the second word of the
    line; when that word has no '@', the whole word is used as the key.

    Args:
        file_path: Path of the text file to scan.

    Returns:
        dict: Maps each domain string to the number of 'From ' lines carrying
        it, in order of first appearance. Empty when nothing matched.
    """
    histogram = {}
    with open(file_path, 'r') as handle:
        for row in handle:
            if not row.startswith('From '):
                continue
            fields = row.split()
            if len(fields) < 2:
                continue
            # Keep only the part after the last '@'.
            host = fields[1].rsplit('@', 1)[-1]
            histogram.setdefault(host, 0)
            histogram[host] += 1
    return histogram



In [7]:

# Sample usage: run every analysis on the same file and print one labelled
# result per line, in the order the functions are defined above.
file_path = 'mbox-short.txt'
reports = (
    ("Total lines:", number_of_lines),
    ("Subject lines count:", count_number_of_lines),
    ("Average spam confidence:", average_spam_confidence),
    ("Emails sent per day:", find_email_sent_days),
    ("Messages from each email:", count_message_from_email),
    ("Messages from each domain:", count_message_from_domain),
)
for label, report in reports:
    print(label, report(file_path))


Total lines: 1910
Subject lines count: 27
Average spam confidence: 0.7507185185185187
Emails sent per day: {'Sat': 1, 'Fri': 20, 'Thu': 6}
Messages from each email: {'stephen.marquard@uct.ac.za': 2, 'louis@media.berkeley.edu': 3, 'zqian@umich.edu': 4, 'rjlowe@iupui.edu': 2, 'cwen@iupui.edu': 5, 'gsilver@umich.edu': 3, 'wagnermr@iupui.edu': 1, 'antranig@caret.cam.ac.uk': 1, 'gopal.ramasammycook@gmail.com': 1, 'david.horwitz@uct.ac.za': 4, 'ray@media.berkeley.edu': 1}
Messages from each domain: {'uct.ac.za': 6, 'media.berkeley.edu': 4, 'umich.edu': 7, 'iupui.edu': 8, 'caret.cam.ac.uk': 1, 'gmail.com': 1}
