In [1]:
import sys
import csv
import pandas as pd

# Function to check if a sender has more than one '@' in the email
def has_multiple_at(sender):
    """Report whether the address part of ``sender`` holds more than one '@'.

    When ``sender`` contains both '<' and '>', only the text after the last
    '<' is inspected (with trailing/leading '>' and spaces stripped);
    otherwise the whole stripped string is inspected.

    Args:
        sender: Raw sender value, normally a string.

    Returns:
        True if the inspected text contains two or more '@' characters,
        False otherwise. Any value that cannot be processed (for example a
        non-string such as None or NaN) also yields False.
    """
    try:
        if '<' in sender and '>' in sender:
            # Keep what follows the last '<', then trim bracket/space padding.
            address = sender.split('<')[-1].strip('> ').strip()
        else:
            address = sender.strip()
        at_signs = address.count('@')
    except Exception:
        # Unprocessable input is treated as "not improper".
        return False
    return at_signs > 1

# Raise the csv module's per-field size limit as high as this platform
# accepts. The call raises OverflowError when the requested value is too
# large, so start from sys.maxsize and shrink tenfold until it succeeds.
max_int = sys.maxsize
while True:
    try:
        csv.field_size_limit(max_int)
        break
    except OverflowError:
        # Floor division keeps the value an exact int; int(max_int / 10)
        # would round-trip through a float and lose precision at this size.
        max_int //= 10

# Input CSV files, resolved relative to the current working directory.
file_names = ['CEAS-08.csv', 'TREC-05.csv', 'TREC-06.csv', 'TREC-07.csv']

# Load every file that can be read. A failed read is reported and the file is
# skipped (best effort), so the combined frame may cover fewer sources.
dataframes = []
for file_name in file_names:
    try:
        dataframes.append(pd.read_csv(file_name, engine='python'))
    except Exception as e:
        print(f"Error reading {file_name}: {e}")

# pd.concat on an empty list fails with an opaque "No objects to concatenate";
# raise the same exception type with a message that names the real problem.
if not dataframes:
    raise ValueError(f"None of the input files could be read: {file_names}")

combined_df = pd.concat(dataframes, ignore_index=True)
print(f"Raw Dataset Shape: {combined_df.shape}")
print(f"Raw Dataset Null Counts:\n{combined_df.isnull().sum()}")

# Keep only rows labelled 0 or 1, then drop rows missing any critical value.
# Non-inplace calls return new frames, so nothing below mutates a slice of
# the raw frame.
critical_columns = ['sender', 'receiver', 'date', 'subject', 'body', 'label']
combined_df = combined_df[combined_df['label'].isin([0, 1])]
combined_df = combined_df.dropna(subset=critical_columns)

# Parse 'date' into UTC datetimes. Values that do not match this exact format
# become NaT (errors='coerce') and those rows are dropped. assign() builds a
# new frame, which avoids SettingWithCopyWarning on the filtered data.
combined_df = combined_df.assign(
    date=pd.to_datetime(
        combined_df['date'],
        format='%a, %d %b %Y %H:%M:%S %z',
        errors='coerce',
        utc=True,
    )
).dropna(subset=['date'])

# Flag senders with multiple '@' once and reuse the mask for both the report
# and the filter. astype(bool) keeps the mask boolean even when no rows remain.
multiple_at_mask = combined_df['sender'].apply(has_multiple_at).astype(bool)
improper_senders = combined_df[multiple_at_mask]
print(f"\nSenders with more than one '@': {improper_senders['sender'].unique()}")
print(f"Count of improper sender rows: {len(improper_senders)}")

# Remove the flagged rows and renumber the index from zero.
combined_df = combined_df[~multiple_at_mask].reset_index(drop=True)

# Display cleaned DataFrame info. info() prints its report itself and returns
# None, so it must not be wrapped in print().
print(f"\nCleaned Dataset Shape: {combined_df.shape}")
combined_df.info()

# Save the cleaned data to a CSV file
output_file = 'combined_cleaned_data_updated.csv'
combined_df.to_csv(output_file, index=False)
print(f"Combined and cleaned data saved as '{output_file}'.")


Raw Dataset Shape: (165358, 7)
Raw Dataset Null Counts:
sender       284
receiver    3542
date        2312
subject     2750
body         619
label        772
urls         772
dtype: int64

Senders with more than one '@': ['Daily Top 10 <"eojamli_1977@89webdesign.co.uk>"@teff16.cs.uwaterloo.ca>'
 'lejmi <"lejmi-vernaht@stfrancis.k12.mn.us>"@teff16.cs.uwaterloo.ca>'
 'Hyatt <"torres-vatermo@stfrancis.k12.mn.us>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"ote-ableips@3rx.b.tep1.com>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"enhahtu@cornwallacf.co.uk>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"urendes@evolusoft.com>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"wolfter_1968@banklife.ru>"@teff16.cs.uwaterloo.ca>'
 '"clima VI <jxczn-yg@deis.unibo.it>"@cs.orst.edu'
 'Daily Top 10 <"srabzna1980@3gforfree.com>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"hcnuteg1999@posteasy.com>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"pfelter1960@acnj.org>"@teff16.cs.uwaterloo.ca>'
 'Daily Top 10 <"Marcell-russt

In [2]:
# Show the (rows, columns) shape of combined_df as left by the previous cell.
combined_df.shape

(158375, 7)