In [1]:
import pandas as pd
import re


In [2]:
# Step 1: Read the CSV file, trying UTF-8 first and Latin-1 second.
# 'utf-8-sig' also consumes a leading byte-order mark when the file has one,
# so the marker does not end up inside the first column name.
try:
    df = pd.read_csv('emails_db.csv', encoding='utf-8-sig')
except UnicodeDecodeError:
    # The file is not valid UTF-8: retry with the 'latin1' encoding
    df = pd.read_csv('emails_db.csv', encoding='latin1')


In [3]:
# Normalise the header: trim leading/trailing whitespace from every column name
df.columns = [column_name.strip() for column_name in df.columns]


In [5]:
# Fail fast when the expected column is missing. Merely printing a message lets
# the following cells run anyway and die on a much less helpful KeyError.
if 'email' not in df.columns:
    raise ValueError(
        "The 'email' column was not found in the CSV file. "
        f"Available columns are: {df.columns.tolist()}"
    )

In [6]:
    # Trim surrounding whitespace from every e-mail address
    df = df.assign(email=df['email'].str.strip())


In [7]:
    # Keep only the rows that carry an address: neither missing nor ''
    df = df[df['email'].notnull() & (df['email'] != '')]


In [8]:
    # Keep the first occurrence of each address and discard later repeats
    df = df[~df.duplicated(subset=['email'])]


In [9]:
    # Optionally, validate email addresses using a regex:
    # local part, '@', domain, a dot and a final suffix.
    email_regex = r'^[\w\.-]+@[\w\.-]+\.\w+$'
    # Series.str.match yields a proper boolean mask even when the frame is empty
    # (apply() on an empty column does not, and the result would then be read as
    # a list of column labels). na=False drops entries that are not strings
    # instead of raising on them.
    df = df[df['email'].str.match(email_regex, na=False)]


In [10]:
    # Renumber the rows 0..n-1; the old index is discarded, not kept as a column
    df = df.reset_index(drop=True)


In [13]:
    # Save the cleaned data to a new CSV file.
    # index=False keeps the row index out of the file, so only the data columns
    # are written.
    df.to_csv('emails_db_cleaned.csv', index=False)
    print("Data cleaned successfully. Cleaned data saved to 'emails_db_cleaned.csv'.")


Data cleaned successfully. Cleaned data saved to 'emails_db_cleaned.csv'.


In [14]:
# Re-read the cleaned file from disk and preview its first rows
df_cleaned = pd.read_csv('emails_db_cleaned.csv')
print("First few rows of the cleaned data:", df_cleaned.head(), sep="\n")


First few rows of the cleaned data:
                         email
0       1980bhagaban@gmail.com
1        tapacharjee@gmail.com
2  sunitasarkar@rediffmail.com
3        dharsourish@gmail.com
4        abhi.021983@gmail.com


In [15]:
import pandas as pd

# Load the cleaned CSV file
df = pd.read_csv('emails_db_cleaned.csv')

# Select the rows whose 'email' is NaN or holds nothing but whitespace
missing_emails = df.loc[df['email'].isna() | (df['email'].str.strip() == '')]

# Report the selected rows (an empty frame means there are none)
print("Rows with missing emails:", missing_emails, sep="\n")


Rows with missing emails:
Empty DataFrame
Columns: [email]
Index: []


In [18]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('emails_db.csv')

# Check for duplicate rows (keep=False flags every member of a duplicate group)
duplicate_rows = df[df.duplicated(keep=False)]
if not duplicate_rows.empty:
    print("Duplicate Rows Found:")
    # Expose the original row numbers as an 'index' column -- unless the file
    # already has its own 'index' column, in which case reset_index() would
    # fail because that name is taken.
    if 'index' not in duplicate_rows.columns:
        duplicate_rows = duplicate_rows.reset_index()
    print(duplicate_rows[['index'] + [col for col in df.columns if col != 'index']])
else:
    print("No duplicate rows found.")

# Check for duplicate email IDs.
# duplicated() treats missing values as equal to each other, so rows without any
# address are excluded here; otherwise they would be reported as duplicates.
duplicate_emails = df[df['email'].notna() & df['email'].duplicated(keep=False)]
if not duplicate_emails.empty:
    print("\nDuplicate Email IDs Found:")
    # Same row-number handling as for the duplicate rows above
    if 'index' not in duplicate_emails.columns:
        duplicate_emails = duplicate_emails.reset_index()
    print(duplicate_emails[['index', 'email']])
else:
    print("\nNo duplicate email IDs found.")


No duplicate rows found.

No duplicate email IDs found.


In [19]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('emails_db.csv')

# Ensure that 'email' column exists
if 'email' not in df.columns:
    raise ValueError("The 'email' column is not present in the CSV file.")

# Identify duplicate email IDs.
# 'keep=False' marks all members of a duplicate group as True. Missing values
# compare equal to each other in duplicated(), so rows without an address are
# masked out to avoid reporting them as duplicate IDs.
duplicate_emails = df[df['email'].notna() & df['email'].duplicated(keep=False)].copy()

if not duplicate_emails.empty:
    # Expose the original row numbers as an 'index' column -- unless the file
    # already has its own 'index' column, in which case reset_index() would
    # fail because that name is taken.
    if 'index' not in duplicate_emails.columns:
        duplicate_emails.reset_index(inplace=True)
    print("Duplicate Email IDs Found:")
    print(duplicate_emails[['index', 'email']])
else:
    print("No duplicate email IDs found.")


No duplicate email IDs found.


In [22]:
import pandas as pd

def find_empty_and_duplicate_emails(file_path):
    """Report blank e-mail cells, duplicate rows and duplicate addresses in a CSV.

    The file must provide an 'index' column (used to identify rows in the
    printed report) and an 'email' column. All findings are printed.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        None. Read errors and missing columns are reported with a printed
        message instead of being raised.
    """
    # Step 1: Load the CSV file
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File '{file_path}' not found. Please check the file path.")
        return
    except pd.errors.EmptyDataError:
        print(f"File '{file_path}' is empty.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    # Step 2: Ensure the necessary columns 'index' and 'email' exist
    if 'index' not in df.columns or 'email' not in df.columns:
        print("The CSV file must contain both 'index' and 'email' columns.")
        return

    # Step 3: Check for empty rows in the 'email' column
    is_missing = df['email'].isnull()
    # Cast to str before using .str: a column that is entirely empty is read
    # with a numeric dtype, which the .str accessor rejects. Missing cells are
    # restored to NaN afterwards so they are not turned into the text 'nan'.
    stripped = df['email'].astype(str).str.strip().where(~is_missing)
    is_blank = is_missing | stripped.eq('')
    empty_email_rows = df[is_blank].copy()

    if not empty_email_rows.empty:
        print("\nEmpty Rows in 'email' Column Found:")
        print(empty_email_rows[['index', 'email']])
    else:
        print("\nNo empty rows in the 'email' column found.")

    # Step 4: Remove leading/trailing spaces from 'email' column
    df['email'] = stripped

    # Step 5: Check for duplicate rows
    duplicate_rows = df[df.duplicated(keep=False)].copy()

    if not duplicate_rows.empty:
        print("\nDuplicate Rows Found:")
        print(duplicate_rows[['index', 'email']])
    else:
        print("\nNo duplicate rows found.")

    # Step 6: Check for duplicate email addresses (case-insensitive).
    # Blank cells were already reported in step 3; they are masked out here
    # because duplicated() treats missing values as equal to each other and
    # would otherwise list them as duplicate addresses.
    lowered = stripped.str.lower()
    duplicate_emails = df[~is_blank & lowered.duplicated(keep=False)].copy()

    if not duplicate_emails.empty:
        print("\nDuplicate Email IDs Found (Case-Insensitive):")
        print(duplicate_emails[['index', 'email']])
    else:
        print("\nNo duplicate email IDs found.")

# Run the report against the e-mail database.
# Replace 'emails_db.csv' with the actual file path
file_path = 'emails_db.csv'
find_empty_and_duplicate_emails(file_path)



No empty rows in the 'email' column found.

No duplicate rows found.

No duplicate email IDs found.


In [23]:
import pandas as pd

def find_empty_email_rows(file_path):
    """Print the rows of a CSV file whose 'email' cell is missing or blank.

    Rows are identified by the file's own 'index' column when it has one,
    otherwise by their row number in the loaded frame.

    Args:
        file_path: Path of the CSV file to inspect.

    Returns:
        None. Read errors and a missing 'email' column are reported with a
        printed message instead of being raised.
    """
    # Step 1: Load the CSV file
    try:
        df = pd.read_csv(file_path)
    except FileNotFoundError:
        print(f"File '{file_path}' not found. Please check the file path.")
        return
    except pd.errors.EmptyDataError:
        print(f"File '{file_path}' is empty.")
        return
    except Exception as e:
        print(f"An error occurred while reading the file: {e}")
        return

    # Step 2: Ensure the 'email' column exists
    if 'email' not in df.columns:
        print("The 'email' column is not present in the CSV file.")
        return

    # Step 3: Check for empty rows in the 'email' column
    is_missing = df['email'].isnull()
    # Cast to str before using .str: a column that is entirely empty is read
    # with a numeric dtype, which the .str accessor rejects.
    is_blank = is_missing | df['email'].astype(str).str.strip().eq('')
    empty_email_rows = df[is_blank].copy()

    if not empty_email_rows.empty:
        print("\nEmpty Rows in 'email' Column Found:")
        # Only 'email' is guaranteed to exist; when the file has no 'index'
        # column, expose the frame's row numbers under that name instead.
        if 'index' not in empty_email_rows.columns:
            empty_email_rows = empty_email_rows.reset_index()
        print(empty_email_rows[['index', 'email']])
    else:
        print("\nNo empty rows in the 'email' column found.")

# Run the blank-address check against the e-mail database.
# Replace 'emails_db.csv' with the actual file path
file_path = 'emails_db.csv'
find_empty_email_rows(file_path)



No empty rows in the 'email' column found.


In [24]:
pip install pandas streamlit streamlit-option-menu






[notice] A new release of pip is available: 24.1.1 -> 24.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [29]:


import streamlit as st
import pandas as pd
import os
from streamlit_option_menu import option_menu
# Location of the main e-mail database (CSV file)
database_path = 'emails_db.csv'

# Load the main database, or start from an empty one when the file is absent
if not os.path.exists(database_path):
    st.warning("Main database not found. A new one will be created upon updating.")
    df_database = pd.DataFrame(columns=['email'])
else:
    df_database = pd.read_csv(database_path)
    if 'email' in df_database.columns:
        # Store addresses trimmed and lower-cased, keeping the first of each
        df_database['email'] = df_database['email'].str.strip().str.lower()
        df_database = df_database.drop_duplicates(subset='email')
    else:
        # The frame is kept as loaded; only the problem is surfaced in the UI
        st.error("The 'email' column is not present in the main database.")

def clean_email_data(df, email_column='email'):
    """Return a cleaned copy of ``df`` with normalised, unique e-mail addresses.

    Addresses are stripped of surrounding whitespace and lower-cased; rows whose
    address is missing or empty are dropped, and only the first row of each
    address is kept. No syntax validation of the addresses is performed.

    Args:
        df: DataFrame holding the uploaded data. It is not modified.
        email_column: Name of the column containing the addresses.

    Returns:
        A new DataFrame with the surviving rows; their original index labels
        are preserved.
    """
    # Work on a copy so the caller's frame is not silently rewritten
    cleaned = df.copy()

    # Convert to lowercase and strip whitespace
    normalized = cleaned[email_column].str.strip().str.lower()
    cleaned[email_column] = normalized

    # Remove rows with missing or empty emails
    has_address = normalized.notna() & normalized.ne('')

    # Remove duplicates within the uploaded file (first occurrence wins)
    return cleaned[has_address].drop_duplicates(subset=email_column)

def find_duplicates(uploaded_emails, database_emails):
    """Split uploaded addresses by whether the database already contains them.

    Args:
        uploaded_emails: Series of addresses from the uploaded file.
        database_emails: Collection of addresses already in the database.

    Returns:
        A ``(duplicates, unique_emails)`` tuple of Series: the uploaded
        addresses found in ``database_emails`` and those that are not.
    """
    already_known = uploaded_emails.isin(database_emails)
    return uploaded_emails[already_known], uploaded_emails[~already_known]

def update_main_database(unique_emails_df, database_path, database_df=None):
    """Merge new addresses into the main database and write it to disk.

    Args:
        unique_emails_df: DataFrame with an 'email' column holding the
            addresses to add.
        database_path: Path of the CSV file the merged database is written to.
        database_df: Current database contents. When omitted, the module-level
            ``df_database`` frame is used, as before.

    Returns:
        The merged DataFrame: database rows first, then the new ones, with
        repeated addresses removed and a fresh 0..n-1 index.
    """
    if database_df is None:
        # Backward-compatible default: the frame loaded when the script started
        database_df = df_database

    # Append unique emails to the main database
    updated_db = (
        pd.concat([database_df, unique_emails_df])
        .drop_duplicates(subset='email')
        .reset_index(drop=True)
    )

    # Save the updated database
    updated_db.to_csv(database_path, index=False)
    return updated_db
