In [9]:

def clean_telegram_data(input_file_path, output_file_path) -> None:
    """Clean a scraped Telegram CSV export and write the result to a new CSV.

    Steps, in order:
      1. Keep only the required columns.
      2. Drop exact duplicate rows (the count is printed).
      3. Drop rows with any missing value.
      4. Title-case the 'Channel Title' column.
      5. Parse 'Date' as timezone-aware UTC timestamps; rows whose date
         cannot be parsed are dropped (the count is printed).
      6. Drop rows dated in the future (the count is printed).

    Args:
        input_file_path: Path of the CSV file to read.
        output_file_path: Path the cleaned CSV is written to (no index column).

    Raises:
        KeyError: If any required column is missing from the input file.
    """
    required_columns = ["Channel Title", "Channel Username", "Message", "Date", "Media Path"]

    raw = pd.read_csv(input_file_path)

    # Work on an explicit copy so later column assignments never target a
    # view of the frame returned by read_csv.
    df = raw[required_columns].copy()

    # Removing duplicates
    rows_before = len(df)
    df = df.drop_duplicates()
    print(f"Removed {rows_before - len(df)} duplicate rows.")

    # Handling missing values: any row with a missing field is discarded.
    # The trailing copy() keeps the assignments below warning-free on pandas
    # versions without copy-on-write.
    df = df.dropna().copy()

    # Standardizing formats
    df["Channel Title"] = df["Channel Title"].str.title()

    # utc=True localizes naive timestamps to UTC and converts aware ones, so
    # the column is always comparable with the tz-aware "now" below. Without
    # it, naive input makes the comparison raise TypeError.
    df["Date"] = pd.to_datetime(df["Date"], errors="coerce", utc=True)

    # errors="coerce" turns unparseable dates into NaT; report them instead of
    # letting the range filter below discard them silently.
    unparseable = int(df["Date"].isna().sum())
    if unparseable:
        print(f"Dropped {unparseable} rows with unparseable dates.")

    now_utc = pd.Timestamp.now(tz="UTC")

    # Data validation: check for future dates
    future_count = int((df["Date"] > now_utc).sum())
    if future_count:
        print(f"Found {future_count} future dates.")

    # NaT compares False here, so this removes both future and invalid dates.
    df = df[df["Date"] <= now_utc]

    # Save the cleaned DataFrame to a new CSV file
    df.to_csv(output_file_path, index=False)
    print(f"Cleaned data saved to: {output_file_path}")

# Example usage: clean the merged export, then reload the written file as a
# quick sanity check of what actually landed on disk.
input_file = '../Data/merged_telegram_data.csv'
output_file = '../Data/validated_telegram_data.csv'
clean_telegram_data(input_file_path=input_file, output_file_path=output_file)

# Read the cleaned CSV back and show its first few rows under a heading.
cleaned_df = pd.read_csv(output_file)
print("Cleaned Data Preview:", cleaned_df.head(), sep="\n")

Removed 3123 duplicate rows.
Cleaned data saved to: ../Data/validated_telegram_data.csv
Cleaned Data Preview:
            Channel Title Channel Username  \
0  Health Hub Et: ስለ ጤና 🩺         @HHETatI   
1  Health Hub Et: ስለ ጤና 🩺         @HHETatI   
2  Health Hub Et: ስለ ጤና 🩺         @HHETatI   
3  Health Hub Et: ስለ ጤና 🩺         @HHETatI   
4  Health Hub Et: ስለ ጤና 🩺         @HHETatI   

                                             Message  \
0  # የስኳር በሽታ እና የአይን ጤና: ማወቅ ያለብዎት\n\nየስኳር በሽታ በ...   
1                                                ...   
2  From Hero to Burnout: The Reality for Ethiopia...   
3  የትርፍ አንጀት ብግነት: ማወቅ ያለብዎት\n\nትርፍ አንጀት (Appendi...   
4  # Ethiopia's Healthcare: Progress vs. Reality?...   

                        Date                Media Path  
0  2024-10-06 10:24:28+00:00  photos\@HHETatI_2034.jpg  
1  2024-10-06 10:10:50+00:00  photos\@HHETatI_2032.jpg  
2  2024-09-30 12:18:06+00:00  photos\@HHETatI_2027.jpg  
3  2024-09-28 13:29:51+00:00  photos\@HHETatI_20