In [9]:
import os
import re
import pandas as pd

# Folder path where the CSV files are stored
data_folder = '../tele_data'

# Load all CSV files from the folder
all_files = [os.path.join(data_folder, f) for f in os.listdir(data_folder) if f.endswith('.csv')]

# Combine all CSV files into one DataFrame
df_list = [pd.read_csv(file) for file in all_files]
combined_df = pd.concat(df_list, ignore_index=True)

# Display first few rows of the combined data
print(combined_df.head())


  Channel Title Channel Username  ID  \
0        CheMed       @CheMed123  97   
1        CheMed       @CheMed123  96   
2        CheMed       @CheMed123  95   
3        CheMed       @CheMed123  94   
4        CheMed       @CheMed123  93   

                                             Message  \
0  ⚠️Notice!\nDear esteemed customers,\nDue to fo...   
1  Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...   
2  አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...   
3  Che-Med Trivia #3\n\nምግብና መጠጦች አንዳንድ መድሃኒቶች በደ...   
4  Che-Med Trivia #2\n\nእንደ Ciprofloxacin, Doxycy...   

                        Date                Media Path  
0  2023-02-10 12:23:06+00:00  photos/@CheMed123_97.jpg  
1  2023-02-02 08:58:52+00:00  photos/@CheMed123_96.jpg  
2  2023-02-01 08:59:37+00:00  photos/@CheMed123_95.jpg  
3  2023-01-31 09:19:53+00:00  photos/@CheMed123_94.jpg  
4  2023-01-30 09:45:25+00:00  photos/@CheMed123_93.jpg  


In [4]:
# Remove duplicate records
cleaned_df = combined_df.drop_duplicates()

# Check for duplicate rows
print("Number of duplicates removed:", len(combined_df) - len(cleaned_df))

Number of duplicates removed: 104


In [6]:
# Check for missing values
print("Missing values per column:\n", cleaned_df.isnull().sum())

# Handle missing values (e.g., forward fill, backward fill, or filling with a default value)
cleaned_df.fillna(method='ffill', inplace=True)

Missing values per column:
 Channel Title        0
Channel Username     0
ID                   0
Message             71
Date                 0
Media Path          15
dtype: int64


  cleaned_df.fillna(method='ffill', inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned_df.fillna(method='ffill', inplace=True)


In [7]:
message_df=cleaned_df['Message']
message_df

0      ⚠️Notice!\nDear esteemed customers,\nDue to fo...
1      Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...
2      አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...
3      Che-Med Trivia #3\n\nምግብና መጠጦች አንዳንድ መድሃኒቶች በደ...
4      Che-Med Trivia #2\n\nእንደ Ciprofloxacin, Doxycy...
                             ...                        
350    በኢትዮጵያ እስካሁን ድረስ በዝንጀሮ ፈንጣጣ (MPox) በሽታ የተያዘ ሰው...
351    በኢትዮጵያ እስካሁን ድረስ በዝንጀሮ ፈንጣጣ (MPox) በሽታ የተያዘ ሰው...
352    በኢትዮጵያ እስካሁን ድረስ በዝንጀሮ ፈንጣጣ (MPox) በሽታ የተያዘ ሰው...
353    በኢትዮጵያ እስካሁን ድረስ በዝንጀሮ ፈንጣጣ (MPox) በሽታ የተያዘ ሰው...
354    በኢትዮጵያ እስካሁን ድረስ በዝንጀሮ ፈንጣጣ (MPox) በሽታ የተያዘ ሰው...
Name: Message, Length: 251, dtype: object

In [10]:
def remove_emojis(text):
    emoji_pattern = re.compile(
        "[" 
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F700-\U0001F77F"  # alchemical symbols
        "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        "\U0001FA00-\U0001FA6F"  # Chess Symbols
        "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        "\U00002702-\U000027B0"  # Dingbats
        "\U000024C2-\U0001F251" 
        "]+", 
        flags=re.UNICODE
    )
    return emoji_pattern.sub(r'', text)

# Apply the function to the 'Message' column
cleaned_df['Message'] = cleaned_df['Message'].apply(remove_emojis)

# Display the updated DataFrame
print(cleaned_df.head())

  Channel Title Channel Username  ID  \
0        CheMed       @CheMed123  97   
1        CheMed       @CheMed123  96   
2        CheMed       @CheMed123  95   
3        CheMed       @CheMed123  94   
4        CheMed       @CheMed123  93   

                                             Message  \
0  Notice!\nDear esteemed customers,\nDue to four...   
1  Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...   
2  አዚትሮማይሲን በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ ሲሆን...   
3  Che-Med Trivia #3\n\nምግብና መጠጦች አንዳንድ መድሃኒቶች በደ...   
4  Che-Med Trivia #2\n\nእንደ Ciprofloxacin, Doxycy...   

                        Date                Media Path  
0  2023-02-10 12:23:06+00:00  photos/@CheMed123_97.jpg  
1  2023-02-02 08:58:52+00:00  photos/@CheMed123_96.jpg  
2  2023-02-01 08:59:37+00:00  photos/@CheMed123_95.jpg  
3  2023-01-31 09:19:53+00:00  photos/@CheMed123_94.jpg  
4  2023-01-30 09:45:25+00:00  photos/@CheMed123_93.jpg  


In [11]:
# Standardize date format
cleaned_df['Date'] = pd.to_datetime(cleaned_df['Date'], errors='coerce')

# Convert all messages to lowercase (if needed)
cleaned_df['Message'] = cleaned_df['Message'].str.lower()

# Validate data (example: ensuring valid date ranges or specific field content)
valid_data = cleaned_df[cleaned_df['Date'] >= '2020-01-01']# Save cleaned data to a new CSV file
cleaned_df.to_csv('cleaned_tele_data.csv', index=False)