In [1]:
import logging
import os
import sys

import pandas as pd
from dotenv import load_dotenv

# Make the parent directory importable so the `scripts` package used in later
# cells resolves. Guarded so that re-running this cell does not append the same
# entry to sys.path again.
if '../' not in sys.path:
    sys.path.append('../')

# Load variables from a .env file into the process environment (python-dotenv).
load_dotenv()

> # Creating a data warehouse for Ethiopian medical business data scraped from Telegram channels

> # Telegram Access keys

In [2]:
# Telegram credentials are read from the environment; each value is None when
# the corresponding variable is not set.
api_id, api_hash, phone = (
    os.getenv(env_key)
    for env_key in ('TELEGRAM_API_ID', 'TELEGRAM_API_HASH', 'TELEGRAM_PHONE')
)

In [3]:

# Output locations, relative to the notebook's working directory.
LOG_FILE_PATH = '../logs/scraping.log'
RAW_DATA_PATH = '../docs/raw/'
IMAGE_DATA_PATH = '../docs/images/'

# logging.basicConfig opens the log file right away and fails with
# FileNotFoundError when the parent directory is missing (e.g. a fresh
# checkout), so make sure the directory exists first.
os.makedirs(os.path.dirname(LOG_FILE_PATH), exist_ok=True)

logging.basicConfig(filename=LOG_FILE_PATH, level=logging.INFO,
                    format='%(asctime)s:%(levelname)s:%(message)s')

> # Scraping

In [4]:
import asyncio
from scripts.scrapping import scrapping
async def main():
    await scrapping(logging, api_id, api_hash, RAW_DATA_PATH, IMAGE_DATA_PATH)

await main()


>> ## Scraped Data

In [5]:
# Read the combined scrape output, then preview the first rows that belong to
# a single channel.
data = pd.read_csv('../docs/raw/all_scraped_messages.csv')
data.loc[data['channel_name'].eq('CheMed123')].head()

Unnamed: 0,channel_name,message_id,date,text,image_path
10,CheMed123,97,2023-02-10 12:23:06,"⚠️**Notice!\r\n**Dear esteemed customers,\r\nD...",../docs/images\photo_2025-01-31_09-39-51.jpg
11,CheMed123,96,2023-02-02 08:58:52,Mela-One በውስጡ ሆርሞን ያለው ድንገተኛ ወሊድ መቆጣጠርያ ሲሆን ያለ...,../docs/images\photo_2025-01-31_09-39-52.jpg
12,CheMed123,95,2023-02-01 08:59:37,**አዚትሮማይሲን** በሃኪም መድሃኒት ማዘዣ ከሚታዘዙ አንቲባዮቲኮች አንዱ...,../docs/images\photo_2025-01-31_09-39-52 (1).jpg
13,CheMed123,94,2023-01-31 09:19:53,**Che-Med Trivia #3\r\n\r\n**ምግብና መጠጦች አንዳንድ መ...,../docs/images\photo_2025-01-31_09-39-53.jpg
14,CheMed123,93,2023-01-30 09:45:25,**Che-Med Trivia #2\r\n\r\n**እንደ Ciprofloxacin...,../docs/images\photo_2025-01-31_09-39-53 (1).jpg


> # Clean and Standardize Data

In [6]:
# Now import the modules
from scripts.data_cleaning import load_csv,clean_text,clean_dataframe,save_cleaned_data



In [7]:
# Load the raw scraped messages with the project's load_csv helper
# (imported from scripts.data_cleaning).
# NOTE(review): presumably returns a pandas DataFrame, since .head() is called
# on the result below — confirm against the helper.
df = load_csv("../docs/raw/all_scraped_messages.csv")

# Preview the first ten rows as this cell's displayed output.
df.head(10)


Unnamed: 0,channel_name,message_id,date,text,image_path
0,DoctorsET,864,2023-12-18 17:04:02,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...,
1,DoctorsET,863,2023-11-03 16:14:39,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,
2,DoctorsET,862,2023-10-02 16:37:39,ሞት በስኳር \r\n\r\nለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን...,
3,DoctorsET,861,2023-09-16 07:54:32,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\r\n\...,
4,DoctorsET,860,2023-09-01 16:16:15,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,
5,DoctorsET,859,2023-08-29 17:20:05,👇👇👇👇👇👇 https://youtu.be/-AR1KO2DbFw?si=47cXLZt...,
6,DoctorsET,848,2022-08-02 17:42:08,ክረምቱን ስፖርት መስራት አስበው ጂም ለመግባት ካልቻሉ ባሉበት ቦታ ሆነው...,
7,DoctorsET,847,2022-06-12 17:15:47,ስፖርት የመስራት ሱስ ይኖር ይሆን?\r\n\r\nበአሁኑ ወቅት ብዙ የስፖር...,
8,DoctorsET,846,2022-05-31 17:51:13,ድንገተኛ አደጋ / የአጥንት ስብራት\r\n\r\nአያርገውና ድንገተኛ የሆነ...,
9,DoctorsET,845,2022-05-20 18:04:53,ከትንሽ ግዚያት በፊት ስፖርት መስራት እንደ ቅንጦት ይታይ ነበር አሁን ላ...,


In [8]:
# Run the project's clean_dataframe helper (imported from scripts.data_cleaning)
# on the raw frame and keep the result under a separate name.
# NOTE(review): the displayed result suggests the helper renames columns and
# adds emoji / YouTube-link columns — confirm against its implementation.
df_cleaned = clean_dataframe(df)

# Preview the first ten rows of the cleaned data as this cell's output.
df_cleaned.head(10)

Unnamed: 0,channel_title,message_id,message_date,message,media_path,emoji_used,youtube_links
0,DoctorsET,864,2023-12-18 17:04:02,"በቀን አንዴ ብቻ የሚባለው የቢዝነስ አማካሪ \r \r በ 10,000 ብር ...",No image,👈👈👇👇,https://youtu.be/5DBoEm-8kmA?si=LDLuEecNfULJVD...
1,DoctorsET,863,2023-11-03 16:14:39,ዶክተርስ ኢትዮጵያ በ አዲስ አቀራረብ በ ቴሌቪዥን ፕሮግራሙን ለመጀመር ከ...,No image,👇,https://youtu.be/gwVN5eJQpko?si=xARsSxIEdZtE91GY
2,DoctorsET,862,2023-10-02 16:37:39,ሞት በስኳር \r \r ለልጆቻችን የምናሲዘው ምሳቃ ሳናቀው እድሚያቸውን ይ...,No image,No emoji,https://youtu.be/oHiSRrNF7I0?si=Absgm414YSt_kjNq
3,DoctorsET,861,2023-09-16 07:54:32,ከ HIV የተፈወሰ ሰው አጋጥሟችሁ ያቃል ? ፈውስ እና ህክምና ?\r \r...,No image,👇👇👇👇,https://youtu.be/tTeErZxIh_Q?si=jKHyfWcC3sfXbC8L
4,DoctorsET,860,2023-09-01 16:16:15,በቅርብ ጊዜ በሃገራችን ላይ እየተስተዋለ ያለ የተመሳሳይ ፆታ ( Homos...,No image,No emoji,https://youtu.be/0k65P5ouw7s?si=qaUgo75bUa3AMQxD
5,DoctorsET,859,2023-08-29 17:20:05,**\r ዶክተርስ ኢትዮጽያ በአዲስ ፕሮገራም ጀመረ**\r \r ማረጥ (*...,No image,👇👇👇👇👇👇,https://youtu.be/-AR1KO2DbFw?si=47cXLZtlmhx1Nl...
6,DoctorsET,848,2022-08-02 17:42:08,ክረምቱን ስፖርት መስራት አስበው ጂም ለመግባት ካልቻሉ ባሉበት ቦታ ሆነው...,No image,👇👇👇👇👇,https://youtu.be/0uiTzjEbh90
7,DoctorsET,847,2022-06-12 17:15:47,ስፖርት የመስራት ሱስ ይኖር ይሆን?\r \r በአሁኑ ወቅት ብዙ የስፖርት ...,No image,👇👇👇👇👇👇,https://youtu.be/WPlRuRtQXN8
8,DoctorsET,846,2022-05-31 17:51:13,ድንገተኛ አደጋ / የአጥንት ስብራት\r \r አያርገውና ድንገተኛ የሆነ አ...,No image,👇👇👇👇👇👇👇,https://youtu.be/QI-8oqW80uI
9,DoctorsET,845,2022-05-20 18:04:53,ከትንሽ ግዚያት በፊት ስፖርት መስራት እንደ ቅንጦት ይታይ ነበር አሁን ላ...,No image,👇👇👇👇👇👇,https://youtu.be/_IEWt07bECg


> # Check for missing values in the cleaned DataFrame

In [9]:

# Per-column count of null cells in the cleaned frame.
missing_values = df_cleaned.isna().sum()

# Show only the columns that contain at least one null; an empty Series means
# no column does.
missing_values.loc[missing_values.gt(0)]


Series([], dtype: int64)

> ### The cleaned dataset has no missing values (absent images and emojis appear as the placeholders "No image" / "No emoji")

> # Save cleaned data to CSV


In [10]:
# Single source of truth for the output location, so the confirmation message
# always names the file that was actually written.
# NOTE(review): the written path is kept as-is; if downstream steps expect
# '../docs/cleaned_data.csv' instead, change this constant.
CLEANED_CSV_PATH = "../docs/cleaned.csv"

# index=False keeps the DataFrame index out of the saved file.
df_cleaned.to_csv(CLEANED_CSV_PATH, index=False)
print(f"✅ Cleaned data saved successfully to '{CLEANED_CSV_PATH}'.")


✅ Cleaned data saved successfully to '../docs/cleaned_data.csv'.


## Connect to Database

In [11]:
from scripts.database_setup import get_db_connection, create_table, insert_data


In [12]:
# Obtain the database handle from the project's get_db_connection helper
# (imported from scripts.database_setup).
# NOTE(review): presumably a SQLAlchemy engine for PostgreSQL, going by the
# variable name and the section heading — confirm; the connection settings are
# not visible in this notebook.
engine = get_db_connection()

###  Create Table in PostgreSQL

In [13]:
# Create the target table through the project's create_table helper, using the
# engine obtained in the previous cell.
# NOTE(review): whether this is safe to re-run (e.g. CREATE TABLE IF NOT EXISTS)
# depends on the helper — confirm.
create_table(engine)