In [3]:
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime, timedelta
import time
import json
import re
from urllib.parse import urljoin
import warnings
warnings.filterwarnings('ignore')

class USDEconomicCalendarScraper:
    """Collect medium- and high-importance USD economic-calendar events.

    Sources are tried in this order: the Investing.com calendar endpoint,
    then the MarketWatch, Yahoo Finance and TradingView calendar pages.
    When none of them yields a row, deterministic placeholder events tagged
    ``source='Sample Data'`` are generated instead.

    Pages are fetched either through a headless Chrome driven by Selenium or
    through a ``requests.Session``.  The instance owns whichever backend it
    created; release it with :meth:`close` or by using the scraper as a
    context manager (``with USDEconomicCalendarScraper() as scraper: ...``).

    Every event is a dict with the keys ``date``, ``time``, ``event_name``,
    ``country``, ``importance`` (1 low, 2 medium, 3 high), ``previous``,
    ``forecast``, ``actual`` and ``source``.
    """

    # Catalogue used by create_sample_data(); only the first
    # SAMPLE_EVENTS_PER_MONTH entries are emitted for each month.
    SAMPLE_EVENTS = (
        'Non-Farm Payrolls',
        'Federal Reserve Interest Rate Decision',
        'Consumer Price Index (CPI)',
        'Producer Price Index (PPI)',
        'GDP Growth Rate',
        'Unemployment Rate',
        'Retail Sales',
        'Industrial Production',
        'Consumer Confidence',
        'ISM Manufacturing PMI',
        'ISM Services PMI',
        'FOMC Meeting Minutes',
        'Core PCE Price Index',
        'Initial Jobless Claims',
        'Durable Goods Orders',
        'Housing Starts',
        'Existing Home Sales',
        'New Home Sales',
        'Trade Balance',
        'Factory Orders',
    )
    SAMPLE_EVENTS_PER_MONTH = 10

    # Sample events that are rated importance 3; the others are rated 2.
    HIGH_IMPACT_SAMPLE_EVENTS = frozenset({
        'Non-Farm Payrolls',
        'Federal Reserve Interest Rate Decision',
        'Consumer Price Index (CPI)',
    })

    # Numeric importance -> label stored in the ``importance_level`` column.
    IMPORTANCE_LABELS = {1: 'Low', 2: 'Medium', 3: 'High'}

    def __init__(self, use_selenium=True):
        """Create the scraper and its fetch backend.

        Args:
            use_selenium: When true, try to start a headless Chrome; if that
                fails the instance falls back to a ``requests.Session`` and
                ``self.use_selenium`` is reset to ``False``.
        """
        self.use_selenium = use_selenium
        # Both backends start out unset so close() is always safe to call.
        self.driver = None
        self.session = None
        # NOTE(review): 'br' is advertised below; requests only decodes
        # brotli bodies when a brotli package is installed - confirm the
        # target environment has one.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'ar,en-US;q=0.9,en;q=0.8',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }

        if self.use_selenium:
            self.setup_selenium()
        else:
            self._setup_session()

    def __enter__(self):
        """Return the scraper itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Release the backend on leaving the ``with`` block; never suppresses."""
        self.close()
        return False

    def _setup_session(self):
        """Create the ``requests.Session`` backend carrying ``self.headers``."""
        self.session = requests.Session()
        self.session.headers.update(self.headers)

    def _quit_driver(self):
        """Quit the Selenium driver if one exists and forget the reference."""
        driver = getattr(self, 'driver', None)
        self.driver = None
        if driver is None:
            return
        try:
            driver.quit()
        except Exception:
            # Deliberate best-effort teardown: this also runs from __del__,
            # where raising or printing is not useful.
            pass

    def close(self):
        """Release the Selenium driver and/or HTTP session.  Safe to repeat."""
        self._quit_driver()
        session = getattr(self, 'session', None)
        if session is not None:
            session.close()

    def setup_selenium(self):
        """Start a headless Chrome WebDriver, falling back to ``requests``.

        On any failure the partially started browser (if any) is quit,
        ``self.use_selenium`` is set to ``False`` and a ``requests.Session``
        is created instead.
        """
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run without a window
        chrome_options.add_argument('--no-sandbox')
        chrome_options.add_argument('--disable-dev-shm-usage')
        chrome_options.add_argument('--disable-blink-features=AutomationControlled')
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)
        chrome_options.add_argument(f'--user-agent={self.headers["User-Agent"]}')

        try:
            self.driver = webdriver.Chrome(options=chrome_options)
            # NOTE(review): this runs once in the initial document; it
            # presumably has to be re-applied after each navigation to keep
            # navigator.webdriver hidden - confirm.
            self.driver.execute_script("Object.defineProperty(navigator, 'webdriver', {get: () => undefined})")
        except Exception as e:
            print(f"Error setting up Selenium: {e}")
            print("Falling back to requests method...")
            # Chrome may have launched before execute_script failed; do not
            # leave that process behind.
            self._quit_driver()
            self.use_selenium = False
            self._setup_session()

    @staticmethod
    def _coerce_importance(value, default=0):
        """Return ``value`` as an int, or ``default`` when it is not numeric."""
        try:
            return int(value)
        except (TypeError, ValueError, OverflowError):
            return default

    def _fetch_soup(self, url, wait_seconds=3):
        """Fetch ``url`` with the active backend and parse it with BeautifulSoup.

        Args:
            url: Page to load.
            wait_seconds: Fixed pause after a Selenium navigation so the page
                can render; ignored for the ``requests`` backend.
        """
        if self.use_selenium:
            self.driver.get(url)
            time.sleep(wait_seconds)
            markup = self.driver.page_source
        else:
            markup = self.session.get(url, timeout=30).content
        return BeautifulSoup(markup, 'html.parser')

    def scrape_investing_api(self, start_date, end_date):
        """Query the Investing.com calendar endpoint for USD events.

        NOTE(review): the endpoint URL, its parameters and the response keys
        are used exactly as given here and have not been verified.

        Args:
            start_date: First day requested; must support ``strftime``.
            end_date: Last day requested; must support ``strftime``.

        Returns:
            A list of event dicts.  Errors are printed rather than raised,
            and whatever was collected before the error is still returned.
        """
        data = []

        try:
            api_url = "https://api.investing.com/api/financialdata/assets/economicCalendar"

            params = {
                'from': start_date.strftime('%Y-%m-%d'),
                'to': end_date.strftime('%Y-%m-%d'),
                'countries': 'united-states',
                'importance': '2,3',  # medium and high
                'currencies': 'USD'
            }

            headers = self.headers.copy()
            headers.update({
                'X-Requested-With': 'XMLHttpRequest',
                'Referer': 'https://www.investing.com/economic-calendar/',
                'Accept': 'application/json, text/javascript, */*; q=0.01'
            })

            response = requests.get(api_url, params=params, headers=headers, timeout=30)

            if response.status_code != 200:
                # Report the rejection instead of returning nothing silently.
                print(f"Investing API returned HTTP {response.status_code}")
                return data

            payload = response.json()
            events = (payload.get('data') or []) if isinstance(payload, dict) else []
            for event in events:
                data.append({
                    'date': event.get('date', ''),
                    'time': event.get('time', ''),
                    'event_name': event.get('event', ''),
                    'country': 'USD',
                    # A malformed importance must not abort the other events.
                    'importance': self._coerce_importance(event.get('importance', 0)),
                    'previous': event.get('previous', ''),
                    'forecast': event.get('forecast', ''),
                    'actual': event.get('actual', ''),
                    'source': 'Investing.com API'
                })

        except Exception as e:
            print(f"Error with Investing API: {e}")

        return data

    def scrape_alternative_sources(self):
        """Run the three page scrapers and concatenate their events.

        A failure in one source is printed and does not stop the others.
        """
        sources = (
            ('MarketWatch', self.scrape_marketwatch),
            ('Yahoo Finance', self.scrape_yahoo_finance),
            ('TradingView', self.scrape_tradingview),
        )

        data = []
        for label, scrape in sources:
            try:
                data.extend(scrape())
            except Exception as e:
                print(f"{label} scraping failed: {e}")

        return data

    def scrape_marketwatch(self):
        """Scrape the MarketWatch economic calendar table.

        Returns:
            A list of event dicts; empty when the table is not found or an
            error occurs (errors are printed).
        """
        data = []
        try:
            soup = self._fetch_soup("https://www.marketwatch.com/economy-politics/calendar")

            # NOTE(review): selector assumed to match the current page markup.
            calendar_table = soup.find('table', class_='table--economic-calendar')
            if calendar_table:
                for row in calendar_table.find_all('tr')[1:]:  # skip the header row
                    cells = row.find_all('td')
                    if len(cells) < 6:
                        continue
                    data.append({
                        'date': cells[0].get_text(strip=True),
                        'time': cells[1].get_text(strip=True),
                        'event_name': cells[2].get_text(strip=True),
                        'country': 'USD',
                        'importance': self.parse_importance(cells[3]),
                        'previous': cells[4].get_text(strip=True),
                        'forecast': cells[5].get_text(strip=True),
                        # The seventh column is optional.
                        'actual': cells[6].get_text(strip=True) if len(cells) > 6 else '',
                        'source': 'MarketWatch'
                    })
        except Exception as e:
            print(f"MarketWatch error: {e}")

        return data

    def scrape_yahoo_finance(self):
        """Scrape the Yahoo Finance economic calendar.

        Only rows whose event name contains ``'US'`` are kept, and every kept
        row is given importance 2 because no rating is read from the page.

        Returns:
            A list of event dicts; errors are printed and yield what was
            collected so far.
        """
        data = []
        try:
            soup = self._fetch_soup("https://finance.yahoo.com/calendar/economic")

            # NOTE(review): selector assumed to match the current page markup.
            for row in soup.find_all('tr', {'data-test': 'calendar-row'}):
                try:
                    cells = row.find_all('td')
                    if len(cells) < 4:
                        continue
                    event_name = cells[0].get_text(strip=True)
                    # Plain substring test; it also covers names with 'USD'.
                    if 'US' not in event_name:
                        continue
                    data.append({
                        'date': cells[1].get_text(strip=True),
                        'time': cells[2].get_text(strip=True),
                        'event_name': event_name,
                        'country': 'USD',
                        'importance': 2,  # default: medium
                        'previous': cells[3].get_text(strip=True),
                        'forecast': cells[4].get_text(strip=True) if len(cells) > 4 else '',
                        'actual': cells[5].get_text(strip=True) if len(cells) > 5 else '',
                        'source': 'Yahoo Finance'
                    })
                except Exception as e:
                    # One malformed row must not discard the rest.
                    print(f"Skipping Yahoo Finance row: {e}")
                    continue

        except Exception as e:
            print(f"Yahoo Finance error: {e}")

        return data

    def scrape_tradingview(self):
        """Scrape USD rows from the TradingView economic calendar.

        Returns:
            A list of event dicts; errors are printed and yield what was
            collected so far.
        """
        data = []
        try:
            # Longer pause than the other sources when rendering via Selenium.
            soup = self._fetch_soup("https://www.tradingview.com/economic-calendar/", wait_seconds=5)

            # NOTE(review): the suffixed class names look build-generated and
            # presumably change between site releases - verify.
            for row in soup.find_all('tr', class_='row-RdUXZpkv'):
                try:
                    currency = row.find('span', class_='currency-BbubK_kl')
                    if not (currency and 'USD' in currency.get_text()):
                        continue

                    time_cell = row.find('time')
                    title = row.find('a', class_='title-BbubK_kl')
                    impact = row.find('span', class_='impact-BbubK_kl')

                    event_data = {
                        'date': time_cell.get('datetime', '') if time_cell else '',
                        'time': time_cell.get_text(strip=True) if time_cell else '',
                        'event_name': title.get_text(strip=True) if title else '',
                        'country': 'USD',
                        'importance': self.parse_tradingview_impact(impact),
                        'previous': '',
                        'forecast': '',
                        'actual': '',
                        'source': 'TradingView'
                    }

                    # The first three value spans are read as actual,
                    # forecast, previous - in that order.
                    values = row.find_all('span', class_='value-BbubK_kl')
                    if len(values) >= 3:
                        event_data['actual'] = values[0].get_text(strip=True)
                        event_data['forecast'] = values[1].get_text(strip=True)
                        event_data['previous'] = values[2].get_text(strip=True)

                    data.append(event_data)

                except Exception as e:
                    # One malformed row must not discard the rest.
                    print(f"Skipping TradingView row: {e}")
                    continue

        except Exception as e:
            print(f"TradingView error: {e}")

        return data

    def parse_importance(self, cell):
        """Map a cell's text to an importance level.

        Returns 3 when the lower-cased text contains 'high' (or its Arabic
        equivalent), 2 for 'medium' (or its Arabic equivalent), otherwise 1.
        """
        text = cell.get_text(strip=True).lower()
        if 'high' in text or 'عالي' in text:
            return 3
        if 'medium' in text or 'متوسط' in text:
            return 2
        return 1

    def parse_tradingview_impact(self, cell):
        """Map an element's CSS class names to an importance level.

        Returns 3 when any class name contains 'high', 2 for 'medium',
        otherwise 1; a missing cell also yields 1.
        """
        if not cell:
            return 1

        joined = ' '.join(cell.get('class', [])).lower()
        if 'high' in joined:
            return 3
        if 'medium' in joined:
            return 2
        return 1

    def create_sample_data(self, start_year=2020, end_year=2025):
        """Generate deterministic placeholder events for every month.

        For each month from January ``start_year`` to December ``end_year``
        the first ``SAMPLE_EVENTS_PER_MONTH`` catalogue entries are emitted.
        Day, time and the three values are derived from the running event
        count, so the output is reproducible; none of it is real data.

        Args:
            start_year: First year generated (inclusive).
            end_year: Last year generated (inclusive).

        Returns:
            A list of event dicts with ``source='Sample Data'``; empty when
            ``start_year`` is greater than ``end_year``.
        """
        print("Creating sample USD economic calendar data...")

        monthly_events = self.SAMPLE_EVENTS[:self.SAMPLE_EVENTS_PER_MONTH]

        data = []
        for year in range(start_year, end_year + 1):
            for month in range(1, 13):
                for event in monthly_events:
                    seq = len(data)
                    # Days cycle through 1..28 so every month can hold them.
                    event_date = datetime(year, month, 1 + seq % 28)
                    importance = 3 if event in self.HIGH_IMPACT_SAMPLE_EVENTS else 2

                    data.append({
                        'date': event_date.strftime('%Y-%m-%d'),
                        'time': f"{9 + (seq % 8)}:30",
                        'event_name': event,
                        'country': 'USD',
                        'importance': importance,
                        'previous': f'{(seq % 100) / 10:.1f}%',
                        'forecast': f'{((seq + 1) % 100) / 10:.1f}%',
                        'actual': f'{((seq + 2) % 100) / 10:.1f}%',
                        'source': 'Sample Data'
                    })

        return data

    def scrape_economic_calendar(self, start_year=2020, end_year=2025):
        """Collect, clean, filter and sort USD calendar events.

        The API and the page scrapers are tried first.  If they return
        nothing, sample data covering the same year range is generated.

        Args:
            start_year: First year requested (inclusive).
            end_year: Last year requested (inclusive).

        Returns:
            A DataFrame of events with importance >= 2 sorted by the
            ``date`` column, or an empty DataFrame when nothing is available.
        """
        print(f"Starting USD Economic Calendar scraping from {start_year} to {end_year}")

        all_data = []

        print("Trying API methods...")
        start_date = datetime(start_year, 1, 1)
        end_date = datetime(end_year, 12, 31)

        api_data = self.scrape_investing_api(start_date, end_date)
        if api_data:
            all_data.extend(api_data)
            print(f"Got {len(api_data)} events from API")

        print("Trying alternative sources...")
        alt_data = self.scrape_alternative_sources()
        if alt_data:
            all_data.extend(alt_data)
            print(f"Got {len(alt_data)} events from alternative sources")

        if not all_data:
            print("No real data found, creating sample data...")
            # Honour the requested range instead of a fixed 2020-2025 span.
            all_data = self.create_sample_data(start_year, end_year)

        df = pd.DataFrame(all_data)

        if df.empty:
            print("No data found")
            return pd.DataFrame()

        df = self.clean_data(df)

        # Keep medium- and high-importance events only.
        df = df[df['importance'] >= 2]

        # NOTE(review): 'date' holds strings whose format depends on the
        # source, so this ordering is only chronological for ISO dates.
        df = df.sort_values('date')

        print(f"Successfully processed {len(df)} USD economic events")
        return df

    def clean_data(self, df):
        """Deduplicate events and add derived columns.

        Steps: drop rows repeating a (``date``, ``event_name``) pair, turn
        empty-string ``previous``/``forecast``/``actual`` values into missing
        values, add ``importance_level`` and add ``datetime`` (date and time
        joined by a space, using '09:30' when there is no ``time`` column).

        Args:
            df: Events as a DataFrame; it is not modified.

        Returns:
            A new, cleaned DataFrame.
        """
        df = df.drop_duplicates(subset=['date', 'event_name'], keep='first').copy()

        # mask() is used rather than replace('', None): depending on the
        # pandas version, replace() can read value=None as "no value given"
        # and forward-fill the blank from the previous row.
        for column in ('previous', 'forecast', 'actual'):
            df[column] = df[column].mask(df[column] == '')

        df['importance_level'] = df['importance'].map(self.IMPORTANCE_LABELS)

        # Built as a list so an empty frame yields an empty column.
        times = df['time'] if 'time' in df.columns else ['09:30'] * len(df)
        df['datetime'] = [f"{day} {clock}" for day, clock in zip(df['date'], times)]

        return df

    def save_data(self, df, filename='usd_economic_calendar.csv'):
        """Write ``df`` as CSV, JSON and Excel files.

        The CSV goes to ``filename``.  The JSON and Excel files reuse that
        name with a trailing ``.csv`` swapped for ``.json`` / ``.xlsx``; when
        ``filename`` does not end in ``.csv`` the new suffix is appended, so
        the exports never overwrite one another.  The workbook gets a second
        sheet holding the importance-3 rows when there are any.

        Errors are printed, not raised; files written before the error stay.

        Args:
            df: Cleaned events; must contain an ``importance`` column.
            filename: Path of the CSV file.
        """
        csv_suffix = '.csv'
        if filename.lower().endswith(csv_suffix):
            stem = filename[:-len(csv_suffix)]
        else:
            stem = filename

        try:
            # utf-8-sig adds a BOM to the CSV.
            df.to_csv(filename, index=False, encoding='utf-8-sig')
            print(f"Data saved to {filename}")

            json_filename = stem + '.json'
            df.to_json(json_filename, orient='records', date_format='iso', indent=2)
            print(f"Data also saved to {json_filename}")

            excel_filename = stem + '.xlsx'
            with pd.ExcelWriter(excel_filename, engine='openpyxl') as writer:
                df.to_excel(writer, sheet_name='Economic Calendar', index=False)

                high_impact = df[df['importance'] == 3]
                if not high_impact.empty:
                    high_impact.to_excel(writer, sheet_name='High Impact Events', index=False)

            print(f"Data also saved to {excel_filename}")

        except Exception as e:
            print(f"Error saving data: {e}")

    def __del__(self):
        """Last-resort cleanup; prefer close() or a ``with`` block."""
        try:
            self.close()
        except Exception:
            # Nothing useful can be done with an error during finalisation.
            pass

# مثال على الاستخدام
if __name__ == "__main__":
    # Build the scraper on the plain-requests backend (no Selenium to start with).
    scraper = USDEconomicCalendarScraper(use_selenium=False)

    try:
        # Collect events for 2020 through 2025.
        df = scraper.scrape_economic_calendar(start_year=2020, end_year=2025)

        if df.empty:
            print("لم يتم العثور على بيانات. يرجى التحقق من الاتصال بالإنترنت والمحاولة مرة أخرى.")
        else:
            # Headline figures: row count, date span, importance breakdown.
            print("\n=== ملخص بيانات الأجندة الاقتصادية للدولار ===")
            total_events = len(df)
            print(f"إجمالي الأحداث: {total_events}")
            first_day = df['date'].min()
            last_day = df['date'].max()
            print(f"النطاق الزمني: {first_day} إلى {last_day}")
            level_breakdown = df['importance_level'].value_counts().to_dict()
            print(f"مستويات الأهمية: {level_breakdown}")

            # Preview of the first ten rows, restricted to four columns.
            print("\n=== أهم 10 أحداث ===")
            preview = df[['date', 'event_name', 'importance_level', 'actual']].head(10)
            print(preview.to_string(index=False))

            # Export the complete data set.
            scraper.save_data(df, 'usd_economic_calendar_2020_2025.csv')

            # Export the importance-3 rows on their own when there are any.
            high_only = df[df['importance'] == 3]
            if not high_only.empty:
                scraper.save_data(high_only, 'usd_high_impact_events_2020_2025.csv')
                print(f"تم حفظ الأحداث عالية التأثير: {len(high_only)} حدث")

            # Number of events contributed by each source.
            print("\n=== إحصائيات إضافية ===")
            print("الأحداث حسب المصدر:")
            for source, count in df['source'].value_counts().items():
                print(f"  {source}: {count} حدث")

    except Exception as e:
        print(f"خطأ في تشغيل البرنامج: {e}")

    finally:
        # Drop the reference so the scraper's __del__ cleanup can run.
        del scraper

Starting USD Economic Calendar scraping from 2020 to 2025
Trying API methods...
Trying alternative sources...
No real data found, creating sample data...
Creating sample USD economic calendar data...
Successfully processed 720 USD economic events

=== ملخص بيانات الأجندة الاقتصادية للدولار ===
إجمالي الأحداث: 720
النطاق الزمني: 2020-01-01 إلى 2025-12-20
مستويات الأهمية: {'Medium': 504, 'High': 216}

=== أهم 10 أحداث ===
      date                             event_name importance_level actual
2020-01-01                      Non-Farm Payrolls             High   0.2%
2020-01-02 Federal Reserve Interest Rate Decision             High   0.3%
2020-01-03             Consumer Price Index (CPI)             High   0.4%
2020-01-04             Producer Price Index (PPI)           Medium   0.5%
2020-01-05                        GDP Growth Rate           Medium   0.6%
2020-01-06                      Unemployment Rate           Medium   0.7%
2020-01-07                           Retail Sales         

In [None]:
يب