# PART 1: SENTIMENT ANALYSIS

In [12]:
import json
import os
import re
import warnings
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from fuzzywuzzy import fuzz
from plotly.subplots import make_subplots
from wordcloud import WordCloud
warnings.filterwarnings('ignore')

# Set style for better visualizations
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load the CSV dataset; everything below operates on ``df``.
try:
    df = pd.read_csv('x.csv')
except FileNotFoundError:
    print("Error: 'x.csv' not found. Please provide the correct file path.")
    # Stop with a non-zero status so callers/CI see the failure. The bare
    # ``exit()`` helper reports success (status 0) and is only installed by
    # the ``site`` module, so it is not guaranteed to exist everywhere.
    raise SystemExit(1)

# Keyword lists used by classify_sentiment().
# Each list is scanned in the order written and the first matching entry is
# the one reported, so ordering inside a list affects the reported keyword.
# The entries mix English with non-English terms (presumably Kinyarwanda and
# some French, e.g. 'panne' -- TODO confirm with the data owner).

# Terms that mark a post as positive (thanks, confirmation of a fix).
positive_keywords = [
    'murakoze', 'thank you', 'mwakoze', 'kudufasha', 'power is back', 'resolved', 'gushima',
    'fixed', 'restored', 'solved', 'byakozwe', 'amashanyarazi yagarutse'
]
# Terms that mark a post as negative (outages, faults, complaints).
# For user posts these are checked before the positive and neutral lists.
negative_keywords = [
    'wabuze', 'outage', 'problem', 'issue', 'ikibazo', 'ibibazo', 'mbi',
    'bad service', 'no power', 'low voltage', 'kizima', 'gucikagurika', 'delay',
    'faulty', 'not working', 'frustration', 'ubujura', 'power cut', 'turaheba',
    'biranze', 'birakomeye', 'byagoranye', 'muke', 'byanze', 'ntibikunda',
    'ntibikora', 'biragoye', 'guhagarara', 'byatewe niki', 'isaha ishize',
    'hashize umwanya', 'hari ibura', 'ntacyo bigeze', 'no electricity',
    'without power', 'several hours', 'ntiyakunze', 'gufite ikibazo'
]
# Terms that mark a post as neutral (questions, greetings, generic topic words).
# Note that 'panne', 'cashpower', 'token' and 'damaged' also appear in `issues`.
neutral_keywords = [
    'ese', 'ryari', 'when', 'where', 'how', 'update', 'question', 'menya', 'kuki',
    'a handi', 'byagenze gute', 'habaye iki', 'bizakemuka', 'ni gute', 'a ho',
    'twegereje', 'turagutegereza', 'kindly assist', 'mwaramutse', 'mwiriwe',
    'turacyategereje', 'please assist', 'what happened', 'panne', 'electricity',
    'cashpower', 'token', 'damaged'
]

# Location and issue lists
# `locations` serves three purposes in this script: a last-resort "neutral"
# signal in classify_sentiment(), part of the relevance filter on the
# dataframe, and the set of place names tallied for the location charts.
locations = [
    'bugesera', 'kirehe', 'kicukiro', 'huye', 'rwamagana', 'kanombe', 'kimironko',
    'musanze', 'nyarugenge', 'rubavu', 'nyamirama', 'rusatira', 'kiruhura', 'gikondo',
    'kabutare', 'nyagahinga', 'rusororo', 'kagarama', 'kibungo', 'kayonza', 'karongi',
    'rubengera', 'muyumbu', 'nyamabuye', 'rwanfro', 'bumbogo', 'musave', 'rugando',
    'nyakabanda', 'nyacyonga', 'kisimenti', 'rubirizi', 'nyundo', 'terimbere', 'keya',
    'gahara', 'murehe', 'dagaza', 'nyamata', 'gasenga', 'ruhuha', 'nyarukombe', 'rubona'
]
# `issues` is only used for counting whole-word mentions in user posts; it
# does not take part in sentiment classification.
issues = [
    'outage', 'low voltage', 'cashpower', 'token', 'umuriro', 'ikibazo', 'ibibazo',
    'power cut', 'transformer', 'technical issue', 'not working', 'damaged', 'delay',
    'kizima', 'gucikagurika', 'faulty', 'ubujura', 'line', 'meter', 'panne'
]

def preprocess_text(text):
    """Normalise free text for keyword matching.

    Lower-cases the text, replaces every character that is neither a word
    character (letter, digit, underscore) nor whitespace with a space, then
    collapses whitespace runs to a single space and trims both ends.

    Args:
        text: The raw text. Anything that is not a ``str`` (``None``, NaN,
            numbers, ...) is treated as missing.

    Returns:
        The normalised string, or ``''`` for missing input.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'[^\w\s]', ' ', text.lower())
    # Collapse whitespace *after* punctuation has been replaced; doing it
    # before leaves double/trailing spaces ("thank, you" -> "thank  you")
    # and multi-word keywords such as "thank you" then fail to match.
    return re.sub(r'\s+', ' ', text).strip()

def fuzzy_match(keyword, text, threshold=75):
    """Report whether *keyword* occurs in *text*, exactly or approximately.

    A keyword matches when it is a substring of *text*, or when its
    ``fuzz.ratio`` similarity to any single whitespace-separated word of
    *text* is strictly greater than *threshold*.

    Args:
        keyword: The term to look for (may contain spaces; multi-word terms
            can in practice only match through the substring test because
            the fuzzy comparison is made word by word).
        text: The already-normalised text to search.
        threshold: Minimum ``fuzz.ratio`` score (exclusive) for a fuzzy hit.

    Returns:
        ``(True, keyword)`` on a match, otherwise ``(False, None)``.
    """
    # An empty keyword is a substring of every string; never treat it as a hit.
    if not keyword:
        return False, None
    # Cheap exact test first; the outcome is the same as testing it last.
    if keyword in text:
        return True, keyword
    for word in text.split():
        if fuzz.ratio(keyword, word) > threshold:
            return True, keyword
    return False, None

def _first_match(keywords, text):
    """Return the first entry of *keywords* that fuzzy-matches *text*, else None."""
    for kw in keywords:
        if fuzzy_match(kw, text)[0]:
            return kw
    return None


def classify_sentiment(text, is_reg_response=False):
    """Classify a post as 'positive', 'negative' or 'neutral' by keyword lookup.

    Args:
        text: Raw or already-normalised post text (it is normalised again
            with ``preprocess_text``).
        is_reg_response: ``True`` when the post is a reply from the
            ``@reg_rwanda`` account; such replies use different rules.

    Returns:
        A ``(sentiment, matched_keywords)`` tuple where ``matched_keywords``
        is a list of human-readable ``"<sentiment>: <keyword>"`` strings
        explaining the decision.

    Rules:
        * Replies: positive if a positive keyword matches and no negative
          keyword does; negative if a negative keyword matches and none of
          the polite/follow-up markers is present; neutral otherwise.
        * User posts: negative keywords win, then positive, then neutral
          keywords, then a bare location mention (neutral), else neutral.
    """
    text = preprocess_text(text)

    # The negative scan does not depend on which positive keyword is being
    # tested, so run it once instead of once per matching positive keyword.
    first_negative = _first_match(negative_keywords, text)

    if is_reg_response:
        if first_negative is None:
            first_positive = _first_match(positive_keywords, text)
            if first_positive is not None:
                return 'positive', [f"positive: {first_positive}"]
        elif all(k not in text for k in ['murakoze', 'mwiriwe', 'kugira ngo', 'tukabikurikirana']):
            # A negative term without any polite/follow-up marker.
            return 'negative', [f"negative: {first_negative}"]
        return 'neutral', ['neutral: reg_response']

    if first_negative is not None:
        matched_keywords = [f"negative: {first_negative}"]
        # 'cyane' next to one of these terms is recorded as an extra tag.
        if 'cyane' in text and _first_match(['mbi', 'ntibikunda', 'biragoye'], text) is not None:
            matched_keywords.append('negative: mbi cyane')
        return 'negative', matched_keywords

    # No negative keyword matched from here on, so a positive hit needs no
    # further negative check.
    first_positive = _first_match(positive_keywords, text)
    if first_positive is not None:
        return 'positive', [f"positive: {first_positive}"]

    first_neutral = _first_match(neutral_keywords, text)
    if first_neutral is not None:
        return 'neutral', [f"neutral: {first_neutral}"]

    first_location = _first_match(locations, text)
    if first_location is not None:
        return 'neutral', [f"neutral: {first_location} (location-only)"]

    return 'neutral', ['neutral: no match']

# Build one text field per row from the two scraped text columns
# (which column holds what is not visible here -- presumably tweet body and
# a secondary text field; verify against the CSV).
first_text_col, second_text_col = 'css-1jxf684 5', 'css-1jxf684 4'
df['raw_text'] = df[first_text_col].fillna('') + ' ' + df[second_text_col].fillna('')
df['combined_text'] = df['raw_text'].apply(preprocess_text)

# Drop rows that repeat the same account / text / link combination
df = df.drop_duplicates(subset=['css-1jxf684 2', 'raw_text', 'css-146c3p1 href'])

# Rows matching any of these phrases or regex fragments are discarded
irrelevant_phrases = [
    'show more replies', 'who to follow', 'click to follow', 'official twitter',
    'ministry of', 'rwandair', 'national carrier', 'minisiteri', r'\d{2}/\d{2}/\d{4}',
    r'\[\d{4}-\d{4}-\d{4}-\d{4}-\d{4}\]', 'konteri', 'abdallah eneya',
    r'^\d+\s', r'^\s*@[\w\s]+$'
]
irrelevant_pattern = '|'.join(irrelevant_phrases)
is_irrelevant = df['raw_text'].str.contains(irrelevant_pattern, case=False, na=False)
df = df[~is_irrelevant]

# Keep only rows with at least seven whitespace-separated words
df['word_count'] = df['raw_text'].apply(lambda raw: len(str(raw).split()))
df = df[df['word_count'] >= 7]

# Keep only rows that mention at least one known keyword or location
known_terms = negative_keywords + positive_keywords + neutral_keywords + locations


def _mentions_known_term(clean_text):
    """Return True when any known keyword/location fuzzy-matches the text."""
    return any(fuzzy_match(term, clean_text)[0] for term in known_terms)


df['has_content'] = df['combined_text'].apply(_mentions_known_term)
df = df[df['has_content']]

# Split the rows by author: replies from @reg_rwanda versus everyone else
user_posts = df[df['css-1jxf684 2'] != '@reg_rwanda'].copy()
reg_posts = df[df['css-1jxf684 2'] == '@reg_rwanda'].copy()

# Classify each group with its own rule set; keep the full
# (sentiment, matched_keywords) result and the bare label side by side
user_posts['sentiment_result'] = user_posts['combined_text'].apply(
    lambda clean_text: classify_sentiment(clean_text, is_reg_response=False)
)
user_posts['sentiment'] = user_posts['sentiment_result'].apply(lambda result: result[0])
reg_posts['sentiment_result'] = reg_posts['combined_text'].apply(
    lambda clean_text: classify_sentiment(clean_text, is_reg_response=True)
)
reg_posts['sentiment'] = reg_posts['sentiment_result'].apply(lambda result: result[0])


def _as_percentages(counts, total):
    """Convert a label -> count mapping into label -> percentage of *total*."""
    return {
        label: (n_posts / total * 100) if total > 0 else 0
        for label, n_posts in counts.items()
    }


# Sentiment distribution per group, as counts and as percentages
user_sentiment_counts = Counter(user_posts['sentiment'])
total_user_posts = sum(user_sentiment_counts.values())
user_sentiment_percentages = _as_percentages(user_sentiment_counts, total_user_posts)

reg_sentiment_counts = Counter(reg_posts['sentiment'])
total_reg_posts = sum(reg_sentiment_counts.values())
reg_sentiment_percentages = _as_percentages(reg_sentiment_counts, total_reg_posts)

# Tally whole-word mentions of every known location and issue in user posts.
# A post counts at most once per term, however often the term occurs in it.
location_counts = Counter()
issue_counts = Counter()
location_patterns = [
    (loc, re.compile(r'\b' + re.escape(loc) + r'\b', re.IGNORECASE)) for loc in locations
]
issue_patterns = [
    (issue, re.compile(r'\b' + re.escape(issue) + r'\b', re.IGNORECASE)) for issue in issues
]
for post_text in user_posts['combined_text']:
    for loc, loc_pattern in location_patterns:
        if loc_pattern.search(post_text):
            location_counts[loc] += 1
    for issue, issue_pattern in issue_patterns:
        if issue_pattern.search(post_text):
            issue_counts[issue] += 1

def _print_distribution(counts, percentages):
    """Print one 'Label: N posts (P%)' line per sentiment label."""
    for label, n_posts in counts.items():
        print(f"{label.capitalize()}: {n_posts} posts ({percentages[label]:.1f}%)")


def _print_mentions(counter):
    """Print one '- Name: N mentions' line per entry, most frequent first."""
    for name, n_mentions in counter.most_common():
        print(f"- {name.capitalize()}: {n_mentions} mentions")


# Text report: sentiment per group, then location and issue tallies
print("=== Sentiment Analysis Results ===")
print(f"\nTotal User Posts: {total_user_posts}")
print("User Posts Sentiment Distribution:")
_print_distribution(user_sentiment_counts, user_sentiment_percentages)
print(f"\nTotal @reg_rwanda Responses: {total_reg_posts}")
print("@reg_rwanda Responses Sentiment Distribution:")
_print_distribution(reg_sentiment_counts, reg_sentiment_percentages)
print("\nCommon Locations Mentioned:")
_print_mentions(location_counts)
print("\nCommon Issues Mentioned:")
_print_mentions(issue_counts)

# ============== VISUALIZATIONS ==============

# The charts are written into results/; make sure the directory exists
# before the first write_image call so a fresh checkout does not fail.
os.makedirs('results', exist_ok=True)


def _sentiment_pie(labels, values, color_by_sentiment, trace_name, title):
    """Build a sentiment pie chart with one fixed colour per sentiment label.

    Colours are looked up by label rather than by position: the label order
    comes from a Counter (first-seen order in the data), so positional
    colours would change meaning from run to run.
    """
    slice_colors = [color_by_sentiment.get(label, '#B2BEC3') for label in labels]
    figure = go.Figure()
    figure.add_trace(
        go.Pie(labels=labels, values=values,
               marker_colors=slice_colors, name=trace_name,
               textinfo='label+percent', textposition='inside')
    )
    figure.update_layout(
        title=title,
        title_x=0.5,
        height=400,
        font=dict(size=12)
    )
    return figure


# 1. User Sentiment Distribution (Individual Pie Chart)
user_sentiments = list(user_sentiment_counts.keys())
user_values = list(user_sentiment_counts.values())
colors_user = {'negative': '#FF6B6B', 'positive': '#4ECDC4', 'neutral': '#45B7D1'}

fig1 = _sentiment_pie(user_sentiments, user_values, colors_user,
                      "User Posts", 'User Posts Sentiment Distribution')

# Save as PNG
fig1.write_image('results/user_sentiment_distribution.png', width=800, height=400, scale=2)

# 2. REG Response Sentiment Distribution (Individual Pie Chart)
reg_sentiments = list(reg_sentiment_counts.keys())
reg_values = list(reg_sentiment_counts.values())
colors_reg = {'positive': '#96CEB4', 'neutral': '#FECA57', 'negative': '#FF9FF3'}

fig2 = _sentiment_pie(reg_sentiments, reg_values, colors_reg,
                      "REG Responses", 'REG Rwanda Response Sentiment')

# Save as PNG
fig2.write_image('results/reg_sentiment_distribution.png', width=800, height=400, scale=2)

def _top_mentions_bar(top_items, trace_name, bar_color, title, x_title):
    """Build a bar chart from (name, count) pairs; empty input gives an empty chart."""
    figure = go.Figure()
    if top_items:
        bar_labels = [name.capitalize() for name, _ in top_items]
        bar_heights = [n_mentions for _, n_mentions in top_items]
        figure.add_trace(
            go.Bar(x=bar_labels, y=bar_heights, name=trace_name,
                   marker_color=bar_color, text=bar_heights, textposition='auto')
        )
    figure.update_layout(
        title=title,
        title_x=0.5,
        xaxis_title=x_title,
        yaxis_title='Number of Mentions',
        height=400,
        font=dict(size=12),
        xaxis_tickangle=45
    )
    return figure


# 3. Top Locations Mentioned (Individual Bar Chart) -- ten most frequent
top_locations = location_counts.most_common(10)
fig3 = _top_mentions_bar(top_locations, "Locations", '#6C5CE7',
                         'Top Locations Mentioned', 'Location')

# Save as PNG
fig3.write_image('results/top_locations.png', width=800, height=400, scale=2)

# 4. Top Issues Mentioned (Individual Bar Chart) -- ten most frequent
top_issues = issue_counts.most_common(10)
fig4 = _top_mentions_bar(top_issues, "Issues", '#FD79A8',
                         'Top Issues Mentioned', 'Issue')

# Save as PNG
fig4.write_image('results/top_issues.png', width=800, height=400, scale=2)

# 5. Sentiment Comparison Chart: grouped bars, user posts next to REG replies
sentiments = ['Positive', 'Negative', 'Neutral']
user_percentages = [user_sentiment_percentages.get(label.lower(), 0) for label in sentiments]
reg_percentages = [reg_sentiment_percentages.get(label.lower(), 0) for label in sentiments]

fig5 = go.Figure()
comparison_series = (
    ('User Posts', user_percentages, '#FF6B6B'),
    ('REG Responses', reg_percentages, '#4ECDC4'),
)
for series_name, series_values, series_color in comparison_series:
    fig5.add_trace(go.Bar(
        name=series_name,
        x=sentiments,
        y=series_values,
        marker_color=series_color,
        text=[f'{pct:.1f}%' for pct in series_values],
        textposition='auto'
    ))

fig5.update_layout(
    title='Sentiment Distribution Comparison: User Posts vs REG Responses',
    title_x=0.5,
    xaxis_title='Sentiment',
    yaxis_title='Percentage (%)',
    barmode='group',
    height=500,
    font=dict(size=12)
)

# Save as PNG
fig5.write_image('results/sentiment_comparison.png', width=1000, height=500, scale=2)

# 6. Geographic Sentiment Analysis: stacked sentiment counts per location
if location_counts:
    location_sentiment = {}
    for loc in locations:
        # Whole-word match, the same rule used to build location_counts, so
        # both location tallies agree (a plain substring test would also
        # count a place name embedded in a longer word).
        loc_pattern = r'\b' + re.escape(loc) + r'\b'
        loc_mask = user_posts['combined_text'].str.contains(loc_pattern, case=False, na=False)
        loc_posts = user_posts[loc_mask]
        if len(loc_posts) > 0:
            loc_sentiment_counts = Counter(loc_posts['sentiment'])
            location_sentiment[loc] = {
                'negative': loc_sentiment_counts.get('negative', 0),
                'positive': loc_sentiment_counts.get('positive', 0),
                'neutral': loc_sentiment_counts.get('neutral', 0),
                'total': len(loc_posts)
            }

    if location_sentiment:
        # One row per location; keep the 15 locations with the most posts.
        loc_df = pd.DataFrame(location_sentiment).T
        loc_df = loc_df.sort_values('total', ascending=False).head(15)

        fig6 = go.Figure()
        stacked_series = (
            ('Negative', 'negative', '#FF6B6B'),
            ('Neutral', 'neutral', '#45B7D1'),
            ('Positive', 'positive', '#4ECDC4'),
        )
        for series_name, column, series_color in stacked_series:
            fig6.add_trace(go.Bar(
                name=series_name,
                x=loc_df.index,
                y=loc_df[column],
                marker_color=series_color
            ))

        fig6.update_layout(
            title='Sentiment Distribution by Location',
            title_x=0.5,
            xaxis_title='Location',
            yaxis_title='Number of Posts',
            barmode='stack',
            height=600,
            xaxis={'categoryorder': 'total descending'}
        )

        fig6.update_xaxes(tickangle=45)

        # Save as PNG
        fig6.write_image('results/sentiment_by_location.png', width=1200, height=600, scale=2)

# 7. Word cloud built from the text of all negative user posts
#    (skipped entirely when there are no negative posts)
negative_posts = user_posts[user_posts['sentiment'] == 'negative']
if len(negative_posts) > 0:
    negative_text = ' '.join(negative_posts['combined_text'].tolist())

    cloud_builder = WordCloud(
        width=800,
        height=400,
        background_color='white',
        colormap='Reds',
        max_words=100,
    )
    wordcloud = cloud_builder.generate(negative_text)

    # Render with matplotlib, save to disk and release the figure
    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title('Word Cloud: Negative Sentiment Posts', fontsize=16, fontweight='bold')
    plt.tight_layout()
    plt.savefig('results/negative_wordcloud.png', dpi=300, bbox_inches='tight')
    plt.close()

# 8. Response Effectiveness Analysis (only when there are REG responses)
if len(reg_posts) > 0:
    n_user_posts = len(user_posts)
    n_reg_posts = len(reg_posts)
    positive_responses = reg_sentiment_counts.get('positive', 0)
    negative_user_posts = user_sentiment_counts.get('negative', 0)

    response_effectiveness = {
        'Total User Posts': n_user_posts,
        'Total REG Responses': n_reg_posts,
        # REG responses per 100 user posts.
        'Response Rate': (n_reg_posts / n_user_posts * 100) if n_user_posts > 0 else 0,
        'Positive Responses': positive_responses,
        'Negative User Posts': negative_user_posts,
        # Positive REG responses per 100 negative user posts. With no
        # negative posts there is nothing to resolve, so report 0 instead of
        # dividing by a made-up denominator of 1.
        'Resolution Rate': (positive_responses / negative_user_posts * 100)
        if negative_user_posts > 0 else 0
    }

    metrics = ['Response Rate (%)', 'Resolution Rate (%)']
    values = [response_effectiveness['Response Rate'], response_effectiveness['Resolution Rate']]

    fig7 = go.Figure(go.Bar(
        x=metrics,
        y=values,
        text=[f'{v:.1f}%' for v in values],
        textposition='auto',
        marker_color=['#4ECDC4', '#96CEB4']
    ))

    fig7.update_layout(
        title='REG Rwanda Response Effectiveness Metrics',
        title_x=0.5,
        yaxis_title='Percentage (%)',
        height=400,
        font=dict(size=12)
    )

    # Save as PNG
    fig7.write_image('results/response_effectiveness.png', width=800, height=400, scale=2)

# 9. Sentiment Timeline (if timestamp data available)
# This would require timestamp data - placeholder for future enhancement

# Summary of the chart files. Charts 6-8 are only produced when their input
# data is non-empty, so each entry is checked on disk instead of being
# reported as saved unconditionally. (A file left over from an earlier run
# still counts as present.)
chart_files = [
    ("User Sentiment Distribution", "user_sentiment_distribution.png"),
    ("REG Response Sentiment Distribution", "reg_sentiment_distribution.png"),
    ("Top Locations Mentioned", "top_locations.png"),
    ("Top Issues Mentioned", "top_issues.png"),
    ("Sentiment Comparison Chart", "sentiment_comparison.png"),
    ("Geographic Sentiment Analysis", "sentiment_by_location.png"),
    ("Negative Sentiment Word Cloud", "negative_wordcloud.png"),
    ("Response Effectiveness Metrics", "response_effectiveness.png"),
]

print("\n=== Visualization Summary ===")
missing_charts = 0
for chart_label, chart_file in chart_files:
    if os.path.exists(os.path.join('results', chart_file)):
        print(f"✓ {chart_label} ({chart_file})")
    else:
        missing_charts += 1
        print(f"✗ {chart_label} ({chart_file}) - not generated")

if missing_charts == 0:
    print("\nAll visualizations have been saved as individual PNG files!")
else:
    print(f"\n{missing_charts} visualization(s) were not generated; see the ✗ entries above.")

2025-07-13 21:30:42,707 - INFO - Chromium init'ed with kwargs {}
2025-07-13 21:30:42,714 - INFO - Found chromium path: C:\Program Files\Google\Chrome\Application\chrome.exe
2025-07-13 21:30:42,714 - INFO - Temp directory created: C:\Users\admin\AppData\Local\Temp\tmpajeow_u1.
2025-07-13 21:30:42,717 - INFO - Opening browser.
2025-07-13 21:30:42,718 - INFO - Temp directory created: C:\Users\admin\AppData\Local\Temp\tmpz_bkpxtc.
2025-07-13 21:30:42,718 - INFO - Temporary directory at: C:\Users\admin\AppData\Local\Temp\tmpz_bkpxtc


=== Sentiment Analysis Results ===

Total User Posts: 148
User Posts Sentiment Distribution:
Negative: 107 posts (72.3%)
Neutral: 27 posts (18.2%)
Positive: 14 posts (9.5%)

Total @reg_rwanda Responses: 25
@reg_rwanda Responses Sentiment Distribution:
Neutral: 7 posts (28.0%)
Negative: 9 posts (36.0%)
Positive: 9 posts (36.0%)

Common Locations Mentioned:
- Rubavu: 16 mentions
- Huye: 7 mentions
- Kicukiro: 6 mentions
- Kimironko: 6 mentions
- Bugesera: 5 mentions
- Nyarugenge: 5 mentions
- Musanze: 5 mentions
- Rwamagana: 3 mentions
- Muyumbu: 3 mentions
- Nyarukombe: 3 mentions
- Nyamata: 2 mentions
- Kirehe: 2 mentions
- Kanombe: 2 mentions
- Rubirizi: 2 mentions
- Rubona: 2 mentions
- Gasenga: 1 mentions
- Ruhuha: 1 mentions
- Nyundo: 1 mentions
- Terimbere: 1 mentions
- Keya: 1 mentions
- Bumbogo: 1 mentions
- Musave: 1 mentions
- Rugando: 1 mentions
- Nyamabuye: 1 mentions
- Rusatira: 1 mentions
- Kiruhura: 1 mentions
- Kisimenti: 1 mentions
- Nyamirama: 1 mentions
- Kabutare: 1 mentions

2025-07-13 21:30:43,005 - INFO - Conforming 1 to file:///C:/Users/admin/AppData/Local/Temp/tmpajeow_u1/index.html
2025-07-13 21:30:43,007 - INFO - Waiting on all navigates
2025-07-13 21:30:44,629 - INFO - All navigates done, putting them all in queue.
2025-07-13 21:30:44,632 - INFO - Getting tab from queue (has 1)
2025-07-13 21:30:44,633 - INFO - Got 8083
2025-07-13 21:30:44,634 - INFO - Processing User_Posts_Sentiment_Distribution.png
2025-07-13 21:30:44,635 - INFO - Sending big command for User_Posts_Sentiment_Distribution.png.
2025-07-13 21:30:44,717 - INFO - Sent big command for User_Posts_Sentiment_Distribution.png.
2025-07-13 21:30:44,718 - INFO - Reloading tab 8083 before return.
2025-07-13 21:30:44,812 - INFO - Putting tab 8083 back (queue size: 0).
2025-07-13 21:30:44,813 - INFO - Waiting for all cleanups to finish.
2025-07-13 21:30:44,813 - INFO - Exiting Kaleido
2025-07-13 21:30:44,814 - INFO - TemporaryDirectory.cleanup() worked.
2025-07-13 21:30:44,815 - INFO - shutil.rmtree worked.


=== Visualization Summary ===
✓ User Sentiment Distribution (user_sentiment_distribution.png)
✓ REG Response Sentiment Distribution (reg_sentiment_distribution.png)
✓ Top Locations Mentioned (top_locations.png)
✓ Top Issues Mentioned (top_issues.png)
✓ Sentiment Comparison Chart (sentiment_comparison.png)
✓ Geographic Sentiment Analysis (sentiment_by_location.png)
✓ Negative Sentiment Word Cloud (negative_wordcloud.png)
✓ Response Effectiveness Metrics (response_effectiveness.png)

All visualizations have been saved as individual PNG files!


# PART 2: WEB SCRAPING

In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import random
from datetime import datetime
import logging

# Root logging setup: INFO and above, prefixed with timestamp and level.
# The module-level ``logger`` is what the scraper below writes to.
_LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger(__name__)

class QuoteScraper:
    """Download one web page, extract quotes from it and report/save them.

    Typical use: ``scrape_quotes(url)`` followed by ``save_to_csv()`` and
    ``generate_report()``. Every extracted quote is a dict with the keys
    ``quote``, ``author``, ``source``, ``selector_used``, ``quote_length``
    and ``scraped_at``; all quotes collected so far live in
    ``self.scraped_data``.
    """

    def __init__(self, base_url):
        """Create a scraper with its own HTTP session and an empty result list."""
        # Stored for reference only; no method of this class reads it.
        self.base_url = base_url
        self.session = requests.Session()
        self.scraped_data = []

    def get_headers_strategies(self):
        """Return the header sets that make_request tries, in order."""
        return [
            # Strategy 1: full Chrome-on-Windows header set
            # NOTE(review): advertising 'br' assumes a Brotli decoder is
            # available to requests/urllib3 -- confirm, otherwise a
            # Brotli-compressed body may not be decodable.
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9,fr;q=0.8',
                'Accept-Encoding': 'gzip, deflate, br',
                'Connection': 'keep-alive',
                'Upgrade-Insecure-Requests': '1',
                'Sec-Fetch-Dest': 'document',
                'Sec-Fetch-Mode': 'navigate',
                'Sec-Fetch-Site': 'none'
            },

            # Strategy 2: Chrome user agent with a Google referer
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
                'Referer': 'https://www.google.com/',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'en-US,en;q=0.9'
            },

            # Strategy 3: minimal headers (user agent only)
            {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
            }
        ]

    def make_request(self, url, max_retries=3):
        """Fetch *url*, cycling through every header strategy on each attempt.

        Sleeps 1-3 seconds before every request and 5-10 seconds between
        attempts.

        Args:
            url: Page to download.
            max_retries: Number of full passes over the header strategies.

        Returns:
            The first response with HTTP status 200, or ``None`` when every
            strategy failed on every attempt.
        """
        strategies = self.get_headers_strategies()

        for attempt in range(max_retries):
            for i, headers in enumerate(strategies):
                try:
                    logger.info(f"Attempt {attempt + 1}, Strategy {i + 1}: {headers.get('User-Agent', 'Unknown')[:50]}...")

                    # Random delay to avoid rate limiting
                    time.sleep(random.uniform(1, 3))

                    response = self.session.get(
                        url,
                        headers=headers,
                        timeout=15,
                        allow_redirects=True
                    )
                except requests.RequestException as e:
                    logger.error(f"✗ Strategy {i + 1} failed with error: {e}")
                    continue

                if response.status_code == 200:
                    logger.info(f"✓ Success with Strategy {i + 1}")
                    return response
                logger.warning(f"✗ Strategy {i + 1} failed with status: {response.status_code}")

            # Wait before the next attempt (not after the last one)
            if attempt < max_retries - 1:
                wait_time = random.uniform(5, 10)
                logger.info(f"Waiting {wait_time:.1f} seconds before retry...")
                time.sleep(wait_time)

        return None

    def extract_quotes(self, soup, url):
        """Extract quotes from a parsed page.

        CSS selectors are tried in order; the first selector that yields at
        least one acceptable quote (10-1000 characters) wins and later
        selectors are ignored. When no selector yields anything, falls back
        to ``extract_from_paragraphs``.

        Args:
            soup: Parsed document (BeautifulSoup).
            url: Source URL recorded with every quote.

        Returns:
            A list of quote dicts (possibly empty).
        """
        quotes_data = []

        # Multiple selector strategies for different quote website structures
        quote_selectors = [
            # Common quote selectors
            'div.quote',
            'div.citation',
            'blockquote',
            'div[class*="quote"]',
            'div[class*="citation"]',
            'p.quote',
            '.quote-text',
            '.citation-text',
            'div.quotation',
            'span.quote',
            # Specific to French quote sites
            'div.citation-contenu',
            'div.citation-texte',
            'p.citation',
            'div[class*="texte"]'
        ]

        author_selectors = [
            '.author',
            '.citation-auteur',
            'div.auteur',
            'span.author',
            'div[class*="author"]',
            'div[class*="auteur"]',
            'cite',
            'footer'
        ]

        for selector in quote_selectors:
            elements = soup.select(selector)
            if not elements:
                continue
            logger.info(f"Found {len(elements)} elements with selector: {selector}")

            for element in elements:
                quote_text = element.get_text(strip=True)

                # Filter out very short or very long texts
                if not quote_text or not (10 <= len(quote_text) <= 1000):
                    continue

                author = self.extract_author(element, author_selectors)
                if not author:
                    author = "Thomas Gatabazi"  # Default as per original requirement

                quotes_data.append({
                    'quote': quote_text,
                    'author': author,
                    'source': url,
                    'selector_used': selector,
                    'quote_length': len(quote_text),
                    'scraped_at': datetime.now().isoformat()
                })

            if quotes_data:
                break  # Use first successful selector

        # Fallback: Extract from paragraphs if no quotes found
        if not quotes_data:
            logger.info("No quotes found with standard selectors. Trying paragraph extraction...")
            quotes_data = self.extract_from_paragraphs(soup, url)

        return quotes_data

    def extract_author(self, element, author_selectors):
        """Find the author text that belongs to a quote element.

        For each selector, in order, the search goes from the most specific
        place to the least specific one: inside the quote element, then the
        next sibling (its descendants or the sibling itself), then anywhere
        under the parent. The parent is searched last because it is shared
        by every quote in the same container; searching it first would give
        all of them the first author found there.

        Args:
            element: The quote element.
            author_selectors: CSS selectors to try, in priority order.

        Returns:
            The author text, or ``None`` when nothing matched.
        """
        parent = element.parent
        next_sibling = element.find_next_sibling()

        for selector in author_selectors:
            # 1. Inside the quote element itself
            author_elem = element.select_one(selector)

            # 2. The next sibling: its descendants, or the sibling itself
            if author_elem is None and next_sibling is not None:
                author_elem = next_sibling.select_one(selector)
                if (author_elem is None and parent is not None
                        and any(match is next_sibling for match in parent.select(selector))):
                    author_elem = next_sibling

            # 3. Anywhere under the shared parent
            if author_elem is None and parent is not None:
                author_elem = parent.select_one(selector)

            if author_elem is not None:
                return author_elem.get_text(strip=True)

        return None

    def extract_from_paragraphs(self, soup, url):
        """Fallback: treat plausible ``<p>`` elements as quotes.

        A paragraph qualifies when it is 20-500 characters long, does not
        start with a legal-notice prefix and does not mention cookies,
        privacy or "politique". The author is always the default one.

        Returns:
            A list of quote dicts (possibly empty).
        """
        quotes_data = []
        legal_prefixes = ('copyright', 'tous droits', 'mentions légales')
        boilerplate_words = ['cookie', 'privacy', 'politique']

        for p in soup.find_all('p'):
            text = p.get_text(strip=True)
            if not text or not (20 <= len(text) <= 500):
                continue
            lowered = text.lower()
            if lowered.startswith(legal_prefixes):
                continue
            if any(word in lowered for word in boilerplate_words):
                continue

            quotes_data.append({
                'quote': text,
                'author': 'Thomas Gatabazi',
                'source': url,
                'selector_used': 'paragraph_fallback',
                'quote_length': len(text),
                'scraped_at': datetime.now().isoformat()
            })

        return quotes_data

    def scrape_quotes(self, url):
        """Download *url*, extract its quotes and add them to ``scraped_data``.

        Returns:
            The quotes extracted from this page; an empty list when the
            download or the parsing failed.
        """
        logger.info(f"Starting to scrape: {url}")

        response = self.make_request(url)

        if not response:
            logger.error("All request strategies failed")
            return []

        try:
            soup = BeautifulSoup(response.text, 'html.parser')

            # Debug information. ``title.string`` is None when the <title>
            # tag is empty or has nested markup, so fall back in that case too.
            title = (soup.title.string if soup.title else None) or 'No title'
            logger.info(f"Page title: {title}")
            logger.info(f"Page length: {len(response.text)} characters")

            quotes_data = self.extract_quotes(soup, url)

            if quotes_data:
                logger.info(f"Successfully extracted {len(quotes_data)} quotes")
                self.scraped_data.extend(quotes_data)
            else:
                logger.warning("No quotes found on this page")

            return quotes_data

        except Exception as e:
            # Boundary handler: a parsing problem must not abort the caller.
            logger.error(f"Error parsing response: {e}")
            return []

    def save_to_csv(self, filename='results/quotes.csv'):
        """Write all scraped quotes to *filename* as UTF-8 CSV.

        The target directory is created when it does not exist.

        Returns:
            ``True`` when the file was written, ``False`` when there was
            nothing to save or writing failed (the error is logged).
        """
        if not self.scraped_data:
            logger.warning("No data to save")
            return False

        try:
            target_dir = os.path.dirname(filename)
            if target_dir:
                os.makedirs(target_dir, exist_ok=True)
            df = pd.DataFrame(self.scraped_data)
            df.to_csv(filename, index=False, encoding='utf-8')
            logger.info(f"Data saved to {filename}")
            return True
        except Exception as e:
            logger.error(f"Error saving to CSV: {e}")
            return False

    def generate_report(self):
        """Print summary statistics and up to three sample quotes to stdout.

        Does nothing except log a warning when no data has been scraped.
        """
        if not self.scraped_data:
            logger.warning("No data to generate report")
            return

        df = pd.DataFrame(self.scraped_data)

        print("\n" + "="*60)
        print("WEB SCRAPING REPORT")
        print("="*60)

        print(f"Total quotes scraped: {len(df)}")
        print(f"Unique authors: {df['author'].nunique()}")
        print(f"Average quote length: {df['quote_length'].mean():.1f} characters")
        print(f"Median quote length: {df['quote_length'].median():.1f} characters")
        print(f"Shortest quote: {df['quote_length'].min()} characters")
        print(f"Longest quote: {df['quote_length'].max()} characters")

        print("\nSelectors used:")
        selector_counts = df['selector_used'].value_counts()
        for selector, count in selector_counts.items():
            print(f"  - {selector}: {count} quotes")

        print("\nSample quotes:")
        for position, (_, row) in enumerate(df.head(3).iterrows(), start=1):
            quote = row['quote']
            # Only mark the quote as shortened when it actually was.
            snippet = quote if len(quote) <= 100 else quote[:100] + '...'
            print(f"  {position}. \"{snippet}\" - {row['author']}")

        print("\nFiles generated:")
        print("  - quotes.csv: CSV format")

def main():
    """Scrape the configured quotes page, save the result and print a report.

    Prints troubleshooting hints when no quotes could be extracted.
    """
    print("="*60)
    print("PART 2: WEB SCRAPING FROM REAL WEBSITE")
    print("="*60)

    # Initialize scraper
    url = 'https://citations.ouest-france.fr/citations-thomas-gatabazi-18498.html'
    scraper = QuoteScraper(url)

    # Scrape quotes
    quotes = scraper.scrape_quotes(url)

    if quotes:
        # Save as CSV; the result is checked so a failed write is not
        # reported as a success below.
        saved = scraper.save_to_csv('results/quotes.csv')

        # Generate comprehensive report
        scraper.generate_report()

        print(f"\n✓ Web scraping completed successfully!")
        if saved:
            print(f"✓ {len(quotes)} quotes extracted and saved")
        else:
            print(f"✗ {len(quotes)} quotes extracted but could not be saved (see log)")

    else:
        print("\n✗ Web scraping failed")
        print("\nTroubleshooting suggestions:")
        print("1. Check if the URL is accessible in your browser")
        print("2. Verify the website structure hasn't changed")
        print("3. Try using a VPN if there are geographical restrictions")
        print("4. Check the website's robots.txt file")
        print("5. Consider using Selenium for JavaScript-heavy sites")


if __name__ == "__main__":
    main()

2025-07-13 21:31:03,737 - INFO - TemporaryDirectory.cleanup() worked.
2025-07-13 21:31:03,738 - INFO - shutil.rmtree worked.
2025-07-13 21:31:03,755 - INFO - Starting to scrape: https://citations.ouest-france.fr/citations-thomas-gatabazi-18498.html
2025-07-13 21:31:03,756 - INFO - Attempt 1, Strategy 1: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb...


PART 2: WEB SCRAPING FROM REAL WEBSITE


2025-07-13 21:31:07,456 - INFO - Attempt 1, Strategy 2: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb...
2025-07-13 21:31:10,515 - INFO - Attempt 1, Strategy 3: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWeb...
2025-07-13 21:31:13,207 - INFO - ✓ Success with Strategy 3
2025-07-13 21:31:13,253 - INFO - Page title: Citations célèbres de Thomas Gatabazi, 260+ pensées inspirantes
2025-07-13 21:31:13,255 - INFO - Page length: 118048 characters
2025-07-13 21:31:13,266 - INFO - Found 40 elements with selector: blockquote
2025-07-13 21:31:13,329 - INFO - Successfully extracted 40 quotes
2025-07-13 21:31:13,329 - INFO - Data saved to results/quotes.csv



WEB SCRAPING REPORT
Total quotes scraped: 40
Unique authors: 1
Average quote length: 90.8 characters
Median quote length: 80.5 characters
Shortest quote: 40 characters
Longest quote: 174 characters

Selectors used:
  - blockquote: 40 quotes

Sample quotes:
  1. "Les leçons de la vie se donnent tous les jours mais les moments difficiles permettent de mieux les c..." - Thomas Gatabazi
  2. "Les difficultés de la vie, après les avoir surmontées, deviennent les lampes qui éclairent notre che..." - Thomas Gatabazi
  3. "Les pires moments de la vie enseignent les meilleures leçons de la vie...." - Thomas Gatabazi

Files generated:
  - quotes.csv: CSV format

✓ Web scraping completed successfully!
✓ 40 quotes extracted and saved
