In [None]:
# !pip install requests
# !pip install beautifulsoup4

In [None]:
import requests
from bs4 import BeautifulSoup
import json


## Le Parisien

In [None]:
def get_page_contents(url, timeout=10):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on an unresponsive host.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, malformed URL).
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    try:
        page = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable or malformed URL must not abort a whole scraping run;
        # report it and let the caller treat it like any other failed download.
        print(f"Request failed for {url}: {exc}")
        return None

    if page.status_code == 200:
        return page.text

    return None

def get_quotes_and_authors(page_contents):
    """Extract the quote sections and author spans from a Le Parisien page.

    Args:
        page_contents: HTML source of the page.

    Returns:
        A ``(quotes, authors)`` pair. ``quotes`` holds every
        ``<section class="content">`` tag and ``authors`` every
        ``<span class="author">`` tag. Both are empty lists when the page has
        no ``application/ld+json`` script tag.
    """
    soup = BeautifulSoup(page_contents, 'html.parser')

    # Only pages that carry a JSON-LD <script> are processed. Its payload was
    # parsed but never used, and parsing raised on empty or invalid JSON, so the
    # tag is now only checked for presence.
    if not soup.find('script', type='application/ld+json'):
        return [], []

    authors = soup.find_all('span', class_='author')
    quotes = soup.find_all('section', class_='content')

    return quotes, authors

In [None]:
import pandas as pd
from google.colab import drive

# Mount Google Drive so the thesis data folder is reachable under /content/drive.
drive.mount('/content/drive')

# CSV listing the Le Parisien article URLs to scrape.
file_path = '/content/drive/MyDrive/Thesis/data/Le Parisien.csv'

In [None]:
import csv

def _node_text(node):
    """Return the visible text of a parsed HTML node, or ``str(node)`` for plain values."""
    if hasattr(node, 'get_text'):
        # Separate text fragments with a space so words from adjacent tags do not fuse.
        return node.get_text(' ', strip=True)
    return str(node)


def process_urls_from_csv(file_path, output_file):
    """Scrape every URL listed in a CSV file and write the quotes found to another CSV.

    The first row of the input is skipped as a header. Every non-empty cell of
    the remaining rows is treated as a URL. For each page that downloads
    successfully, one ``URL, Author, Quote`` row is written per quote.

    Args:
        file_path: Path of the input CSV containing the URLs.
        output_file: Path of the CSV to create (overwritten if it exists).
    """
    with open(file_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        next(reader, None)  # skip the header row, if any

        with open(output_file, mode='w', newline='', encoding='utf-8') as output_csv:
            csv_writer = csv.writer(output_csv)

            csv_writer.writerow(['URL', 'Author', 'Quote'])

            for row in reader:
                for url in row:
                    if not url:
                        continue

                    page_contents = get_page_contents(url)
                    if not page_contents:
                        print(f"Failed to get contents for {url}")
                        continue

                    quotes, authors = get_quotes_and_authors(page_contents)

                    # The extractor returns a list of author tags (possibly empty),
                    # so reading ``.text`` on it directly raised AttributeError.
                    # Join the individual names instead.
                    if isinstance(authors, (list, tuple)):
                        author_text = ', '.join(_node_text(a) for a in authors)
                    else:
                        author_text = _node_text(authors)

                    print(f"Processing URL: {url}")
                    print(f"Author: {author_text}")
                    print(f"Found {len(quotes)} quotes")

                    for quote in quotes:
                        # Store the readable text, not the tag's HTML markup, so the
                        # "Quote" column can be word-counted and tokenized later.
                        quote_text = _node_text(quote)
                        csv_writer.writerow([url, author_text, quote_text])
                        print(f"Wrote quote to CSV: {quote_text}")


In [None]:
# Write the scrape straight into the Drive data folder: the word-count and
# placeholder cells further down read quotes_output.csv from that location.
process_urls_from_csv(file_path, '/content/drive/MyDrive/Thesis/data/quotes_output.csv')


## Journal du Cameroun

In [None]:
def get_page_contents(url, timeout=10):
    """Download a page and return its HTML text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block forever on an unresponsive host.

    Returns:
        The response body as text when the server answers with HTTP 200,
        otherwise ``None`` (non-200 status, network error, malformed URL).
    """
    # Browser-like User-Agent sent with every request.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36'
    }

    try:
        page = requests.get(url, headers=headers, timeout=timeout)
    except requests.RequestException as exc:
        # A single unreachable or malformed URL must not abort a whole scraping run;
        # report it and let the caller treat it like any other failed download.
        print(f"Request failed for {url}: {exc}")
        return None

    if page.status_code == 200:
        return page.text

    return None

def get_quotes_and_authors(page_contents):
    """Extract the article paragraphs and author name from a Journal du Cameroun page.

    Args:
        page_contents: HTML source of the page.

    Returns:
        A ``(quotes, authors)`` pair. ``quotes`` is a list of the text of every
        non-blank ``<p>`` and ``<div>`` inside ``<div class="article-content">``.
        ``authors`` is the text of the link inside
        ``<div class="post-meta-author">``, or ``"Author not found"`` when that
        block or its link is missing. Returns ``([], [])`` when the page has no
        ``application/ld+json`` script tag.
    """
    soup = BeautifulSoup(page_contents, 'html.parser')

    # Only pages that carry a JSON-LD <script> are processed. Its payload was
    # parsed but never used, and parsing raised on empty or invalid JSON, so the
    # tag is now only checked for presence.
    if not soup.find('script', type='application/ld+json'):
        return [], []

    authors = "Author not found"
    author_tag = soup.find('div', class_='post-meta-author')
    if author_tag:
        # The author block may exist without a link; calling get_text() on the
        # missing link used to raise AttributeError.
        author_link = author_tag.find('a')
        if author_link:
            authors = author_link.get_text(strip=True)

    quotes = []
    article_content = soup.find('div', class_='article-content')
    if article_content:
        quotes += [p.text for p in article_content.find_all('p') if p.get_text(strip=True)]
        # NOTE(review): a <div> that wraps <p> tags repeats their text here, so the
        # same passage can be emitted twice. Confirm whether that is intended.
        quotes += [div.text for div in article_content.find_all('div') if div.get_text(strip=True)]

    return quotes, authors

In [None]:
import csv

def process_urls_from_csv(file_path, output_file):
    """Scrape each URL found in ``file_path`` and store its quotes in ``output_file``.

    The first input row is discarded as a header. Every non-empty cell of the
    remaining rows is fetched as a URL, and one ``URL, Author, Quote`` row is
    written per extracted quote. Progress is reported with ``print``.
    """
    with open(file_path, newline='', encoding='utf-8') as source_csv:
        url_rows = csv.reader(source_csv)
        next(url_rows, None)  # drop the header row when there is one

        with open(output_file, mode='w', newline='', encoding='utf-8') as sink:
            writer = csv.writer(sink)
            writer.writerow(['URL', 'Author', 'Quote'])

            # Walk the non-empty cells row by row, left to right.
            for url in (cell for row in url_rows for cell in row if cell):
                html = get_page_contents(url)
                if not html:
                    print(f"Failed to get contents for {url}")
                    continue

                quotes, author = get_quotes_and_authors(html)
                print(f"Processing URL: {url}")
                print(f"Author: {author}")
                print(f"Found {len(quotes)} quotes")

                for quote in quotes:
                    writer.writerow([url, author, quote])
                    print(f"Wrote quote to CSV: {quote}")

In [None]:
file_path = '/content/drive/MyDrive/Thesis/data/Journal du Cameroun.csv'
# Write the scrape straight into the Drive data folder: the word-count and
# placeholder cells further down read cameroun_quotes_output.csv from that location.
process_urls_from_csv(file_path, '/content/drive/MyDrive/Thesis/data/cameroun_quotes_output.csv')


### Words per csv

In [None]:
data = pd.read_csv("/content/drive/MyDrive/Thesis/data/cameroun_quotes_output.csv")

# Blank quotes are read back as NaN (a float), which has no .split(); count them
# as zero words instead of raising AttributeError.
data["Number of Words"] = data["Quote"].apply(lambda n: len(str(n).split()) if pd.notnull(n) else 0)

total_words = data["Number of Words"].sum()
print("Total number of words in all quotes:", total_words)

In [None]:
data = pd.read_csv("/content/drive/MyDrive/Thesis/data/quotes_output.csv")

# Whitespace-separated word count per quote; missing quotes count as zero words.
quote_text = data["Quote"].fillna("").astype(str)
data["Number of Words"] = quote_text.str.split().str.len()

total_words = data["Number of Words"].sum()
print("Total number of words in all quotes:", total_words)


# Replace Country Names with Placeholder

In [None]:
import re

def replace_country_nationality(text):
    """Mask mentions of Cameroon/France and their nationality adjectives.

    "Cameroun" and "France" become ``[COUNTRY]``. "camerounais" and "français",
    including the feminine and feminine-plural forms (-e, -es), become
    ``[NATIONALITY]``. The first letter may be upper or lower case.

    Args:
        text: The quote to mask.

    Returns:
        The masked text, or an empty string when ``text`` is not a string
        (for example a NaN read from a CSV).
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r"\b(?:[Cc]ameroun|[Ff]rance)\b", "[COUNTRY]", text)
    # The masculine plural equals the singular, but the feminine plural adds "es";
    # the original optional "e" alone left "françaises" and "camerounaises" unmasked.
    text = re.sub(r"\b(?:[Cc]amerounais|[Ff]rançais)(?:es?)?\b", "[NATIONALITY]", text)
    return text

# Mask country and nationality mentions in both scraped corpora and save the
# results next to the originals with an UPDATED_ prefix.
for source_path, output_path in [
    ("/content/drive/MyDrive/Thesis/data/quotes_output.csv",
     '/content/drive/MyDrive/Thesis/data/UPDATED_quotes_output.csv'),
    ("/content/drive/MyDrive/Thesis/data/cameroun_quotes_output.csv",
     '/content/drive/MyDrive/Thesis/data/UPDATED_cameroun_quotes_output.csv'),
]:
    data = pd.read_csv(source_path)
    data["Quote"] = data["Quote"].apply(replace_country_nationality)
    data.to_csv(output_path, index=False)

    # Nothing is tokenized at this stage, so the message now names the actual step.
    print(f"Placeholder-masked quotes saved to {output_path}")

# Tokenize data

In [None]:
# nltk must be imported in this cell: it is the first cell to use it, so on a
# fresh kernel the download call would otherwise raise NameError.
import nltk

nltk.download('all', download_dir='/usr/local/share/nltk_data')

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')

file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_cameroun_quotes_output.csv'
df = pd.read_csv(file_path)

# Blank quotes come back from the CSV as NaN; without fillna, str(NaN) would be
# tokenized into the bogus word "nan". The quotes are French, so the French
# Punkt model is requested instead of the English default.
df['tokenized_quote'] = df['Quote'].fillna('').apply(
    lambda x: word_tokenize(str(x), language='french')
)

output_path = '/content/drive/MyDrive/Thesis/data/UPDATED_cameroun_quotes_tokenized.csv'

df.to_csv(output_path, index=False)

print(f"Tokenized quotes saved to {output_path}")

## Le Parisien

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import nltk

# Download required NLTK data if not already available
nltk.download('punkt')

file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_quotes_output.csv'
df = pd.read_csv(file_path)

# Tokenize the quotes in the 'Quote' column.
# Blank quotes come back from the CSV as NaN; without fillna, str(NaN) would be
# tokenized into the bogus word "nan". The quotes are French, so the French
# Punkt model is requested instead of the English default.
df['tokenized_quote'] = df['Quote'].fillna('').apply(
    lambda x: word_tokenize(str(x), language='french')
)

# Save the updated DataFrame to a new CSV file
output_path = '/content/drive/MyDrive/Thesis/data/UPDATED_quotes_tokenized.csv'

df.to_csv(output_path, index=False)

print(f"Tokenized quotes saved to {output_path}")


# Statistics

## Standard French Statistics

In [None]:
import ast
from collections import Counter

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
stop_words = set(stopwords.words('french'))

file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_quotes_tokenized.csv'
df = pd.read_csv(file_path)

# The token lists were saved as their Python repr. literal_eval parses them
# without executing arbitrary code (unlike eval), and they are parsed only once.
token_lists = df['tokenized_quote'].apply(ast.literal_eval)

# Word count for each quote (every token counts, punctuation included)
df['word_count'] = token_lists.apply(len)
total_word_count = df['word_count'].sum()

# Flatten all tokens and filter out stopwords and non-alphabetic tokens
all_words = [word.lower() for tokens in token_lists for word in tokens if word.isalpha()]
filtered_words = [word for word in all_words if word not in stop_words]

top_words = Counter(filtered_words).most_common(10)
average_word_count = df['word_count'].mean()
unique_words = len(set(filtered_words))

print("Total Word Count:", total_word_count)
print("Average Word Count per Quote:", average_word_count)
print("Top 10 Most Common Words:", top_words)
print("Unique Word Count:", unique_words)


## Cameroonian French Statistics

In [None]:
import ast
from collections import Counter

import nltk
from nltk.corpus import stopwords

# Download stopwords if not already available
nltk.download('stopwords')
stop_words = set(stopwords.words('french'))

# Load the tokenized data
file_path = '/content/drive/MyDrive/Thesis/data/UPDATED_cameroun_quotes_tokenized.csv'
df = pd.read_csv(file_path)

# The token lists were saved as their Python repr. literal_eval parses them
# without executing arbitrary code (unlike eval), and they are parsed only once.
token_lists = df['tokenized_quote'].apply(ast.literal_eval)

# Word count for each quote (every token counts, punctuation included)
df['word_count'] = token_lists.apply(len)

# Total word count
total_word_count = df['word_count'].sum()

# Flatten all tokens and filter out stopwords and non-alphabetic tokens
all_words = [word.lower() for tokens in token_lists for word in tokens if word.isalpha()]
filtered_words = [word for word in all_words if word not in stop_words]

# Top 10 most common words
top_words = Counter(filtered_words).most_common(10)

# Average word count per quote
average_word_count = df['word_count'].mean()

# Number of distinct non-stopword words
unique_words = len(set(filtered_words))

print("Total Word Count:", total_word_count)
print("Average Word Count per Quote:", average_word_count)
print("Top 10 Most Common Words:", top_words)
print("Unique Word Count:", unique_words)
