In [1]:
import csv
import os
import pandas as pd
import sys
import string
import nltk
import json
import re
from datetime import datetime
from nltk.tokenize import word_tokenize
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.util import ngrams
from nltk.corpus import stopwords
from pyarabic import araby
from pyarabic.araby import strip_tashkeel
from collections import Counter, defaultdict
import xlsxwriter
import arabic_reshaper
from bidi.algorithm import get_display
import openpyxl
from openpyxl.utils.dataframe import dataframe_to_rows

ModuleNotFoundError: No module named 'pandas'

Initial code that outputs to a text file

In [2]:
def tokenize_and_search_arabic_text(directory_path, output_dir, country_name, newspaper_name, search_terms, max_articles_per_file=500):
    file_count = 1
    article_count = 0
    output_file = None
    output_file_path = lambda count: os.path.join(output_dir, f"{country_name}_{newspaper_name}_search_results_part{count}.txt")
    search_terms_set = set(search_terms)  # Prepare search terms
    
    def is_relevant_file(filename):
        return country_name in filename and newspaper_name in filename and filename.endswith('.csv')

    for file_name in os.listdir(directory_path):
        if is_relevant_file(file_name):
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, mode='r', encoding='utf-8') as file:
                csv_reader = csv.DictReader(file, delimiter='\t')
                print("Processing file:", file_name)

                for row in csv_reader:
                    if output_file is None or article_count >= max_articles_per_file:
                        if output_file is not None:
                            output_file.close()
                        output_file = open(output_file_path(file_count), 'w', encoding='utf-8')
                        file_count += 1
                        article_count = 0

                    text = row.get('Text', '')
                    normalized_text = araby.strip_tashkeel(text)  # Normalize Arabic text
                    tokens = word_tokenize(normalized_text)
                    
                    found_terms = search_terms_set.intersection(tokens)
                    
                    if found_terms:
                        title = row.get('Title', 'No Title Found')
                        output_file.write(f"Title: {title}\nFound Terms: {', '.join(found_terms)}\nText: {text}\n\n")
                        article_count += 1

    if output_file is not None:
        output_file.close()

# Example usage
directory_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Sudan"  # Directory containing CSV files
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Sudan"  # Output directory
country_name = 'Sudan'
newspaper_name = 'almeghar'
search_terms = ['كوفيد', 'وباء', 'فيروس']  # Arabic terms to search for
tokenize_and_search_arabic_text(directory_path, output_dir, country_name, newspaper_name, search_terms)


Processing file: Sudan_almeghar_19.csv
Processing file: Sudan_almeghar_20.csv
Processing file: Sudan_almeghar_21.csv


Include proportion

In [4]:
def tokenize_search_and_context(directory_path, output_dir, country_name, newspaper_name, search_terms, max_articles_per_file=500, window=10):
    file_count = 1
    article_count = 0
    total_article_count = 0
    covid_article_count = 0
    output_file = None
    output_file_path = lambda count: os.path.join(output_dir, f"{country_name}_{newspaper_name}_search_results_part{count}.txt")
    search_terms_set = set(search_terms)  # Prepare search terms
    
    def is_relevant_file(filename):
        return country_name in filename and newspaper_name in filename and filename.endswith('.csv')

    for file_name in os.listdir(directory_path):
        if is_relevant_file(file_name):
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, mode='r', encoding='utf-8') as file:
                csv_reader = csv.DictReader(file, delimiter='\t')
                print("Processing file:", file_name)

                for row in csv_reader:
                    total_article_count += 1
                    if output_file is None or article_count >= max_articles_per_file:
                        if output_file is not None:
                            output_file.close()
                        output_file = open(output_file_path(file_count), 'w', encoding='utf-8')
                        file_count += 1
                        article_count = 0

                    text = row.get('Text', '')
                    normalized_text = text.lower()  # Normalize Arabic text by lowercasing
                    tokens = word_tokenize(normalized_text)
                    
                    found_terms = search_terms_set.intersection(tokens)
                    if found_terms:
                        covid_article_count += 1
                        title = row.get('Title', 'No Title Found')
                        # Find context around each found term
                        for term in found_terms:
                            indices = [i for i, token in enumerate(tokens) if token == term]
                            for index in indices:
                                start = max(0, index - window)
                                end = min(len(tokens), index + window + 1)
                                context = ' '.join(tokens[start:end])
                                output_file.write(f"Title: {title}\nTerm: {term}\nContext: {context}\n\n")
                        article_count += 1

    if output_file is not None:
        output_file.close()

    # Calculate and print the proportion of COVID-19 related articles
    if total_article_count > 0:
        proportion_covid = covid_article_count / total_article_count
        print(f"Total articles: {total_article_count}")
        print(f"COVID-19 related articles: {covid_article_count}")
        print(f"Proportion of COVID-19 related articles: {proportion_covid:.2f}")

# Example usage
directory_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Sudan"  # Directory containing CSV files
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Sudan"  # Output directory
country_name = 'Sudan'
newspaper_name = 'almeghar'
search_terms = ['كوفيد', 'وباء', 'فيروس']  # Arabic terms to search for
tokenize_search_and_context(directory_path, output_dir, country_name, newspaper_name, search_terms)


Processing file: Sudan_almeghar_19.csv
Processing file: Sudan_almeghar_20.csv
Processing file: Sudan_almeghar_21.csv
Total articles: 4053
COVID-19 related articles: 5
Proportion of COVID-19 related articles: 0.00


Write to excel, country newspaper

In [None]:
def tokenize_search_and_context_to_excel(directory_path, output_dir, country_name, newspaper_name, search_terms, max_articles_per_file=500, window=10):
    results = []
    search_terms_set = set(search_terms)  # Prepare search terms
    
    def is_relevant_file(filename):
        return country_name in filename and newspaper_name in filename and filename.endswith('.csv')

    for file_name in os.listdir(directory_path):
        if is_relevant_file(file_name):
            file_path = os.path.join(directory_path, file_name)
            with open(file_path, mode='r', encoding='utf-8') as file:
                csv_reader = csv.DictReader(file, delimiter='\t')
                print("Processing file:", file_name)

                for row in csv_reader:
                    text = row.get('Text', '')
                    normalized_text = text.lower()  # Normalize Arabic text by lowercasing
                    tokens = word_tokenize(normalized_text)
                    
                    found_terms = search_terms_set.intersection(tokens)
                    if found_terms:
                        title = row.get('Title', 'No Title Found')
                        # Find context around each found term
                        for term in found_terms:
                            indices = [i for i, token in enumerate(tokens) if token == term]
                            for index in indices:
                                start = max(0, index - window)
                                end = min(len(tokens), index + window + 1)
                                context = ' '.join(tokens[start:end])
                                results.append({
                                    "Title": title,
                                    "Term": term,
                                    "Context": context,
                                    "Filename": file_name
                                })

    # Create a DataFrame and write to Excel
    df = pd.DataFrame(results)
    output_file_path = os.path.join(output_dir, f"{country_name}_{newspaper_name}_search_results.xlsx")
    df.to_excel(output_file_path, index=False)
    print(f"Results written to {output_file_path}")

# Example usage
directory_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Sudan"
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"
country_name = 'Sudan'
newspaper_name = 'almeghar'
search_terms = ['كوفيد', 'وباء', 'فيروس']
tokenize_search_and_context_to_excel(directory_path, output_dir, country_name, newspaper_name, search_terms)

Write to excel, whole country

In [6]:
def tokenize_search_and_context_to_excel(country_path, output_dir, search_terms, max_articles_per_file=500, window=10):
    results = []
    search_terms_set = set(search_terms)  # Prepare search terms

    for dirpath, dirnames, filenames in os.walk(country_path):
        for file_name in filenames:
            if file_name.endswith('.csv'):
                file_path = os.path.join(dirpath, file_name)
                with open(file_path, mode='r', encoding='utf-8') as file:
                    csv_reader = csv.DictReader(file, delimiter='\t')
                    print("Processing file:", file_name)

                    for row in csv_reader:
                        text = row.get('Text', '')
                        normalized_text = text.lower()  # Normalize Arabic text by lowercasing
                        tokens = word_tokenize(normalized_text)
                        
                        found_terms = search_terms_set.intersection(tokens)
                        if found_terms:
                            title = row.get('Title', 'No Title Found')
                            # Find context around each found term
                            for term in found_terms:
                                indices = [i for i, token in enumerate(tokens) if token == term]
                                for index in indices:
                                    start = max(0, index - window)
                                    end = min(len(tokens), index + window + 1)
                                    context = ' '.join(tokens[start:end])
                                    results.append({
                                        "Title": title,
                                        "Term": term,
                                        "Context": context,
                                        "Filename": file_name,
                                        "Newspaper": os.path.basename(dirpath)  # Newspaper name from directory
                                    })

    # Create a DataFrame and write to Excel
    df = pd.DataFrame(results)
    output_file_path = os.path.join(output_dir, "search_results.xlsx")
    df.to_excel(output_file_path, index=False)
    print(f"Results written to {output_file_path}")

# Example usage
country_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Sudan"  # Country directory
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"  # Output directory
search_terms = ['كوفيد', 'وباء', 'فيروس']  # Arabic terms to search for
tokenize_search_and_context_to_excel(country_path, output_dir, search_terms)


Processing file: Sudan_almashhadalsudani_19.csv
Processing file: Sudan_almashhadalsudani_20.csv
Processing file: Sudan_almashhadalsudani_21.csv
Processing file: Sudan_almeghar_19.csv
Processing file: Sudan_almeghar_20.csv
Processing file: Sudan_almeghar_21.csv
Processing file: Sudan_alnilin_19.csv
Processing file: Sudan_alnilin_20.csv
Processing file: Sudan_alnilin_21.csv
Processing file: Sudan_alrakoba_19.csv
Processing file: Sudan_alrakoba_20.csv
Processing file: Sudan_alrakoba_21.csv
Processing file: Sudan_alsadda_19.csv
Processing file: Sudan_alsadda_20.csv
Processing file: Sudan_alsudani_19.csv
Processing file: Sudan_alsudani_21.csv
Processing file: Sudan_assayha_21.csv
Processing file: Sudan_sudantribune_19.csv
Processing file: Sudan_sudantribune_20.csv
Processing file: Sudan_sudantribune_21.csv
Processing file: Sudan_suna_19.csv
Processing file: Sudan_suna_20.csv
Processing file: Sudan_suna_21.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\Ara

Include all headers

In [None]:
def tokenize_search_and_context_to_excel(country_path, output_dir, search_terms, max_articles_per_file=500, window=10):
    results = []
    search_terms_set = set(search_terms)  # Prepare search terms

    def is_relevant_file(filename):
        return filename.endswith('.csv')

    for dirpath, dirnames, filenames in os.walk(country_path):
        for file_name in filenames:
            if is_relevant_file(file_name):
                file_path = os.path.join(dirpath, file_name)
                with open(file_path, mode='r', encoding='utf-8') as file:
                    csv_reader = csv.DictReader(file, delimiter='\t')
                    for row in csv_reader:
                        text = row.get('Text', '').lower()
                        tokens = word_tokenize(text)
                        
                        found_terms = search_terms_set.intersection(tokens)
                        if found_terms:
                            # Collect all metadata from the row
                            context_data = {
                                "Text": row.get('Text', 'No Text Found'),
                                "Title": row.get('Title', 'No Title Found'),
                                "URL": row.get('URL', 'No URL Provided'),
                                "Date": row.get('Date', 'No Date Provided'),
                                "Category": row.get('Category', 'No Category Provided'),
                                "Newspaper": os.path.basename(dirpath),
                                "File_Name": file_name
                            }
                            # Find context around each found term
                            for term in found_terms:
                                indices = [i for i, token in enumerate(tokens) if token == term]
                                for index in indices:
                                    start = max(0, index - window)
                                    end = min(len(tokens), index + window + 1)
                                    context = ' '.join(tokens[start:end])
                                    result = context_data.copy()
                                    result.update({
                                        "Term": term,
                                        "Context": context
                                    })
                                    results.append(result)

    # Create a DataFrame and write to Excel
    df = pd.DataFrame(results)
    output_file_path = os.path.join(output_dir, "Iraq_search_results.xlsx")
    df.to_excel(output_file_path, index=False)
    print(f"Results written to {output_file_path}")

# Example usage
country_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Iraq"  # Directory containing CSV files
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"  # Output directory
search_terms = ['كوفيد', 'وباء', 'فيروس']  # Arabic terms to search for
tokenize_search_and_context_to_excel(country_path, output_dir, search_terms)


csv

In [11]:
def tokenize_search_and_context_to_csv(country_path, output_dir, search_terms, window=10):
    search_terms_set = set(search_terms)  # Prepare search terms
    results_header = ["Text", "Title", "URL", "Date", "Category", "Newspaper", "File_Name", "Term", "Context"]

    # Prepare the output CSV file
    output_file_path = os.path.join(output_dir, "Iraq_search_results.csv")
    with open(output_file_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=results_header)
        writer.writeheader()

        # Process each file in the directory
        def is_relevant_file(filename):
            print("Processing file:", filename)
            return filename.endswith('.csv')

        for dirpath, dirnames, filenames in os.walk(country_path):
            for file_name in filenames:
                if is_relevant_file(file_name):
                    file_path = os.path.join(dirpath, file_name)
                    with open(file_path, mode='r', encoding='utf-8') as infile:
                        csv_reader = csv.DictReader(infile, delimiter='\t')
                        for row in csv_reader:
                            text = row.get('Text', '').lower()
                            tokens = word_tokenize(text)
                            found_terms = search_terms_set.intersection(tokens)
                            if found_terms:
                                for term in found_terms:
                                    indices = [i for i, token in enumerate(tokens) if token == term]
                                    for index in indices:
                                        start = max(0, index - window)
                                        end = min(len(tokens), index + window + 1)
                                        context = ' '.join(tokens[start:end])
                                        # Collect all metadata from the row and the context info
                                        result = {
                                            "Text": row.get('Text', ''),
                                            "Title": row.get('Title', ''),
                                            "URL": row.get('URL', ''),
                                            "Date": row.get('Date', ''),
                                            "Category": row.get('Category', ''),
                                            "Newspaper": os.path.basename(dirpath),
                                            "File_Name": file_name,
                                            "Term": term,
                                            "Context": context
                                        }
                                        writer.writerow(result)
    print(f"Results written to {output_file_path}")

# Example usage
country_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Iraq"  # Directory containing CSV files
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"     # Output directory
search_terms = ['كوفيد', 'وباء', 'فيروس']    # Arabic terms to search for
tokenize_search_and_context_to_csv(country_path, output_dir, search_terms)


Processing file: Iraq_albayyna_new_19.csv
Processing file: Iraq_albayyna_new_20.csv
Processing file: Iraq_albayyna_new_21.csv
Processing file: Iraq_alliraqnews_19.csv
Processing file: Iraq_alliraqnews_20.csv
Processing file: Iraq_alliraqnews_21.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Iraq_search_results.csv


make json*

In [7]:
def generate_detailed_country_newspaper_json(base_path):
    country_newspapers = {}

    for root, dirs, files in os.walk(base_path):
        parts = root.split(os.sep)
        if len(parts) > 1 and parts[-1].startswith("AraNPCC_"):
            country = parts[-1].replace("AraNPCC_", "")
            newspapers = {}
            
            for file_name in files:
                if file_name.endswith('.csv'):
                    # Extract the newspaper name by removing the year and file extension
                    newspaper_name = '_'.join(file_name.split('_')[1:-1])
                    if newspaper_name not in newspapers:
                        newspapers[newspaper_name] = []
                    newspapers[newspaper_name].append(file_name)

            country_newspapers[country] = newspapers
    
    json_path = os.path.join(base_path, 'detailed_country_newspapers.json')
    with open(json_path, 'w') as json_file:
        json.dump(country_newspapers, json_file, indent=4, ensure_ascii=False)
    
    return json_path

# Example usage
base_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"  # Adjust this path to your directory structure
json_path = generate_detailed_country_newspaper_json(base_path)
print("Detailed JSON created at:", json_path)

Detailed JSON created at: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\detailed_country_newspapers.json


'كوفيد', 'وباء', 'فيروس'

In [17]:
import os
import csv
import json
from nltk.tokenize import word_tokenize

def set_max_csv_field_size():
    max_int_c_long = 2147483647
    try:
        csv.field_size_limit(max_int_c_long)
        print(f"CSV field size limit set to {max_int_c_long}")
    except OverflowError as e:
        print("OverflowError encountered while setting field size limit:", e)

def load_json_reference(json_path):
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def tokenize_search_and_context_to_csv(json_reference_path, output_dir, country_name, search_terms, window=10):
    set_max_csv_field_size()
    search_terms_set = set(search_terms)
    results_header = ["Text", "Title", "URL", "Date", "Category", "Newspaper", "File_Name", "Term", "Context"]

    # Load JSON reference
    country_newspapers = load_json_reference(json_reference_path)
    newspapers = country_newspapers.get(country_name, {})

    for newspaper, files in newspapers.items():
        output_file_name = f"{country_name}_{newspaper}_search_results.csv"
        output_file_path = os.path.join(output_dir, output_file_name)

        with open(output_file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=results_header)
            writer.writeheader()
            print(f"Creating file for {newspaper}: {output_file_name}")

            for filename in files:
                file_path = os.path.join(output_dir, f"AraNPCC_{country_name}", filename)
                if not os.path.exists(file_path):
                    print(f"File not found: {file_path}")
                    continue

                with open(file_path, mode='r', encoding='utf-8') as infile:
                    csv_reader = csv.DictReader(infile, delimiter='\t')
                    for row in csv_reader:
                        text = row.get('Text', '').lower()
                        tokens = word_tokenize(text)
                        found_terms = search_terms_set.intersection(tokens)
                        if found_terms:
                            for term in found_terms:
                                indices = [i for i, token in enumerate(tokens) if token == term]
                                for index in indices:
                                    start = max(0, index - window)
                                    end = min(len(tokens), index + window + 1)
                                    context = ' '.join(tokens[start:end])
                                    result = {
                                        "Text": row.get('Text', ''),
                                        "Title": row.get('Title', ''),
                                        "URL": row.get('URL', ''),
                                        "Date": row.get('Date', ''),
                                        "Category": row.get('Category', ''),
                                        "Newspaper": row.get('Newspaper', ''),
                                        "File_Name": filename,
                                        "Term": term,
                                        "Context": context
                                    }
                                    writer.writerow(result)
            print(f"Results written to {output_file_path}")

# Example usage
json_reference_path = r'C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\detailed_country_newspapers.json'
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"
country_name = 'Yemen'
search_terms = ['كوفيد', 'وباء', 'فيروس']
tokenize_search_and_context_to_csv(json_reference_path, output_dir, country_name, search_terms)


CSV field size limit set to 2147483647
Creating file for adenalghad: Yemen_adenalghad_search_results.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Yemen_adenalghad_search_results.csv
Creating file for aleshteraki: Yemen_aleshteraki_search_results.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Yemen_aleshteraki_search_results.csv
Creating file for almashhad: Yemen_almashhad_search_results.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Yemen_almashhad_search_results.csv
Creating file for almotamar: Yemen_almotamar_search_results.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Yemen_almotamar_search_results.csv
Creating file for alsahwa: Yemen_alsahwa_search_results.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Yemen_alsahwa_search_results.csv
Creating file for alwahdawi:

Filter articles relevant to covid 'كوفيد', "كورونا"*

In [12]:
def set_max_csv_field_size():
    max_int_c_long = 2147483647
    try:
        csv.field_size_limit(max_int_c_long)
        print(f"{datetime.now()}: CSV field size limit set to {max_int_c_long}")
    except OverflowError as e:
        print(f"{datetime.now()}: OverflowError encountered while setting field size limit:", e)

def load_json_reference(json_path):
    with open(json_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def tokenize_search_and_context_to_csv(json_reference_path, output_dir, country_name, search_terms, window=10):
    set_max_csv_field_size()
    search_terms_set = set(search_terms)
    results_header = ["Text", "Title", "URL", "Date", "Category", "Newspaper", "File_Name", "Term"]

    # Load JSON reference
    country_newspapers = load_json_reference(json_reference_path)
    newspapers = country_newspapers.get(country_name, {})

    for newspaper, files in newspapers.items():
        output_file_name = f"{country_name}_{newspaper}_search_results.csv"
        output_file_path = os.path.join(output_dir, output_file_name)
        processed_articles = set()

        with open(output_file_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.DictWriter(file, fieldnames=results_header)
            writer.writeheader()
            print(f"{datetime.now()}: Creating file for {newspaper}: {output_file_name}")

            for filename in files:
                file_path = os.path.join(output_dir, f"AraNPCC_{country_name}", filename)
                print(f"{datetime.now()}: Processing file: {file_path}")
                if not os.path.exists(file_path):
                    print(f"{datetime.now()}: File not found: {file_path}")
                    continue

                with open(file_path, mode='r', encoding='utf-8') as infile:
                    csv_reader = csv.DictReader(infile, delimiter='\t')
                    for row in csv_reader:
                        text = row.get('Text', '').lower()
                        tokens = word_tokenize(text)
                        found_terms = search_terms_set.intersection(tokens)
                        
                        article_key = (row.get('Title', ''), row.get('Date', ''), newspaper)
                        if found_terms and article_key not in processed_articles:
                            processed_articles.add(article_key)
                            result = {
                                "Text": text,
                                "Title": row.get('Title', ''),
                                "URL": row.get('URL', ''),
                                "Date": row.get('Date', ''),
                                "Category": row.get('Category', ''),
                                "Newspaper": newspaper,
                                "File_Name": filename,
                                "Term": ", ".join(found_terms)
                            }
                            writer.writerow(result)
            print(f"{datetime.now()}: Results written to {output_file_path}")

# Example usage
json_reference_path = r'C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\detailed_country_newspapers.json'
output_dir = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC"
country_name = 'Egypt'
search_terms = ['كوفيد', "كورونا"]
tokenize_search_and_context_to_csv(json_reference_path, output_dir, country_name, search_terms)

2024-05-12 13:36:11.515539: CSV field size limit set to 2147483647
2024-05-12 13:36:11.517069: Creating file for ahramgate: Egypt_ahramgate_search_results.csv
2024-05-12 13:36:11.517069: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Egypt\Egypt_ahramgate_19.csv
2024-05-12 13:37:17.111615: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Egypt\Egypt_ahramgate_20.csv
2024-05-12 13:38:58.213804: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Egypt\Egypt_ahramgate_21.csv
2024-05-12 13:40:04.143502: Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\Egypt_ahramgate_search_results.csv
2024-05-12 13:40:04.152048: Creating file for akhbarelyomgate: Egypt_akhbarelyomgate_search_results.csv
2024-05-12 13:40:04.152048: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\AraNPCC_Egypt\Egypt_akhb

clean csv files, date column*

In [13]:
def clean_date(date_str):
    # Strip out unwanted characters [' and '] from the date string
    return date_str.strip("[]'")

def process_files(directory):
    # Loop through all files in the directory
    for filename in os.listdir(directory):
        if filename.endswith('.csv'):
            file_path = os.path.join(directory, filename)
            print(f"{datetime.now()}: Processing file: {file_path}")
            
            # Read the CSV file into a DataFrame
            df = pd.read_csv(file_path)
            
            # Check if 'Date' column exists in the DataFrame
            if 'Date' in df.columns:
                # Apply the cleaning function to the 'Date' column
                df['Date'] = df['Date'].apply(clean_date)
                
                # Save the cleaned DataFrame back to CSV
                df.to_csv(file_path, index=False)
                print(f"{datetime.now()}: Cleaned and saved: {file_path}")
            else:
                print(f"{datetime.now()}: No 'Date' column found in: {file_path}")

# Specify the directory containing your CSV files
directory = r'C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles'
process_files(directory)

2024-05-12 14:54:13.854282: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_ahramgate_search_results.csv
2024-05-12 14:54:17.816873: Cleaned and saved: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_ahramgate_search_results.csv
2024-05-12 14:54:17.816873: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_akhbarelyomgate_search_results.csv
2024-05-12 14:54:31.745828: Cleaned and saved: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_akhbarelyomgate_search_results.csv
2024-05-12 14:54:31.746348: Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_alwafd_search_results.csv
2024-05-12 14:54:43.997167: Cleaned and saved: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_alwafd_search_results.csv
2024-05-12

Count articles of filtered dataset

In [3]:
def count_articles(directory):
    """ Count articles per country using directory structure and file names. """
    article_counts = {}
    # Loop over every file in the specified directory
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            # Assuming file names are in the format "Country_newspaper_search_results.csv"
            country = file_name.split('_')[0]  # Extract the country name from the file name
            file_path = os.path.join(directory, file_name)
            print(f"Processing file: {file_path}")
            
            if os.path.exists(file_path):
                try:
                    df = pd.read_csv(file_path)
                    article_count = len(df)
                    if country in article_counts:
                        article_counts[country] += article_count
                    else:
                        article_counts[country] = article_count
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
            else:
                print(f"File not found: {file_path}")

    return article_counts

def save_counts_to_csv(article_counts, output_file):
    """ Save the article counts to a CSV file. """
    df = pd.DataFrame(list(article_counts.items()), columns=['Country', 'ArticleCount'])
    df.to_csv(output_file, index=False)
    print(f"Article counts saved to {output_file}")

# Directory containing the COVID-related article CSV files
directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles"

# Path to save the output CSV file with article counts
output_csv_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\covid_article_counts.csv"

# Count articles per country
article_counts = count_articles(directory)

# Save the results to CSV
save_counts_to_csv(article_counts, output_csv_path)

Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_ahramgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_akhbarelyomgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_alwafd_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_elbalad_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_shorouk_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_youm7_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Morocco_ahdathpress_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 -

Count articles of filtered dataset over time and by newspaper*

In [8]:
def parse_date(date_str):
    """Attempt to parse the date with different expected formats."""
    for fmt in ('%d-%m-%Y', '%m-%d-%Y', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    return pd.NaT  # Return Not a Time (NaT) if all formats fail

def process_files(directory):
    results = []
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            country, newspaper = parse_filename(file_name)
            file_path = os.path.join(directory, file_name)
            print(f"Processing file: {file_path}")

            try:
                df = pd.read_csv(file_path)
                # Apply robust date parsing
                df['Date'] = df['Date'].apply(parse_date)
                df_grouped = df.groupby(df['Date'].dt.to_period('D')).size().reset_index(name='ArticleCount')
                df_grouped['Country'] = country
                df_grouped['Newspaper'] = newspaper
                results.append(df_grouped)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    
    final_df = pd.concat(results, ignore_index=True)
    return final_df

def save_results_to_csv(final_df, output_file):
    final_df.to_csv(output_file, index=False)
    print(f"Results written to {output_file}")

directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles"
output_csv_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\covid_article_counts_by_date.csv"
final_aggregated_data = process_files(directory)
save_results_to_csv(final_aggregated_data, output_csv_path)


Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_ahramgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_akhbarelyomgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_alwafd_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_elbalad_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_shorouk_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_youm7_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Morocco_ahdathpress_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 -

Specific Keyword frequency

In [2]:
def parse_filename(file_name):
    """ Extract country and newspaper from filename. """
    parts = file_name.split('_')
    country = parts[0]
    newspaper = '_'.join(parts[1:-2])  # Assuming the last part is date or sequence number
    return country, newspaper

def parse_date(date_str):
    """ Attempt to parse the date with different expected formats. """
    for fmt in ('%d-%m-%Y', '%m-%d-%Y', '%Y-%m-%d'):
        try:
            return pd.to_datetime(date_str, format=fmt)
        except ValueError:
            continue
    return pd.NaT  # Return Not a Time (NaT) if all formats fail

def process_files(directory, keywords):
    """ Process each file and count occurrences of keywords. """
    results = []
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            country, newspaper = parse_filename(file_name)
            file_path = os.path.join(directory, file_name)
            print(f"Processing file: {file_path}")

            try:
                df = pd.read_csv(file_path)
                df['Date'] = df['Date'].apply(parse_date)
                
                # Initialize counts for each keyword
                for keyword in keywords:
                    df[keyword] = df['Text'].str.contains(keyword, case=False, na=False)

                # Sum up counts by date
                df_grouped = df.groupby(df['Date'].dt.to_period('D'))[keywords].sum().reset_index()
                df_grouped['Country'] = country
                df_grouped['Newspaper'] = newspaper
                results.append(df_grouped)

            except Exception as e:
                print(f"Error processing {file_path}: {e}")
    
    # Combine all dataframes
    if results:
        final_df = pd.concat(results, ignore_index=True)
        return final_df
    else:
        return pd.DataFrame()  # Return empty dataframe if no results

def save_results_to_csv(final_df, output_file):
    """ Save the aggregated results to a CSV file. """
    final_df.to_csv(output_file, index=False)
    print(f"Results written to {output_file}")

# Directory containing CSV files
directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles"
# Output CSV path
output_csv_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\specific_keyword_frequency.csv"
# Keywords to search
keywords = ['كوفيد', 'كورونا', 'جائحة', 'وباء', 'لقاح', 'تباعد', 'عزل', 'حظر', 'تعافي', 'وفاة']

# Process files and count keyword occurrences
final_aggregated_data = process_files(directory, keywords)
# Save the results to a CSV file
save_results_to_csv(final_aggregated_data, output_csv_path)

Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_ahramgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_akhbarelyomgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_alwafd_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_elbalad_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_shorouk_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_youm7_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Morocco_ahdathpress_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 -

Global Keyword frequency

In [10]:
# Function to clean Arabic text
def clean_arabic_text(text):
    text = re.sub(r'[\u064B-\u065F]', '', text)  # Remove Arabic diacritics
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    return text

# Load stop words
def load_stop_words(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        stop_words = file.read().splitlines()
    return set(stop_words)

# Function to calculate keyword frequencies
def calculate_keyword_frequencies(directory, stop_words):
    keyword_frequencies_by_country = {}

    # Iterate through all files in the specified directory
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            file_path = os.path.join(directory, file_name)
            country_name = file_name.split('_')[0]  # Extract country name from the file name
            try:
                df = pd.read_csv(file_path)
                df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
                if country_name not in keyword_frequencies_by_country:
                    keyword_frequencies_by_country[country_name] = {}
                for month, group in df.groupby('Month'):
                    texts = group['Text'].dropna()  # Drop missing values
                    monthly_frequencies = Counter()
                    for text in texts:
                        text = clean_arabic_text(text)
                        tokens = word_tokenize(text)
                        filtered_tokens = [token for token in tokens if token not in stop_words]
                        monthly_frequencies.update(filtered_tokens)
                    keyword_frequencies_by_country[country_name][str(month)] = Counter({word: count for word, count in monthly_frequencies.items() if count >= 10})
            except Exception as e:
                print(f"Failed to process {file_path}: {e}")
    
    return keyword_frequencies_by_country

# Function to save the frequencies to an Excel file
def save_frequencies_to_excel(keyword_frequencies_by_country, output_file):
    workbook = Workbook()
    for country, monthly_frequencies in keyword_frequencies_by_country.items():
        sheet = workbook.create_sheet(title=country)
        sheet.append(['Month', 'Keyword', 'Frequency'])
        for month, frequencies in monthly_frequencies.items():
            for keyword, frequency in frequencies.items():
                sheet.append([month, keyword, frequency])
    
    # Remove the default sheet created by Workbook
    default_sheet = workbook['Sheet']
    workbook.remove(default_sheet)

    workbook.save(output_file)
    print(f"Frequencies saved to {output_file}")

# Usage
directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles"  # Directory containing the CSV files
stop_words_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\stop_words.txt"  # Path to the stop words file
output_excel_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\global_keyword_frequencies.xlsx"

stop_words = load_stop_words(stop_words_path)
keyword_frequencies_by_country = calculate_keyword_frequencies(directory, stop_words)
save_frequencies_to_excel(keyword_frequencies_by_country, output_excel_path)

  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.to_period('M')
  df['Month'] = pd.to_datetime(df['Date'], errors='coerce').dt.t

Frequencies saved to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\global_keyword_frequencies.xlsx


attmept at collocates over time

In [20]:
def parse_filename(file_name):
    """ Extract country and newspaper from filename. Assumes format Country_Newspaper_Date.csv """
    parts = file_name.split('_')
    country = parts[0]
    newspaper = parts[1]
    return country, newspaper

def find_collocations(text, keyword, window_size):
    """ Find collocations around a specified keyword within the given window size. """
    text = strip_tashkeel(text)
    tokens = word_tokenize(text)
    collocations = Counter()
    for i, token in enumerate(tokens):
        if token == keyword:
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            window_tokens = tokens[start:i] + tokens[i+1:end]
            for gram in window_tokens:
                collocations[gram] += 1
    return collocations

def process_files(directory, keyword, window_size):
    """ Process each file to find collocations for the specified keyword, aggregated by month. """
    results = []
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            country, newspaper = parse_filename(file_name)
            file_path = os.path.join(directory, file_name)
            print(f"Processing file: {file_path}")

            try:
                df = pd.read_csv(file_path)
                df['Country'] = country  # Add country information to DataFrame
                df['Newspaper'] = newspaper  # Add newspaper information to DataFrame
                df['Month'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True).dt.to_period('M')
                grouped = df.groupby(['Month', 'Country', 'Newspaper'])
                for (month, country, newspaper), group in grouped:
                    all_text = ' '.join(group['Text'].dropna())
                    collocations = find_collocations(all_text, keyword, window_size)
                    # Flatten the collocations into columns
                    col_dict = {f"Top {i+1}": f"{word} ({count})" for i, (word, count) in enumerate(collocations.most_common(10))}
                    results.append({
                        'Month': str(month),
                        'Country': country,
                        'Newspaper': newspaper,
                        **col_dict
                    })
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    return pd.DataFrame(results)

def save_results_to_csv(results_df, output_file):
    """ Save the results to a CSV file. """
    if results_df.empty:
        print("No data to save. Check the processing steps.")
        return
    
    results_df.to_csv(output_file, index=False, encoding='utf-8')
    print(f"Results written to {output_file}")

# Directory and parameters setup
directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test"
keyword = 'فيروس'  # Focus on 'virus' as the keyword
window_size = 1  # Number of words before and after the keyword
output_csv_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\collocation_test.csv"

# Process files and save results
final_results = process_files(directory, keyword, window_size)
save_results_to_csv(final_results, output_csv_path)

Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Morocco_alalam_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_aleshteraki_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_alsahwa_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_alwahdawi_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_saadahpress_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_samaa_search_results.csv
Results written to C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\collocation_test.csv


collocation frequency by country

In [4]:
def parse_filename(file_name):
    """ Extract country from filename. Assumes format Country_Newspaper_Date.csv """
    return file_name.split('_')[0]

def find_collocations(text, keyword, window_size):
    """ Find collocations around a specified keyword within the given window size. """
    text = strip_tashkeel(text)
    tokens = word_tokenize(text)
    collocations = Counter()
    for i, token in enumerate(tokens):
        if token == keyword:
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            window_tokens = tokens[start:i] + tokens[i+1:end]
            for gram in window_tokens:
                collocations[gram] += 1
    return collocations

def process_files(directory, keyword, window_size):
    """ Process each file to find collocations for the specified keyword, summed by country. """
    country_collocations = {}
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            country = parse_filename(file_name)
            file_path = os.path.join(directory, file_name)
            print(f"Processing file: {file_path}")

            try:
                df = pd.read_csv(file_path)
                all_text = ' '.join(df['Text'].dropna())
                collocations = find_collocations(all_text, keyword, window_size)
                if country not in country_collocations:
                    country_collocations[country] = Counter()
                country_collocations[country].update(collocations)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    return country_collocations

def save_results_to_excel(country_collocations, output_file):
    """ Save the results to an Excel file with each country's collocations on separate sheets. """
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        for country, collocations in country_collocations.items():
            df = pd.DataFrame(collocations.items(), columns=['Collocation', 'Frequency'])
            df.sort_values(by='Frequency', ascending=False, inplace=True)
            df.to_excel(writer, sheet_name=country, index=False)
            print(f"Results for {country} written to sheet in {output_file}")

# Directory and parameters setup
directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test"
keyword = 'فيروس'  # Focus on 'virus' as the keyword
window_size = 2  # Number of words before and after the keyword
output_excel_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\collocation_test_2.xlsx"

# Process files and save results
country_collocations = process_files(directory, keyword, window_size)
save_results_to_excel(country_collocations, output_excel_path)

Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Morocco_alalam_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_aleshteraki_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_alsahwa_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_alwahdawi_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_saadahpress_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\test\Yemen_samaa_search_results.csv
Results for Morocco written to sheet in C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\collocation_test_2.xlsx
Results for Yemen written to sh

collocation frequency by country by month

In [11]:
def parse_filename(file_name):
    """ Extract country from filename. Assumes format Country_Newspaper_Date.csv """
    return file_name.split('_')[0]

def clean_token(token, punctuation):
    """ Clean token by removing leading and trailing punctuation. """
    return re.sub(r'^[' + punctuation + ']+|[' + punctuation + ']+$', '', token)

def load_stop_words(file_path):
    """ Load stop words from a file. """
    with open(file_path, 'r', encoding='utf-8') as file:
        stop_words = set(file.read().splitlines())
    return stop_words

def find_collocations(text, keyword, window_size, stop_words):
    """ Find collocations around a specified keyword within the given window size, ignoring punctuation and stop words. """
    text = strip_tashkeel(text)
    tokens = word_tokenize(text)
    collocations = Counter()
    punctuation = string.punctuation + "“”‘’—،–"

    for i, token in enumerate(tokens):
        cleaned_token = clean_token(token, punctuation)
        if cleaned_token == keyword:
            start = max(0, i - window_size)
            end = min(len(tokens), i + window_size + 1)
            # Exclude tokens that are entirely punctuation or stop words
            window_tokens = [clean_token(t, punctuation) for t in tokens[start:i] + tokens[i+1:end] if not all(char in punctuation for char in t) and t not in stop_words]
            for gram in window_tokens:
                if gram:  # Ensure it's not empty after cleaning
                    collocations[gram] += 1
    return collocations

def process_files(directory, keyword, window_size, stop_words):
    """ Process each file to find collocations for the specified keyword, summed by country and month, ignoring punctuation. """
    country_collocations = defaultdict(lambda: defaultdict(Counter))
    for file_name in os.listdir(directory):
        if file_name.endswith('.csv'):
            country = parse_filename(file_name)
            file_path = os.path.join(directory, file_name)
            print(f"Processing file: {file_path}")

            try:
                df = pd.read_csv(file_path)
                df['Month'] = pd.to_datetime(df['Date'], errors='coerce', dayfirst=True).dt.to_period('M')
                for (month), group in df.groupby('Month'):
                    all_text = ' '.join(group['Text'].dropna())
                    collocations = find_collocations(all_text, keyword, window_size, stop_words)
                    country_collocations[country][month].update(collocations)
            except Exception as e:
                print(f"Error processing {file_path}: {e}")

    return country_collocations

def save_results_to_excel(country_collocations, output_file):
    """ Save the results to an Excel file with each country's collocations on separate sheets, organized by month. """
    with pd.ExcelWriter(output_file, engine='xlsxwriter') as writer:
        for country, months_data in country_collocations.items():
            rows = []
            for month, collocations in months_data.items():
                for col, freq in collocations.most_common(10):
                    rows.append({
                        'Month': str(month),
                        'Collocation': col,
                        'Frequency': freq
                    })
            if rows:
                df = pd.DataFrame(rows)
                df.sort_values(by=['Month', 'Frequency'], ascending=[True, False], inplace=True)
                df.to_excel(writer, sheet_name=country, index=False)
                print(f"Results for {country} written to sheet in {output_file}")
            else:
                print(f"No data for {country}.")

# Directory and parameters setup
directory = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles"
keyword = 'فيروس'  # Focus on 'virus' as the keyword
window_size = 5  # Number of words before and after the keyword
output_excel_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\collocations.xlsx"
stop_words_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\stop_words.txt"  # Path to the stop words file

# Load stop words
arabic_stop_words = load_stop_words(stop_words_path)

# Process files and save results
country_collocations = process_files(directory, keyword, window_size, arabic_stop_words)
save_results_to_excel(country_collocations, output_excel_path)

Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_ahramgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_akhbarelyomgate_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_alwafd_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_elbalad_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_shorouk_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Egypt_youm7_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\COVID_Articles\Morocco_ahdathpress_search_results.csv
Processing file: C:\Users\khali\OneDrive\AUS\Classes\7 -

In [None]:
# Function to convert date strings to proper date format
def convert_date_format(date_str):
    try:
        # Convert 'YYYY-MM' to a date format
        return pd.to_datetime(date_str, format='%Y-%m')
    except ValueError:
        return pd.NaT

# Function to process the Excel file and update the date column
def process_excel_file(file_path, sheet_name, date_column):
    # Load the Excel file
    df = pd.read_excel(file_path, sheet_name=sheet_name)
    
    # Apply the date conversion function to the specified column
    df[date_column] = df[date_column].apply(convert_date_format)
    
    # Save the updated DataFrame back to the same Excel file using xlsxwriter
    with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
        df.to_excel(writer, sheet_name=sheet_name, index=False)
    print(f"Dates converted and saved to {file_path}")

# Specify the path to the Excel file, sheet name, and the column to process
file_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\global_keyword_frequencies.xlsx"  # Change to your file path
sheet_name = "Morocco"  # Change to your sheet name
date_column = "Month"   # Change to your date column name

# Process the Excel file
process_excel_file(file_path, sheet_name, date_column)

In [None]:
# Function to convert date strings to proper date format
def convert_date_format(date_str):
    try:
        # Convert 'YYYY-MM' to a date format
        return pd.to_datetime(date_str, format='%Y-%m')
    except ValueError:
        return pd.NaT

# Function to process the Excel file and update the date column
def process_excel_file(file_path, sheet_names, date_column):
    # Create a dictionary to store dataframes for each sheet
    sheets_data = {}

    # Load the Excel file and process each sheet
    for sheet_name in sheet_names:
        try:
            df = pd.read_excel(file_path, sheet_name=sheet_name, engine='openpyxl')
            # Apply the date conversion function to the specified column
            df[date_column] = df[date_column].apply(convert_date_format)
            sheets_data[sheet_name] = df
            print(f"Processed sheet: {sheet_name}")
        except Exception as e:
            print(f"Failed to process sheet {sheet_name}: {e}")

    # Write the updated DataFrames to the same Excel file
    with pd.ExcelWriter(file_path, engine='xlsxwriter') as writer:
        for sheet_name, df in sheets_data.items():
            df.to_excel(writer, sheet_name=sheet_name, index=False)
            print(f"Saved sheet: {sheet_name}")

    print(f"Dates converted and saved to {file_path}")

# Specify the path to the Excel file, sheet names, and the column to process
file_path = r"C:\Users\khali\OneDrive\AUS\Classes\7 - S24\ARA 250\Project\AraNPCC\global_keyword_frequencies.xlsx"  # Change to your file path
sheet_names = ["Egypt", "Morocco", "Yemen"]  # List of sheet names to process
date_column = "Month"   # Change to your date column name

# Process the Excel file
process_excel_file(file_path, sheet_names, date_column)