# In [6]:
import pandas as pd
import re
import numpy as np

# Read the dashboard workbook into the working DataFrame used by every step below
source_workbook = 'part 2 dashboard data.xlsx'
df = pd.read_excel(source_workbook)

# Text normalisation helper
def clean_text(text):
    """Reduce a string to ASCII with single-spaced, trimmed whitespace.

    Non-string values (e.g. NaN or None coming from empty cells) are returned
    unchanged so the function can be applied to a whole column.

    Steps, in order:
      1. Characters in the U+0080-U+00FF range are deleted outright, with no
         replacement (so "café" becomes "caf").
      2. Every remaining run of non-ASCII characters becomes one space.
      3. Runs of whitespace collapse to one space and the ends are stripped.
    """
    if not isinstance(text, str):
        return text

    without_latin1 = re.sub(r'[\x80-\xFF]', '', text)
    ascii_only = re.sub(r'[^\x00-\x7F]+', ' ', without_latin1)
    single_spaced = re.sub(r'\s+', ' ', ascii_only)
    return single_spaced.strip()

# Normalise both free-text columns in place with clean_text
for text_column in ('Title', 'Summary'):
    df[text_column] = df[text_column].apply(clean_text)

# Country lookup: names searched for in article text, in priority order
G20_COUNTRIES = [
    "Argentina", "Australia", "Brazil", "Canada", "China",
    "France", "Germany", "India", "Indonesia", "Italy",
    "Japan", "Mexico", "Russia", "Saudi Arabia", "South Africa",
    "South Korea", "Turkey", "United Kingdom", "United States", "European Union",
]


def extract_country(text):
    """Return the first G20_COUNTRIES entry found in ``text``.

    Matching is a case-insensitive substring test, so a name also matches
    inside longer words (e.g. "Indiana" yields "India"). When several names
    occur, the one listed earliest in G20_COUNTRIES wins, regardless of where
    it appears in the text.

    Returns "Other/Unidentified" for non-string input or when nothing matches.
    """
    if not isinstance(text, str):
        return "Other/Unidentified"

    haystack = text.lower()
    matches = (name for name in G20_COUNTRIES if name.lower() in haystack)
    return next(matches, "Other/Unidentified")

# Assign country based on title and summary.
#
# Only rows whose Country is the placeholder "Unknown" are touched. The title
# is searched first; the summary is consulted only when the title yields no
# match. (Previously the title pass overwrote every "Unknown" with either a
# country or "Other/Unidentified", so the follow-up summary pass, which looked
# for "Unknown" again, could never run.)
def _infer_country(row):
    """Return the row's Country, inferring it from Title then Summary if "Unknown"."""
    if row['Country'] != "Unknown":
        return row['Country']
    from_title = extract_country(row['Title'])
    if from_title != "Other/Unidentified":
        return from_title
    # Falls through to "Other/Unidentified" when the summary has no match either.
    return extract_country(row['Summary'])


df['Country'] = df.apply(_infer_country, axis=1)

# Updated Category Classification
# Dict order is the priority order: the first category with a matching keyword wins.
CATEGORY_MAPPING = {
    "Conflicts & Military Operations": ["war", "invasion", "attack", "military strike", "airstrike", "border clash", "drone strike", "frontline"],
    "Defense Budgets, Sales, Procurement, and In-country Defense-Industrial Base": ["arms deal", "procurement", "weapons sale", "contract", "military purchase", "export", "defense budget", "military spending", "funding", "defense allocation"],
    "Geopolitics, Partnerships, & Alliances": ["alliance", "cooperation", "joint drills", "military pact", "strategic partnership", "bilateral agreement", "NATO", "Quad", "defense strategy", "deterrence", "national security policy"],
    "Weapon Testing, Military Capability/Drills": ["missile test", "nuclear test", "hypersonic", "ballistic missile", "live-fire drills", "military exercises", "capability demonstration"]
}


def _keyword_pattern(keywords):
    """Compile one case-insensitive, whole-word regex matching any of ``keywords``.

    Words inside a multi-word keyword may be separated by any whitespace, and
    a keyword may carry a common inflection suffix (s, es, ed, ing).
    """
    alternatives = "|".join(
        r"\s+".join(re.escape(word) for word in keyword.split())
        for keyword in keywords
    )
    return re.compile(r"\b(?:" + alternatives + r")(?:s|es|ed|ing)?\b", re.IGNORECASE)


# Compiled once at import time; keeps CATEGORY_MAPPING's priority order.
_CATEGORY_PATTERNS = [
    (category, _keyword_pattern(keywords))
    for category, keywords in CATEGORY_MAPPING.items()
]


def classify_category(text):
    """Return the first CATEGORY_MAPPING category with a keyword in ``text``.

    Matching is case-insensitive and on whole words. This fixes two defects
    of plain substring matching against lowercased text:
      * keywords written with capitals ("NATO", "Quad") could never match;
      * short keywords matched inside unrelated words ("war" in "toward",
        "software", "award").
    Compounds such as "warship" are therefore no longer treated as "war".

    Returns "Other/Unclassified" for non-string input or when nothing matches.
    """
    if isinstance(text, str):
        for category, pattern in _CATEGORY_PATTERNS:
            if pattern.search(text):
                return category
    return "Other/Unclassified"

# Classify each article: the title decides, and the summary is only consulted
# when the title comes back as "Other/Unclassified".
def _category_for_row(row):
    """Return the category from the row's Title, falling back to its Summary."""
    from_title = classify_category(row['Title'])
    if from_title == "Other/Unclassified":
        return classify_category(row['Summary'])
    return from_title


df['Category'] = df.apply(_category_for_row, axis=1)

# Stance Analysis Keywords
STANCE_KEYWORDS = {
    "Pro": ["strengthening", "enhancing", "successful", "advancing", "progress", "secured"],
    "Neutral": ["meeting", "announced", "discussed", "held talks", "agreement", "report"],
    "Against": ["criticized", "condemned", "violations", "escalation", "protest", "tensions"]
}


def analyze_stance(text):
    """Return "Pro", "Against" or "Neutral" for ``text`` by keyword lookup.

    Matching is a case-insensitive substring test against STANCE_KEYWORDS.
    Polar stances are checked in dict order ("Pro" before "Against") and
    take precedence over "Neutral". "Neutral" is already the fallback, so
    testing its keywords first only hid "Against" matches: a text containing
    both "announced" and "condemned" used to come back as "Neutral".

    Returns "Neutral" for non-string input or when no polar keyword matches.
    """
    if isinstance(text, str):
        text_lower = text.lower()
        for stance, keywords in STANCE_KEYWORDS.items():
            if stance == "Neutral":
                # Neutral is the default result; it must not shadow polar stances.
                continue
            if any(keyword in text_lower for keyword in keywords):
                return stance
    return "Neutral"

# Score stance per article: use the title's stance unless it is "Neutral",
# in which case the summary's stance is used instead.
def _stance_for_row(row):
    """Return the stance from the row's Title, falling back to its Summary."""
    from_title = analyze_stance(row['Title'])
    if from_title == "Neutral":
        return analyze_stance(row['Summary'])
    return from_title


df['Stance'] = df.apply(_stance_for_row, axis=1)

# Importance ranking: score each article by the sum of its summary's TF-IDF weights
from sklearn.feature_extraction.text import TfidfVectorizer

# Missing summaries become empty strings so they can be vectorized
summary_texts = df['Summary'].fillna('')

vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(summary_texts)

# Collapse each row of the term matrix to one number and flatten to 1-D
row_totals = np.asarray(X.sum(axis=1)).ravel()
df['Importance_Score'] = row_totals

# Highest score gets rank 1; tied scores all receive the largest rank in their group
df['Importance_Rank'] = df['Importance_Score'].rank(ascending=False, method='max')

# Save cleaned and classified data.
# The path is held in one variable so the confirmation message always names
# the file that was actually written (it previously reported
# "Defense_Analysis_Cleaned.csv", which this script never creates).
# NOTE(review): "dasboard" looks like a typo for "dashboard"; left unchanged
# in case downstream consumers rely on the existing file name - confirm.
OUTPUT_CSV = 'Data for dasboard.csv'
df.to_csv(OUTPUT_CSV, index=False)
print("✅ Data cleaning, categorization, stance analysis, and importance ranking complete. File saved as " + OUTPUT_CSV)


# Recorded cell output:
# ✅ Data cleaning, categorization, stance analysis, and importance ranking complete. File saved as Defense_Analysis_Cleaned.csv
