In [1]:
import pandas as pd
from collections import defaultdict

In [2]:
# Lookup table from the codes / slugs seen in the data to display names.
# Built once at definition time instead of on every call, because the function
# is invoked once per row. Keys are matched exactly (case-sensitive), which is
# why many codes are listed in both upper and lower case.
_ABBREVIATION_NAMES = {
    "AUT": "Austria",
    "THA": "Thailand",
    "CHE": "Switzerland",
    "ZAF": "South Africa",
    "ESP": "Spain",
    "ECB": "European Central Bank",
    "ecb": "European Central Bank",
    "OPEC": "Organization of the Petroleum Exporting Countries",
    "Saudi": "Saudi Arabia",
    "France": "France",
    "Argentina": "Argentina",
    "Australia": "Australia",
    "OECD": "Organisation for Economic Co-operation and Development",
    "oecd": "Organisation for Economic Co-operation and Development",
    "Spain": "Spain",
    "EC": "European Commission",
    "Italy": "Italy",
    "Brazil": "Brazil",
    "Netherlands": "Netherlands",
    "Austria": "Austria",
    "japan": "Japan",
    "Switzerland": "Switzerland",
    "SA": "South Africa",
    "Germany": "Germany",
    "UAE": "United Arab Emirates",
    "uae": "United Arab Emirates",
    "Gabon": "Gabon",
    "gab": "Gabon",
    "GAB": "Gabon",
    "China": "China",
    "UK": "United Kingdom",
    "Russia": "Russia",
    "india": "India",
    "Thailand": "Thailand",
    "Algeria": "Algeria",
    "EUcouncil": "European Council",
    "Norway": "Norway",
    "usa": "United States",
    "Ghana": "Ghana",
    "Djibouti": "Djibouti",
    "djibouti-fr": "Djibouti (French)",
    "Saudi-Arabia": "Saudi Arabia",
    "eu-council-of-economic-and-finance-ministers": "Council of Economic and Finance Ministers (EU)",
    "Chile": "Chile",
    "european-commission": "European Commission",
    "ec": "European Commission",
    "chl": "Chile",
    "Canada": "Canada",
    "ILO": "International Labour Organization",
    "ilo": "International Labour Organization",
    "United-Kingdom": "United Kingdom",
    # Misspelled key kept on purpose: it only matches if the input is misspelled the same way
    "Switzterland": "Switzerland",
    "Finland": "Finland",
    "India": "India",
    "united-nations": "United Nations",
    "Japan": "Japan",
    "Belgium": "Belgium",
    "Singapore": "Singapore",
    "united-states": "United States",
    "Nigeria": "Nigeria",
    "united-arab-emirates": "United Arab Emirates",
    "saudi-arabia": "Saudi Arabia",
    "Korea": "Republic of Korea",
    "cote-divoire": "Côte d'Ivoire",
    "civ-f": "Côte d'Ivoire (French)",
    "Sweden": "Sweden",
    "canada-f": "Canada (French)",
    "UNCTAD": "United Nations Conference on Trade and Development",
    "unctad": "United Nations Conference on Trade and Development",
    "kingdom-of-the-netherlands": "Kingdom of the Netherlands",
    "Hungary": "Hungary",
    "Indonesia": "Indonesia",
    "peoples-republic-of-china": "China",
    "republic-of-korea": "Republic of Korea",
    "democratic-republic-of-the-congo": "Democratic Republic of the Congo",
    "south-africa": "South Africa",
    "Malaysia": "Malaysia",
    "Denmark": "Denmark",
    "UN": "United Nations",
    "un": "United Nations",
    "tur": "Turkey",
    "sau": "Saudi Arabia",
    "jpn": "Japan",
    "dza": "Algeria",
    "DZA": "Algeria",
    "can": "Canada",
    "aus": "Australia",
    "ita": "Italy",
    "ITA": "Italy",
    "bra": "Brazil",
    "arg": "Argentina",
    "ARG": "Argentina",
    "nld": "Netherlands",
    "NLD": "Netherlands",
    "fra": "France",
    "FRA": "France",
    "che": "Switzerland",
    "zaf": "South Africa",
    # NOTE(review): ISO 3166 "COG" is the Republic of the Congo, not the DRC
    # ("COD"). Value left unchanged -- confirm against the source files.
    "cog": "Democratic Republic of the Congo",
    "nor": "Norway",
    "esp": "Spain",
    "deu": "Germany",
    "rus": "Russia",
    "ind": "India",
    "IND": "India",
    "gbr": "United Kingdom",
    "tha": "Thailand",
    "NGA": "Nigeria",
    "ARE": "United Arab Emirates",
    "TUR": "Turkey",
    "JPN": "Japan",
    "wto": "World Trade Organization",
    "opec": "Organization of the Petroleum Exporting Countries",
    "BRA": "Brazil",
    "BEL": "Belgium",
    "bel": "Belgium",
    "AUS": "Australia",
    "GBR": "United Kingdom",
    "united-kingdom": "United Kingdom",
    "ECOFIN": "Economic and Financial Affairs Council",
    "ecofin": "Economic and Financial Affairs Council",
    "FIN": "Finland",
    "USA": "United States",
    "CAN": "Canada",
    "DEU": "Germany",
    "CHN": "China",
    "are": "United Arab Emirates",
    "WTO": "World Trade Organization",
    "SWE": "Sweden",
    "swe": "Sweden",
    "KOR": "Republic of Korea",
    "kor": "Republic of Korea",
    "COD": "Democratic Republic of the Congo",
    "cod": "Democratic Republic of the Congo",
    "CZE": "Czech Republic",
    "cze": "Czech Republic",
    "nga": "Nigeria",
    "PER": "Peru",
    "per": "Peru",
    "IDN": "Indonesia",
    "idn": "Indonesia",
    "chn": "China",
    "EST": "Estonia",
    "est": "Estonia",
    "CMR": "Cameroon",
    "cmr": "Cameroon",
    "SGP": "Singapore",
    "sgp": "Singapore",
    "BFA": "Burkina Faso",
    "bfa": "Burkina Faso",
    "DNK": "Denmark",
    "dnk": "Denmark",
    "MYS": "Malaysia",
    "mys": "Malaysia",
    "FSB": "Financial Stability Board",
    "fsb": "Financial Stability Board",
    "LTU": "Lithuania",
    "WB": "World Bank",
    "wb": "World Bank",
    "IMF": "International Monetary Fund",
    "imf": "International Monetary Fund",
    "EU": "European Union",
    "eu": "European Union",
    "fin": "Finland",
    "col": "Colombia",
    "fsf": "Financial Stability Forum",
    "imfe": "International Monetary and Financial Committee",
    # Codes carrying a trailing "e" resolve to the same names as the plain codes
    "uaee": "United Arab Emirates",
    "gbre": "United Kingdom",
    "inde": "India",
    "ause": "Australia",
    "thae": "Thailand",
    "deue": "Germany",
    "nore": "Norway",
    "cane": "Canada",
    "ruse": "Russia",
    "chee": "Switzerland",
    "bele": "Belgium",
    "usae": "United States",
    "itae": "Italy",
    "jpne": "Japan",
    "wbe": "World Bank",
    "eue": "European Union",
}


def translate_abbreviations(abbreviations):
    """Translate a sequence of codes / slugs into display names.

    Parameters
    ----------
    abbreviations : iterable
        Codes to translate. Lookup is exact and case-sensitive.

    Returns
    -------
    list
        One element per input, in the same order. Inputs with no entry in
        the lookup table (including None or NaN) are returned unchanged.
    """
    return [_ABBREVIATION_NAMES.get(abbr, abbr) for abbr in abbreviations]




In [3]:
# Constituency code -> names of the members that belong to it.
# NOTE(review): these look like IMF Executive Board constituency codes -- confirm.
# Some members are listed under several spellings (e.g. "Korea" and
# "Republic of Korea", "UK" / "Uk") because matching below is exact.
constituencies = {
    "AE": ["Angola", "Botswana", "Burundi", "Eritrea", "Ethiopia", "The Gambia", "Kenya", "Lesotho", "Liberia", "Malawi", "Mozambique", "Namibia", "Nigeria", "Sierra Leone", "Somalia", "South Africa", "South Sudan", "Sudan", "Eswatini", "Tanzania", "Uganda", "Zambia", "Zimbabwe"],
    "AF": ["Benin", "Burkina Faso", "Cameroon", "C.A.R.", "Chad", "Comoros", "D.R. Congo","Democratic Republic of the Congo", "Rep. Congo", "Côte d'Ivoire", "Djibouti", "Equatorial Guinea", "Gabon", "Guinea", "Guinea Bissau", "Madagascar", "Mali", "Mauritania", "Mauritius", "Niger", "Rwanda", "São Tomé & Príncipe", "Senegal", "Togo"],
    "AG": ["Argentina", "Bolivia", "Chile", "Paraguay", "Peru", "Uruguay"],
    "AP": ["Australia", "Kiribati", "Korea","Republic of Korea", "Marshall Islands", "Federated States of Micronesia", "Mongolia", "Nauru", "New Zealand", "Palau", "Papua New Guinea", "Samoa", "Seychelles", "Solomon Islands", "Tuvalu", "Vanuatu"],
    "BR": ["Brazil", "Cabo Verde", "Dominican Republic", "Ecuador", "Guyana", "Haiti", "Nicaragua", "Panama", "Suriname", "Timor-Leste", "Trinidad and Tobago"],
    "CC": ["China"],
    "CE": ["Colombia", "Costa Rica", "El Salvador", "Guatemala", "Honduras", "Mexico", "Spain", "República Bolivariana de Venezuela"],
    "CO": ["Antigua and Barbuda", "The Bahamas", "Barbados", "Belize", "Canada", "Dominica", "Grenada", "Ireland", "Jamaica", "St. Kitts and Nevis", "St. Lucia", "St. Vincent and the Grenadines"],
    "EC": ["Austria", "Belarus", "Czech Republic", "Hungary", "Kosovo", "Slovak Republic", "Slovenia", "Turkey"],
    "FF": ["France"],
    "GR": ["Germany"],
    "IN": ["Bangladesh", "Bhutan", "India", "Sri Lanka"],
    "IT": ["Albania", "Greece", "Italy", "Malta", "Portugal", "San Marino"],
    "JA": ["Japan"],
    "MD": ["Afghanistan", "Algeria", "Ghana", "Islamic Republic of Iran", "Libya", "Morocco", "Pakistan", "Tunisia"],
    "MI": ["Bahrain", "Egypt", "Iraq", "Jordan", "Kuwait", "Lebanon", "Maldives", "Oman", "Qatar", "United Arab Emirates",'Uae', "Republic of Yemen"],
    "NE": ["Andorra", "Armenia", "Belgium", "Bosnia and Herzegovina", "Bulgaria", "Croatia", "Cyprus", "Georgia", "Israel", "Luxembourg", "Moldova", "Montenegro", "Netherlands", "Republic of North Macedonia", "Romania", "Ukraine"],
    "NO": ["Denmark", "Estonia", "Finland", "Iceland", "Latvia", "Lithuania", "Norway", "Sweden"],
    "RU": ["Russia", "Syrian Arab Republic"],
    "SA": ["Saudi Arabia"],
    "ST": ["Brunei Darussalam", "Cambodia", "Fiji", "Indonesia", "Lao People's Democratic Republic", "Malaysia", "Myanmar", "Nepal", "Philippines", "Singapore", "Thailand", "Tonga", "Vietnam"],
    "SZ": ["Azerbaijan", "Kazakhstan", "Kyrgyz Republic", "Poland", "Serbia", "Switzerland", "Tajikistan", "Turkmenistan", "Uzbekistan"],
    "UK": ["United Kingdom",'UK','Uk'],
    "US": ["United States"]

}

def assign_constituency(country_name):
    """Return the constituency code whose member list contains ``country_name``.

    Parameters
    ----------
    country_name : str
        Name to look up. Surrounding whitespace is ignored; otherwise the
        match against the member lists is exact and case-sensitive.

    Returns
    -------
    str
        The key of the first entry in ``constituencies`` that lists the name,
        or ``"OBS"`` when no entry does. Non-string input (for example None,
        or the float NaN pandas uses for missing values) also yields ``"OBS"``.
    """
    # A missing value has no .strip() and cannot match any name, so treat it
    # like any other unknown name instead of raising AttributeError.
    if not isinstance(country_name, str):
        return "OBS"
    country_name = country_name.strip()
    for constituency, countries in constituencies.items():
        if country_name in countries:
            return constituency
    return "OBS"


In [10]:
# Build the cleaned table in one pass:
#   1. read the processed CSV and discard the local file path column,
#   2. take the text between a "/" and ".pdf" in each link and translate it
#      to a display name,
#   3. map that name to its constituency code ("OBS" when it is not listed),
#   4. discard the link column once both derived columns exist.
data = (
    pd.read_csv("csvs//processed_data.csv")
    .drop(columns=['File_Path'])
    .assign(**{
        'Region/Authority': lambda frame: (
            frame['Link']
            .str.extract(r'/([^/]+)(?=\.pdf)', expand=False)
            .map(lambda stem: translate_abbreviations([stem])[0])
        )
    })
    .assign(Constituency=lambda frame: frame['Region/Authority'].map(assign_constituency))
    .drop(columns=['Link'])
)

In [11]:
# Persist the cleaned table next to the input file. A forward slash is used
# because it is accepted as a separator on Windows as well as POSIX, whereas a
# backslash is just an ordinary filename character outside Windows.
data.to_csv('csvs/Cleaned_constituency_data.csv', index=False)

In [7]:
import plotly.express as px

# Start from the cleaned table built above and keep only rows that were matched
# to a constituency ("OBS" is the fallback code for unmatched names). The
# explicit .copy() makes `df` an independent frame, so columns added to it in
# later cells neither trigger SettingWithCopyWarning nor touch `data`.
df = data[data['Constituency'] != 'OBS'].copy()

# Number of rows per constituency, most frequent first. Naming the index and
# the value column explicitly keeps the result independent of what the
# installed pandas version calls the value_counts() output.
grouped_df = (
    df['Constituency']
    .value_counts()
    .rename_axis('Constituency')
    .reset_index(name='Count')
)

# Bar chart of the row counts, with bars coloured by their height
fig = px.bar(
    grouped_df,
    x='Constituency',
    y='Count',
    title='Rows Grouped by Constituency',
    labels={'Constituency': 'Constituency', 'Count': 'Number of Rows'},
    color='Count',
)

# Show the plot
fig.show()

In [12]:
# Whitespace-separated word count of each row's extracted text. Missing text
# is replaced by an empty string first so it counts as 0 words; casting NaN
# straight to str would yield the literal "nan" and count as 1 word.
# assign() returns a new frame, so no column is written into a filtered slice.
df = df.assign(
    Word_Count=df['Extracted_Text'].fillna('').astype(str).str.split().str.len()
)

# Mean, longest and shortest text length per constituency
agg_df = (
    df.groupby('Constituency')['Word_Count']
    .agg(['mean', 'max', 'min'])
    .reset_index()
)

# Grouped bars: one bar per statistic for every constituency
fig2 = px.bar(
    agg_df,
    x='Constituency',
    y=['mean', 'max', 'min'],
    title='Word Count Stats by Constituency',
    labels={'value': 'Word Count', 'variable': 'Statistic'},
    barmode='group',
)

fig2.show()