In [66]:
import matplotlib.pyplot as plt 
import pickle
import networkx as nx
import re
import os
import json
import os
from nltk.tokenize import word_tokenize
import json
from tqdm import tqdm

In [67]:
# load graph object from file
G = pickle.load(open('G_politicans.pickle', 'rb'))

In [69]:
# Initialize an empty dictionary
word_dict = {}

# Read the file line by line
with open(r'C:\Users\Mathilde\Documents\DTU\E24\SocialGraf\LabMT_wordlist.txt', 'r', encoding='cp1252') as f:
    # Skip the header line
    next(f)
    for line in f:
        # Split the line into components
        parts = line.strip().split('\t')
        word = parts[0]
        happiness_rank = int(parts[1])
        happiness_average = float(parts[2])
        happiness_standard_deviation = float(parts[3])
        twitter_rank = float(parts[4]) if parts[4] != '--' else None
        google_rank = float(parts[5]) if parts[5] != '--' else None
        nyt_rank = float(parts[6]) if parts[6] != '--' else None
        lyrics_rank = float(parts[7]) if parts[7] != '--' else None
      
        # Add the word and its attributes to the dictionary
        word_dict[word] = {
            'happiness_rank': happiness_rank,
            'happiness_average': happiness_average,
            'happiness_standard_deviation': happiness_standard_deviation,
            'twitter_rank': twitter_rank,
            'google_rank': google_rank,
            'nyt_rank': nyt_rank,
            'lyrics_rank': lyrics_rank
        }

In [70]:
# Function to calculate sentiment for a given text
def calculate_sentiment(text):
    tokens = word_tokenize(text.lower())
    sentiment_sum = 0
    word_count = 0
    for token in tokens:
        if token in word_dict:
            sentiment_sum += word_dict[token]['happiness_average']
            word_count += 1
    if word_count > 0:
        return sentiment_sum / word_count
    else:
        return None

In [71]:
from tqdm import tqdm

wiki_pages = "C:/Users/Mathilde/Documents/DTU/E24/SocialGraf/wiki_pages"

for node in tqdm(list(G.nodes()), desc="Processing nodes"):
    node_file = f"{node}.json"
    if os.path.exists(os.path.join(wiki_pages, node_file)):
        with open(os.path.join(wiki_pages, node_file), 'r', encoding="utf-8") as f:
            try:
                data = json.load(f)
                wikitext = data["query"]["pages"]
                page = list(wikitext.keys())[0]
                wikitext = wikitext[page]["revisions"][0]["*"]
                
                # Find party
                match = re.search(r'\bparty\b.*?\|([^]]+)\]\]', wikitext, re.IGNORECASE)
                if match:
                    party = match.group(1)
                    if len(party) < 50:
                        G.nodes[node]['party'] = party
                
                # Find gender
                text = json.dumps(data).lower()
                he_count = len(re.findall(r'\bhe\b|\bhim\b|\bhis\b', text))
                she_count = len(re.findall(r'\bshe\b|\bher\b|\bhers\b', text))
                if he_count > she_count:
                    G.nodes[node]['gender'] = "male"
                elif she_count > he_count:
                    G.nodes[node]['gender'] = "female"
                else:
                    G.nodes[node]['gender'] = "unknown"
                
                # Calculate sentiment
                sentiment = calculate_sentiment(wikitext)
                if sentiment is not None:
                    G.nodes[node]["sentiment"] = sentiment
                else:
                    print(f"Sentiment could not be calculated for {node}")
            except:
                print(f"Error in file: {node_file}")
                continue

Processing nodes: 100%|██████████| 20534/20534 [06:02<00:00, 56.60it/s] 


In [72]:
for node in G.nodes():
    if 'party' in G.nodes[node] and (G.nodes[node]['party'] in ['Democratic', 'Democrat', 'Democratic Party', 'Democratic Party (United States)', 'DFL', 'Democratic (DFL)', 'Democratic-Farmer-Labor', 'Democratic–Farmer–Labor', 'Democratic-Farmer-Labor Party', 'Democratic-Farmer-Labor Party (Minnesota)', 'DFL party', 'Farmer-Labor'] or re.search(r'democrat', G.nodes[node]['party'], re.IGNORECASE)):
        G.nodes[node]['party'] = 'Democratic'

    if 'party' in G.nodes[node] and (G.nodes[node]['party'] in ['Republican', 'Republican Party', 'Republican Party (United States)', 'GOP', 'Texas Republican Party'] or re.search(r'republican', G.nodes[node]['party'], re.IGNORECASE)):
        G.nodes[node]['party'] = 'Republican'

    if 'party' in G.nodes[node] and re.search(r'green', G.nodes[node]['party'], re.IGNORECASE):
        G.nodes[node]['party'] = 'Green'

    if 'party' in G.nodes[node] and re.search(r'libertarian', G.nodes[node]['party'], re.IGNORECASE):
        G.nodes[node]['party'] = 'Libertarian'

    if 'party' in G.nodes[node] and re.search(r'independent', G.nodes[node]['party'], re.IGNORECASE):
        G.nodes[node]['party'] = 'Independent'

    if 'party' in G.nodes[node] and re.search(r'socialist', G.nodes[node]['party'], re.IGNORECASE):
        G.nodes[node]['party'] = 'Socialist'
  

In [73]:
parties = ['Democratic', 'Republican', 'Libertarian', 'Green', 'Independent', 'Socialist']

count_party = 0
no_party = 0
for node in G.nodes():
    if 'party' in G.nodes[node] and G.nodes[node]['party'] not in parties:
        # print(f"Node {node} has a party {G.nodes[node]['party']} not in the list of parties.")
        count_party += 1
    if 'party' not in G.nodes[node]:
        no_party += 1

print(f"Total nodes with parties not in the list: {count_party}")
print(f"Total nodes with no party: {no_party}")

Total nodes with parties not in the list: 349
Total nodes with no party: 971


In [74]:
from collections import Counter

party_counts = Counter(nx.get_node_attributes(G, 'party').values())
print(party_counts)

Counter({'Democratic': 9690, 'Republican': 9278, 'Independent': 135, 'Socialist': 41, 'Libertarian': 39, 'Green': 31, 'Prohibition': 18, 'Progressive': 14, 'Saint Paul': 8, 'Senate Majority Leader': 8, 'Senate Minority Whip': 7, 'Constitution': 6, 'Vermont Progressive': 5, 'House Minority Leader': 5, 'Reform': 4, 'Progressive Party': 4, 'Nonpartisan': 4, 'House Majority Leader': 4, 'Minneapolis': 4, 'Senate Majority Whip': 3, 'Bloomington': 3, "People's Party": 3, 'Anoka County': 3, 'Black Panther': 3, 'Dakota': 3, 'Le Sueur': 3, 'House Minority Whip': 3, 'Hennepin County': 3, 'Working Families': 3, 'Peace and Freedom': 3, 'third-party candidate': 3, 'Dakota County': 3, 'House Committee on the Chinese Communist Party': 3, 'Constitution Party': 3, 'House Majority Whip': 3, 'Chippewa': 2, 'Communist Party': 2, 'Anoka': 2, 'Citizens Party': 2, 'R': 2, 'Populist': 2, 'Forward Party': 2, 'Brooklyn Center': 2, 'Benton': 2, 'Kittson': 2, 'Forward': 2, 'United States President': 2, 'Human Righ

In [75]:
female_count = sum(1 for _, attr in G.nodes(data=True) if attr.get("gender") == "female")
male_count = sum(1 for _, attr in G.nodes(data=True) if attr.get("gender") == "male")
unknown_count = sum(1 for _, attr in G.nodes(data=True) if attr.get("gender") == "unknown")

print(f"Antal kvinder: {female_count}")
print(f"Antal mænd: {male_count}")
print(f"Antal unknown: {unknown_count}")

Antal kvinder: 5543
Antal mænd: 14154
Antal unknown: 837


In [76]:
nodes_to_remove1 = [node for node, attr in G.nodes(data=True) if attr.get("gender") == "unknown"]
nodes_to_remove2 = [node for node in G.nodes() if 'party' not in G.nodes[node] or G.nodes[node]['party'] not in parties]
nodes_to_remove3 = [node for node in G.nodes() if 'sentiment' not in G.nodes[node]]

print(f"removing: {len(nodes_to_remove1) + len(nodes_to_remove2) + len(nodes_to_remove3)}", "nodes")

G.remove_nodes_from(nodes_to_remove1)
G.remove_nodes_from(nodes_to_remove2)
G.remove_nodes_from(nodes_to_remove3)

removing: 2157 nodes


In [80]:
print(len(list(nx.isolates(G))))
list(nx.isolates(G))

376


['Aaron_Griesheimer',
 'Aaron_M._Dana',
 'Abbey_Duke',
 'Aja_Brown',
 'Alan_Frumin',
 'Alexis_Simpson',
 'Alex_Atwood',
 'Alfred_M._Monfalcone',
 'Algernon_Lee_Butler',
 'Alice_Harden',
 'Allen_Peake',
 'Al_DeKruif',
 'Amanda_Mays_Bledsoe',
 'Amos_Pinchot',
 'Andrew_DeGraffenreidt',
 'Andrew_Lang_(Minnesota_politician)',
 'Andrew_Mathews_(politician)',
 'Andy_Berry',
 'Andy_McKenzie',
 'Andy_Smith_(Minnesota_politician)',
 'Ann_Vermilion',
 'Antonio_Giarrusso',
 'Arnold_Mooney',
 'Arthur_E._Reimer',
 'Art_Washut',
 'Ashley_Aune',
 'Ashley_Bland_Manlove',
 'Barb_Yarusso',
 'Bennie_Turner',
 'Ben_Leman',
 'Bernard_Hawkins',
 'Bernie_Perryman',
 'Beryl_Piccolantonio',
 "Beth_O'Connor",
 'Betty_Nuovo',
 'Bev_Scalze',
 'Bill_Allen_(Missouri_politician)',
 'Bill_Lant',
 'Bill_Owen_(politician)',
 'Bobby_Farlice-Rubio',
 'Bob_Lessard',
 'Bradford_Jacobsen',
 'Brad_Banderman',
 'Brad_Montell',
 'Brad_Pollitt',
 'Brad_Tabke',
 'Brian_Honan',
 'Brian_Putnam',
 'Bruce_Grubbs',
 'Caleb_Ness',
 'Ca

In [81]:
G.remove_nodes_from(list(nx.isolates(G)))

In [82]:
len(G.nodes(data=True))

18033

In [83]:
required_attributes = ['party', 'gender', 'sentiment']
missing_attributes = {attr: 0 for attr in required_attributes}

for node, attr in G.nodes(data=True):
    for attribute in required_attributes:
        if attribute not in attr:
            missing_attributes[attribute] += 1

print("Missing attributes count:")
for attribute, count in missing_attributes.items():
    print(f"{attribute}: {count}")


Missing attributes count:
party: 0
gender: 0
sentiment: 0


In [84]:
list(nx.isolates(G))

[]

In [85]:
# save graph object to file
pickle.dump(G, open('G_politicans_final.pickle', 'wb'))