## Imports

In [None]:
import pandas as pd
import numpy as np
import networkx as nx

import spacy
from spacy import displacy

import matplotlib.pyplot as plt


## spaCy model (optional - may not be used in the final analysis)

In [None]:
# One-time setup: shell command used to download the `en_core_web_sm` spaCy model

!python3 -m spacy download en_core_web_sm

In [13]:
# Load the spaCy English language model `en_core_web_sm` and keep the pipeline as NER.
# NOTE(review): NER is not referenced by any other cell shown in this notebook.
NER = spacy.load("en_core_web_sm")

## Functions

In [89]:
def process_file_to_dataframe(file_path):
    """
    Reads a text file with entries separated by empty lines, where each entry consists
    of key-value pairs separated by a colon, and returns a pandas DataFrame with the parsed data.

    Only the first colon on a line separates key from value, so values may
    themselves contain colons (e.g. timestamps such as "23:13, 19 April 2013").
    Keys and values are stripped of surrounding whitespace and kept as strings.
    Any run of blank or whitespace-only lines counts as a single entry
    separator, and an empty file yields an empty DataFrame.

    Parameters:
    file_path (str): Path to the text file to be processed. The file is read as UTF-8.

    Returns:
    pd.DataFrame: DataFrame containing the parsed key-value pairs from the file,
    one row per entry.

    Raises:
    ValueError: If a non-blank line does not contain a colon.
    """
    data = []
    entry_dict = {}

    # Stream the file line by line instead of loading it whole; the encoding is
    # explicit so parsing does not depend on the platform's default locale.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line_number, raw_line in enumerate(file, start=1):
            line = raw_line.strip()

            if not line:
                # Blank line: close the current entry (if any). Repeated blank
                # lines must not create empty entries.
                if entry_dict:
                    data.append(entry_dict)
                    entry_dict = {}
                continue

            # Split on the first colon only; later colons belong to the value.
            key, separator, value = line.partition(':')
            if not separator:
                raise ValueError(
                    f"Line {line_number} of {file_path!r} is not a "
                    f"'key:value' pair: {line!r}"
                )
            entry_dict[key.strip()] = value.strip()

    # The last entry is usually not followed by a blank line.
    if entry_dict:
        data.append(entry_dict)

    # Convert the list of dictionaries to a pandas DataFrame
    return pd.DataFrame(data)

In [90]:
def create_voting_graph(df):
    """
    Creates a directed voting graph from a DataFrame and writes an interactive
    PyVis visualization of it to "voting_graph.html".

    Every row becomes an edge from the voter ('SRC') to the candidate ('TGT')
    carrying the vote ('VOT') and the comment ('TXT') as edge attributes.
    Edges are drawn green when the vote equals 1 and red otherwise. Because
    the graph is a DiGraph, a later row with the same voter/candidate pair
    replaces the attributes of the earlier edge.

    Parameters:
    df (pd.DataFrame): DataFrame containing the voting data. Must provide the
        columns 'SRC', 'TGT', 'VOT' and 'TXT'. 'VOT' may hold numbers or
        numeric strings (such as "1" / "-1" parsed from a text file);
        values that cannot be read as a number are stored as None.
    """
    # Imported here so the function does not depend on a `Network` name being
    # defined by some other notebook cell.
    from pyvis.network import Network

    # Create a directed graph using NetworkX
    G = nx.DiGraph()

    # Votes parsed from text are strings, so "1" == 1 would be False and every
    # edge would end up red. Convert the whole column to numbers once.
    numeric_votes = pd.to_numeric(df['VOT'], errors='coerce')

    # Add edges to the graph with attributes
    for src, tgt, raw_vote, comment in zip(df['SRC'], df['TGT'], numeric_votes, df['TXT']):
        # Convert the voter and candidate to strings to ensure compatibility
        voter = str(src)
        candidate = str(tgt)
        # Store a plain Python int (or None) so the attribute compares cleanly
        # with 1 below and stays JSON-serialisable for the HTML export.
        vote = None if pd.isna(raw_vote) else int(raw_vote)

        # Add nodes and edges with attributes
        G.add_node(voter, label=voter)
        G.add_node(candidate, label=candidate)
        G.add_edge(voter, candidate, vote=vote, comment=comment)

    # Convert the NetworkX graph to PyVis
    net = Network(height="750px", width="100%", notebook=True, directed=True, cdn_resources='in_line')
    net.from_nx(G)

    # Customize edges based on vote type
    for edge in net.edges:
        vote = G.edges[edge['from'], edge['to']]['vote']
        edge['color'] = "green" if vote == 1 else "red"
        edge['width'] = 0.5  # Thin edges for simplicity
        edge['title'] = f"Vote: {vote}"

    # Simplified layout and disable physics
    net.set_options("""
    {
      "nodes": {
        "font": {
          "size": 10
        }
      },
      "edges": {
        "smooth": false
      },
      "physics": {
        "enabled": false
      },
      "layout": {
        "improvedLayout": true
      }
    }
    """)

    # Generate and display the network
    net.show("voting_graph.html")

In [98]:
def get_unique_years(df):
    """Returns the distinct years found in the 'YEA' column as a list of ints.

    Side effect: the 'YEA' column of the DataFrame passed in is replaced by its
    numeric conversion, with unparseable values turned into NaN. Those NaN
    values are left out of the returned list. Years appear in order of first
    occurrence.
    """
    # Convert in place on the caller's frame; invalid entries become NaN
    df['YEA'] = pd.to_numeric(df['YEA'], errors='coerce')

    # Keep only the years that converted successfully
    valid_years = df['YEA'].dropna()

    return valid_years.unique().astype(int).tolist()

def sample_yearly_data(df, election_years, fraction=0.5, random_state=42):
    """Samples a fraction of each year's rows and returns them keyed by year.

    For every year in `election_years`, the rows whose 'YEA' value equals that
    year are selected and `fraction` of them (50% by default) are drawn using
    `random_state` as the seed. The result is a dictionary mapping each year
    to its sampled DataFrame.
    """
    return {
        year: df.loc[df['YEA'] == year].sample(frac=fraction, random_state=random_state)
        for year in election_years
    }

def save_yearly_data(yearly_data):
    """Writes every year's sampled DataFrame to its own CSV file.

    `yearly_data` maps a year to a DataFrame. Each frame is written without
    its index to "voting_data_<year>_sampled.csv", using a path relative to
    the current working directory.
    """
    for year, frame in yearly_data.items():
        csv_name = f"voting_data_{year}_sampled.csv"
        frame.to_csv(csv_name, index=False)

## Opening Main File 

In [99]:
# Define the path to the file
file_path = 'Data/ground/wiki-RfA.txt'

# Parse the blank-line-separated key:value entries into one DataFrame row per entry
df = process_file_to_dataframe(file_path)

# Display the resulting DataFrame
df


Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,"23:13, 19 April 2013",'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,"01:04, 20 April 2013",'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,"23:43, 19 April 2013",'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,"00:11, 20 April 2013",'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,"00:56, 20 April 2013","'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
198270,172,Vancouverguy,1,1,2003,"02:51, 2 Sep 2003",Support
198271,Angela,WhisperToMe,1,1,2003,"23:45, 26 Nov 2003",Support.
198272,Jiang,WhisperToMe,1,1,2003,,Support. --
198273,Pakaran,WhisperToMe,1,1,2003,"05:38, 5 Dec 2003",Support. Age has nothing to do with maturity....


### Listing the admin election years, possibly to split the dataset further by year


In [92]:
# Step 1: Collect the distinct election years.
# get_unique_years also converts df['YEA'] to numeric in place; the per-year
# filter in Step 2 compares 'YEA' against these integer years.
election_years = get_unique_years(df)
print("Election years:", election_years)

# Step 2: Sample data by year
yearly_data = sample_yearly_data(df, election_years)

# Step 3: Print the number of rows for each year after sampling
for year, data in yearly_data.items():
    print(f"Year {year} has {len(data)} rows after sampling.")

# Step 4: Save the sampled data to CSV files
save_yearly_data(yearly_data)

Election years: [2013, 2012, 2011, 2010, 2009, 2008, 2007, 2006, 2005, 2004, 2003]
Year 2013 has 1786 rows after sampling.
Year 2012 has 3237 rows after sampling.
Year 2011 has 4278 rows after sampling.
Year 2010 has 6985 rows after sampling.
Year 2009 has 10688 rows after sampling.
Year 2008 has 15240 rows after sampling.
Year 2007 has 20770 rows after sampling.
Year 2006 has 22662 rows after sampling.
Year 2005 has 10120 rows after sampling.
Year 2004 has 3270 rows after sampling.
Year 2003 has 102 rows after sampling.


In [102]:
# Reload the 2003 sample from the CSV written by save_yearly_data and display it
n2003 = pd.read_csv("voting_data_2003_sampled.csv")

n2003

Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Cyan,Ark30inf,1,1,2003,"04:30, 21 Sep 2003",Support. --
1,Silsor,Andres,1,1,2003,,Support. I especially like his edit summaries.
2,Adam Bishop,Michael Hardy,1,1,2003,"17:15, 3 Dec 2003","Support, I also assumed he was an admin (I gue..."
3,Angela,Evil saltine,1,1,2003,,Support.
4,Next Paige,Dysprosia,1,1,2003,"21:39, 27 Aug 2003","Support. Good work editing, stays NPOV, polit..."
...,...,...,...,...,...,...,...
97,Snoyes,Secretlondon,1,1,2003,"19:29, 9 Nov 2003",Support. --
98,Dori,168...,1,1,2003,,"Support. Weird handle, but not the weirdest I'..."
99,Angela,Bcorr,1,1,2003,"07:24, 10 Dec 2003",Support. Brian has made excellent contribution...
100,Daniel Quinlan,Ugen64,1,1,2003,,Support. I have no reason to believe that uge...


### Sampling the main dataset (the sample fraction may change)

In [109]:

# Sample 5% of the DataFrame (frac=0.05, fixed seed for reproducibility) and reset index
sample_df = df.sample(frac=0.05, random_state=42).reset_index(drop=True)

# Display the sample as a table
sample_df


Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Nobleeagle,Herostratus,1,1,2006,"06:52, 17 June 2006",'''Strong Support''' Not much wrong with him a...
1,S Marshall,MichaelQSchmidt,1,1,2011,"15:28, 29 November 2011",Me too. Michael: I do want you to recuse from...
2,Jake Nelson,Hadal,1,1,2004,,"Support. Good edits, and has been here longer ..."
3,JoshuaZ,Pschemp,1,1,2006,"00:08, 3 April 2006",'''Support'''.
4,Rogerd,Pathoschild,1,1,2005,"18:02, 5 December 2005",'''Support''' good editor--
...,...,...,...,...,...,...,...
9909,John,Ironholds,1,1,2011,"03:56, 2 January 2011",'''Support'''--
9910,Bratsche,Beland,1,1,2005,,'''Support''' Thought he was one.
9911,Bluerasberry,Redrose64,1,1,2011,"19:58, 12 October 2011",'''Support''' I am very happy to support edito...
9912,Gilderien,Dirtlawyer1,1,-1,2013,"13:36, 10 February 2013","'''Support''' per nom.--<span style="""">"


# Graph Analysis and Visualization

In [110]:
# Display the sampled DataFrame that is passed to the graph cell below
sample_df

Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Nobleeagle,Herostratus,1,1,2006,"06:52, 17 June 2006",'''Strong Support''' Not much wrong with him a...
1,S Marshall,MichaelQSchmidt,1,1,2011,"15:28, 29 November 2011",Me too. Michael: I do want you to recuse from...
2,Jake Nelson,Hadal,1,1,2004,,"Support. Good edits, and has been here longer ..."
3,JoshuaZ,Pschemp,1,1,2006,"00:08, 3 April 2006",'''Support'''.
4,Rogerd,Pathoschild,1,1,2005,"18:02, 5 December 2005",'''Support''' good editor--
...,...,...,...,...,...,...,...
9909,John,Ironholds,1,1,2011,"03:56, 2 January 2011",'''Support'''--
9910,Bratsche,Beland,1,1,2005,,'''Support''' Thought he was one.
9911,Bluerasberry,Redrose64,1,1,2011,"19:58, 12 October 2011",'''Support''' I am very happy to support edito...
9912,Gilderien,Dirtlawyer1,1,-1,2013,"13:36, 10 February 2013","'''Support''' per nom.--<span style="""">"


## Simple Graph, no weight

In [114]:
# Build the directed vote graph from the sample and write it to voting_graph.html

create_voting_graph(sample_df)


voting_graph.html
