In [1]:
import pandas as pd
import numpy as np
import networkx as nx

import spacy
from spacy import displacy

import matplotlib.pyplot as plt


### Preprocessing

In [9]:
file_path = 'Data/ground/wiki-RfA.txt'

output_file = 'Data/wiki_RfA_2010_2013.csv'


def process_file_to_dataframe(file_path):
    # Read the text file as lines
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    
    # Split the content by empty lines
    entries = content.strip().split('\n\n')

    # List to store the entries in a structured format
    data = []

    # Process each entry
    for entry in entries:
        entry_dict = {}
        for line in entry.split('\n'):
            if ":" in line:
                key, value = line.split(":", 1)
                entry_dict[key.strip()] = value.strip()
        data.append(entry_dict)

    # Convert the list of dictionaries to a DataFrame
    return pd.DataFrame(data)

# Process and filter the data by year (2010 to 2013)
def filter_data_by_year(df):
    # Convert 'YEA' column to numeric, errors='coerce' will convert non-numeric to NaN
    df['YEA'] = pd.to_numeric(df['YEA'], errors='coerce')

    # Filter the DataFrame for years between 2010 and 2013
    filtered_df = df[(df['YEA'] >= 2010) & (df['YEA'] <= 2013)]

    return filtered_df

# Write the filtered data to a CSV file
def write_csv_file(df, output_file):
    df.to_csv(output_file, index=False, encoding='utf-8')


Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,"23:13, 19 April 2013",'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,"01:04, 20 April 2013",'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,"23:43, 19 April 2013",'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,"00:11, 20 April 2013",'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,"00:56, 20 April 2013","'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
198270,172,Vancouverguy,1,1,2003,"02:51, 2 Sep 2003",Support
198271,Angela,WhisperToMe,1,1,2003,"23:45, 26 Nov 2003",Support.
198272,Jiang,WhisperToMe,1,1,2003,,Support. --
198273,Pakaran,WhisperToMe,1,1,2003,"05:38, 5 Dec 2003",Support. Age has nothing to do with maturity....


In [19]:
# Main processing pipeline
df = process_file_to_dataframe(file_path)  # Process the file into a DataFrame
filtered_df = filter_data_by_year(df)      # Filter data for 2010-2013
write_csv_file(filtered_df, output_file)   # Write the filtered data to CSV

print(f"CSV file with data from 2010 to 2013 has been written to: {output_file}")

CSV file with data from 2010 to 2013 has been written to: Data/wiki_RfA_2010_2013.csv


In [24]:
rfaSet = 'Data/wiki_RfA_2010_2013.csv'


df = pd.read_csv(rfaSet)
print("Columns in DataFrame:", df.columns)


Columns in DataFrame: Index(['SRC', 'TGT', 'VOT', 'RES', 'YEA', 'DAT', 'TXT'], dtype='object')


In [25]:
df

Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,2013-04-19 23:13:00,'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,2013-04-20 01:04:00,'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,2013-04-19 23:43:00,'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,2013-04-20 00:11:00,'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,2013-04-20 00:56:00,"'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
32305,Atama,ZooPro,-1,-1,2010,2010-02-22 18:17:00,"'''Oppose''' - Per Polargeo, and per [http://e..."
32306,Bradjamesbrown,ZooPro,-1,-1,2010,2010-02-22 18:18:00,'''Oppose''' per SilkTork's diff above. Assert...
32307,Ottawa4ever,ZooPro,0,-1,2010,2010-02-22 18:11:00,"'''Neutral''' Not to pile on, neutral. I canno..."
32308,Tryptofish,ZooPro,0,-1,2010,2010-02-22 17:58:00,'''Neutral''' I've interacted with this editor...


#### Treating time DAT

removing edges with not time 

In [22]:
# Convert the 'DAT' column to datetime, invalid parsing will result in NaT
df['DAT'] = pd.to_datetime(df['DAT'], format='%H:%M, %d %B %Y', errors='coerce')

# Filter rows where 'DAT' is NaT
nat_rows = df[df['DAT'].isna()]

# Display the rows with NaT in the 'DAT' column
print(nat_rows)


               SRC                TGT  VOT  RES   YEA DAT  \
707            NaN        Jason Quinn    0    1  2013 NaT   
708            NaN        Jason Quinn    0    1  2013 NaT   
793            NaN            Legoktm    1    1  2013 NaT   
969    Majoreditor          Lord Roem    1    1  2013 NaT   
1126           NaN      Mattythewhite   -1    1  2013 NaT   
...            ...                ...  ...  ...   ...  ..   
32310      Davidwr           Venomcuz   -1   -1  2010 NaT   
32346       Begoon      White Shadows   -1   -1  2010 NaT   
32350          NaN      White Shadows   -1   -1  2010 NaT   
32394          NaN         WikiCopter   -1   -1  2010 NaT   
32522          NaN  William S. Saturn   -1   -1  2010 NaT   

                                                     TXT  
707                                                  NaN  
708                                                  NaN  
793                                                  NaN  
969    '''Support'''. The candi

out of 32522 we found 262 edges with invalid/ NaN time - removing them for a better analysis and inegration of the network

#### removing those DAT timeless edges - RUN ONLY ONCE

In [23]:

# Read the CSV file
df = pd.read_csv(rfaSet)

# Convert 'DAT' to datetime, coercing errors to NaT (invalid datetimes)
df['DAT'] = pd.to_datetime(df['DAT'], format='%H:%M, %d %B %Y', errors='coerce')

na_count = df['DAT'].isna().sum()
print(f"Number of rows with NA in DAT: {na_count}")

# Remove rows where DAT is NaT (missing or invalid datetime)
df_cleaned = df.dropna(subset=['DAT'])


na_count = df_cleaned['DAT'].isna().sum()
print(f"Number of rows with NA in DAT: {na_count}")

# Print the cleaned dataframe
write_csv_file(df_cleaned, output_file)  

#preprocessing DONE


Number of rows with NA in DAT: 262
Number of rows with NA in DAT: 0


##### PRE PROCESSING DONE - JUST USE THE CLEANED CVS FILE FROM NOW ON

In [31]:
df = pd.read_csv("./Data/wiki_RfA_2010_2013.csv")
df

Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,2013-04-19 23:13:00,'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,2013-04-20 01:04:00,'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,2013-04-19 23:43:00,'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,2013-04-20 00:11:00,'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,2013-04-20 00:56:00,"'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
32305,Atama,ZooPro,-1,-1,2010,2010-02-22 18:17:00,"'''Oppose''' - Per Polargeo, and per [http://e..."
32306,Bradjamesbrown,ZooPro,-1,-1,2010,2010-02-22 18:18:00,'''Oppose''' per SilkTork's diff above. Assert...
32307,Ottawa4ever,ZooPro,0,-1,2010,2010-02-22 18:11:00,"'''Neutral''' Not to pile on, neutral. I canno..."
32308,Tryptofish,ZooPro,0,-1,2010,2010-02-22 17:58:00,'''Neutral''' I've interacted with this editor...


In [32]:

# Assuming df is the DataFrame that you've already filtered and processed
# Create a signed graph
def create_signed_graph(df):
    """
    Create a MultiDiGraph from a DataFrame with edges labeled by interaction type.

    Parameters:
        df (pd.DataFrame): DataFrame containing columns for edges and attributes.

    Returns:
        G (nx.MultiDiGraph): The created graph.
    """
    G = nx.MultiDiGraph()  # Use MultiDiGraph to allow multiple edges between nodes

    # Iterate through the rows of the DataFrame to add edges and nodes
    for _, row in df.iterrows():
        src = row['SRC']  # Source node
        tgt = row['TGT']  # Target node
        vot = row['VOT']  # Label: -1, 1, or 0
        txt = row['TXT']  # Additional textual information
        res = row['RES']  # Admin status: 1 for admin, 0 for nonAdmin
        dat = row['DAT']  # Date or timestamp attribute

        # Add nodes if they don't already exist
        if src not in G:
            G.add_node(src)
        if tgt not in G:
            G.add_node(tgt)

        # Determine admin status for the edge
        admin_status = "admin" if res == 1 else "nonAdmin"

        # Add edge with attributes, storing `VOT` as a label, not a weight
        edge_attrs = {
            'label': vot,   # Interaction type (-1, 0, 1)
            'txt': txt,     # Additional text data
            'admin': admin_status,  # Whether the interaction involves an admin
            'DAT': dat      # Timestamp or date
        }
        G.add_edge(src, tgt, **edge_attrs)  # Add the edge with all attributes
    
    return G

# Create the signed graph
G = create_signed_graph(df_cleaned)

### Graph Characteristics


##### Number of Nodes

In [38]:
G.number_of_nodes()

2968

##### Number of Edges

In [37]:
G.number_of_edges()

32310

#### Max Degree - TOP 20

In [29]:
import networkx as nx

# Assuming G is your directed graph

# Get the in-degree and out-degree of all nodes
in_degree_dict = dict(G.in_degree())
out_degree_dict = dict(G.out_degree())

# Combine in-degree and out-degree for each node
degree_info = {
    node: {"in_degree": in_degree_dict.get(node, 0), "out_degree": out_degree_dict.get(node, 0)}
    for node in G.nodes()
}

# Sort nodes by in-degree and out-degree
top_20_in_degree = sorted(degree_info.items(), key=lambda x: x[1]["in_degree"], reverse=True)[:20]
top_20_out_degree = sorted(degree_info.items(), key=lambda x: x[1]["out_degree"], reverse=True)[:20]

# Print the top 20 nodes with in-degree and out-degree
print("Top 20 nodes by in-degree:")
for node, degrees in top_20_in_degree:
    print(f"User: {node}, In-degree: {degrees['in_degree']}")

print("\nTop 20 nodes by out-degree:")
for node, degrees in top_20_out_degree:
    print(f"User: {node}, Out-degree: {degrees['out_degree']}")


Top 20 nodes by in-degree:
User: HJ Mitchell, In-degree: 341
User: Connormah, In-degree: 326
User: Ironholds, In-degree: 294
User: Lord Roem, In-degree: 272
User: The Thing That Should Not Be, In-degree: 265
User: Everyking, In-degree: 239
User: SarekOfVulcan, In-degree: 237
User: DeltaQuad, In-degree: 227
User: GiantSnowman, In-degree: 220
User: SarahStierch, In-degree: 218
User: Σ, In-degree: 217
User: Drmies, In-degree: 208
User: Richwales, In-degree: 205
User: My76Strat, In-degree: 201
User: MZMcBride, In-degree: 199
User: Slon02, In-degree: 196
User: Theopolisme, In-degree: 189
User: Dabomb87, In-degree: 189
User: 28bytes, In-degree: 188
User: Secret, In-degree: 187

Top 20 nodes by out-degree:
User: Boing! said Zebedee, Out-degree: 274
User: Fetchcomms, Out-degree: 253
User: Fastily, Out-degree: 251
User: Ktr101, Out-degree: 249
User: Axl, Out-degree: 211
User: RP459, Out-degree: 198
User: Kudpung, Out-degree: 197
User: Minimac, Out-degree: 195
User: Mkativerata, Out-degree: 191


In [31]:
# Initialize counters
count_1 = 0
count_neg1 = 0
count_0 = 0

# Loop through the edges of the graph
for u, v, data in G.edges(data=True):
    vot = data.get('weight', 0)  # Assuming 'weight' represents the VOT attribute
    
    # Count the VOT values
    if vot == 1:
        count_1 += 1
    elif vot == -1:
        count_neg1 += 1
    elif vot == 0:
        count_0 += 1

# Print the counts
print(f"Edges with VOT = 1: {count_1}")
print(f"Edges with VOT = -1: {count_neg1}")
print(f"Edges with VOT = 0: {count_0}")


Edges with VOT = 1: 0
Edges with VOT = -1: 0
Edges with VOT = 0: 32310


In [32]:
# Initialize dictionaries to count the occurrences of VOT values for each SRC
src_0_count = {}
src_1_count = {}
src_neg1_count = {}

# Iterate through the edges of the graph
for u, v, data in G.edges(data=True):
    vot = data.get('weight', 0)  # Assuming 'weight' represents the VOT attribute
    
    # Count the occurrences based on VOT values
    if vot == 0:
        if u in src_0_count:
            src_0_count[u] += 1
        else:
            src_0_count[u] = 1
    elif vot == 1:
        if u in src_1_count:
            src_1_count[u] += 1
        else:
            src_1_count[u] = 1
    elif vot == -1:
        if u in src_neg1_count:
            src_neg1_count[u] += 1
        else:
            src_neg1_count[u] = 1

# Sort the dictionaries by the count in descending order and get the top nodes
top_0_src = sorted(src_0_count.items(), key=lambda x: x[1], reverse=True)[:10]
top_1_src = sorted(src_1_count.items(), key=lambda x: x[1], reverse=True)[:10]
top_neg1_src = sorted(src_neg1_count.items(), key=lambda x: x[1], reverse=True)[:10]

# Print the top nodes with the most VOT as SRC for 0, 1, and -1
print("Top 10 nodes with the most 0 VOT as SRC:")
for src, count in top_0_src:
    print(f"{src}: {count}")

print("\nTop 10 nodes with the most 1 VOT as SRC:")
for src, count in top_1_src:
    print(f"{src}: {count}")

print("\nTop 10 nodes with the most -1 VOT as SRC:")
for src, count in top_neg1_src:
    print(f"{src}: {count}")


Top 10 nodes with the most 0 VOT as SRC:
Boing! said Zebedee: 274
Fetchcomms: 253
Fastily: 251
Ktr101: 249
Axl: 211
RP459: 198
Kudpung: 197
Minimac: 195
Mkativerata: 191
Hokeman: 168

Top 10 nodes with the most 1 VOT as SRC:

Top 10 nodes with the most -1 VOT as SRC:


In [33]:
# Initialize dictionaries to store the count of votes per node
src_0_count = {}
src_1_count = {}
src_neg1_count = {}
src_total_count = {}

# Iterate through the edges of the graph
for u, v, data in G.edges(data=True):
    vot = data.get('weight', 0)  # Assuming 'weight' represents the VOT attribute
    
    # Track the total number of votes for each node (SRC)
    if u in src_total_count:
        src_total_count[u] += 1
    else:
        src_total_count[u] = 1
    
    # Count the occurrences based on VOT values
    if vot == 0:
        src_0_count[u] = src_0_count.get(u, 0) + 1
    elif vot == 1:
        src_1_count[u] = src_1_count.get(u, 0) + 1
    elif vot == -1:
        src_neg1_count[u] = src_neg1_count.get(u, 0) + 1

# Sort the nodes by total votes and select the top 100
top_100_voters = sorted(src_total_count.items(), key=lambda x: x[1], reverse=True)[:100]

# Now calculate the proportion of each type of vote for the top 100 voters
vote_proportions = []

for node, total_votes in top_100_voters:
    prop_0 = src_0_count.get(node, 0) / total_votes
    prop_1 = src_1_count.get(node, 0) / total_votes
    prop_neg1 = src_neg1_count.get(node, 0) / total_votes
    
    vote_proportions.append({
        'node': node,
        'total_votes': total_votes,
        '0': prop_0,
        '1': prop_1,
        '-1': prop_neg1
    })

# Rank voters within the top 100 by their proportion of votes of each type
ranked_0 = sorted(vote_proportions, key=lambda x: x['0'], reverse=True)[:10]
ranked_1 = sorted(vote_proportions, key=lambda x: x['1'], reverse=True)[:10]
ranked_neg1 = sorted(vote_proportions, key=lambda x: x['-1'], reverse=True)[:10]

# Output the results
print("Top 10 voters by proportion of '0' votes:")
for voter in ranked_0:
    print(f"Node {voter['node']}: {voter['total_votes']} total votes, {voter['0']*100:.2f}% were '0'")

print("\nTop 10 voters by proportion of '1' votes:")
for voter in ranked_1:
    print(f"Node {voter['node']}: {voter['total_votes']} total votes, {voter['1']*100:.2f}% were '1'")

print("\nTop 10 voters by proportion of '-1' votes:")
for voter in ranked_neg1:
    print(f"Node {voter['node']}: {voter['total_votes']} total votes, {voter['-1']*100:.2f}% were '-1'")


Top 10 voters by proportion of '0' votes:
Node Boing! said Zebedee: 274 total votes, 100.00% were '0'
Node Fetchcomms: 253 total votes, 100.00% were '0'
Node Fastily: 251 total votes, 100.00% were '0'
Node Ktr101: 249 total votes, 100.00% were '0'
Node Axl: 211 total votes, 100.00% were '0'
Node RP459: 198 total votes, 100.00% were '0'
Node Kudpung: 197 total votes, 100.00% were '0'
Node Minimac: 195 total votes, 100.00% were '0'
Node Mkativerata: 191 total votes, 100.00% were '0'
Node Hokeman: 168 total votes, 100.00% were '0'

Top 10 voters by proportion of '1' votes:
Node Boing! said Zebedee: 274 total votes, 0.00% were '1'
Node Fetchcomms: 253 total votes, 0.00% were '1'
Node Fastily: 251 total votes, 0.00% were '1'
Node Ktr101: 249 total votes, 0.00% were '1'
Node Axl: 211 total votes, 0.00% were '1'
Node RP459: 198 total votes, 0.00% were '1'
Node Kudpung: 197 total votes, 0.00% were '1'
Node Minimac: 195 total votes, 0.00% were '1'
Node Mkativerata: 191 total votes, 0.00% were '

In [34]:
# Filter rows where the target ('TGT') is "Neelix"
votes_on_neelix = df_cleaned[df_cleaned['TGT'].str.contains("Neelix", case=False, na=False)]

# Count the types of votes based on the 'VOT' column
vote_counts = votes_on_neelix['VOT'].value_counts()

# Display the result
print("Vote types for Neelix:")
print(vote_counts)


Vote types for Neelix:
VOT
 1    69
-1    14
 0    12
Name: count, dtype: int64


### Centrality Measures

In [36]:
G = create_signed_graph(df_cleaned)

def prepare_graph_for_gephi(G):
    """Prepares the graph by ensuring all attributes are strings for Gephi compatibility."""
    for u, v, key, data in G.edges(data=True, keys=True):
        for attr, value in data.items():
            if not isinstance(value, str):
                G[u][v][key][attr] = str(value)  # Convert non-strings to strings
    return G

# Prepare the graph
G_prepared = prepare_graph_for_gephi(G)

# Export to GEXF
output_file = "rfawiki_graph.gexf"
nx.write_gexf(G_prepared, output_file)
print(f"Graph exported to {output_file}")


Graph exported to rfawiki_graph.gexf
