In [1]:
import pandas as pd
import numpy as np
import networkx as nx

import spacy
from spacy import displacy

import matplotlib.pyplot as plt


In [71]:
rfaSet = 'Data/wiki_RfA_2010_2013.csv'

#### Making DAT time Objects

In [98]:

# Read the CSV file
df = pd.read_csv(rfaSet)

# Convert 'DAT' to datetime, coercing errors to NaT (invalid datetimes)
df['DAT'] = pd.to_datetime(df['DAT'], format='%H:%M, %d %B %Y', errors='coerce')

na_count = df['DAT'].isna().sum()
print(f"Number of rows with NA in DAT: {na_count}")

# Remove rows where DAT is NaT (missing or invalid datetime)
df_cleaned = df.dropna(subset=['DAT'])


na_count = df_cleaned['DAT'].isna().sum()
print(f"Number of rows with NA in DAT: {na_count}")

# Print the cleaned dataframe
df_cleaned


Number of rows with NA in DAT: 262
Number of rows with NA in DAT: 0


Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,2013-04-19 23:13:00,'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,2013-04-20 01:04:00,'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,2013-04-19 23:43:00,'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,2013-04-20 00:11:00,'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,2013-04-20 00:56:00,"'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
32567,Atama,ZooPro,-1,-1,2010,2010-02-22 18:17:00,"'''Oppose''' - Per Polargeo, and per [http://e..."
32568,Bradjamesbrown,ZooPro,-1,-1,2010,2010-02-22 18:18:00,'''Oppose''' per SilkTork's diff above. Assert...
32569,Ottawa4ever,ZooPro,0,-1,2010,2010-02-22 18:11:00,"'''Neutral''' Not to pile on, neutral. I canno..."
32570,Tryptofish,ZooPro,0,-1,2010,2010-02-22 17:58:00,'''Neutral''' I've interacted with this editor...


In [94]:

# Assuming df is the DataFrame that you've already filtered and processed
# Create a signed graph
def create_signed_graph(df):
    G = nx.MultiDiGraph()  # Use MultiDiGraph to allow multiple edges between nodes
    
    # Iterate through the rows of the DataFrame to add edges and nodes
    for _, row in df.iterrows():
        src = row['SRC']
        tgt = row['TGT']
        vot = row['VOT']
        txt = row['TXT']  # Assuming there's a TXT column in the dataframe
        res = row['RES']  # Assuming there's a RES column in the dataframe
        dat = row['DAT']  # Assuming there's a DAT column in the dataframe
        
        # Add nodes (only storing their names/IDs, no status)
        if src not in G:
            G.add_node(src)
        if tgt not in G:
            G.add_node(tgt)
        
        # Determine admin status (based on RES value)
        admin_status = "admin" if res == 1 else "nonAdmin"
        
        # Add edge with the necessary attributes: vote weight, text, admin status, and DAT
        edge_attrs = {
            'weight': vot, 
            'txt': txt,
            'admin': admin_status,
            'DAT': dat  # Adding the DAT value to the edge attributes
        }

        G.add_edge(src, tgt, **edge_attrs)
    
    return G


# Visualize the signed graph
def visualize_graph(G):
    # Drawing the graph
    pos = nx.spring_layout(G)  # Positioning for better visualization
    plt.figure(figsize=(12, 12))
    
    # Draw nodes, edges, and labels
    nx.draw(G, pos, with_labels=True, node_color='skyblue', node_size=3000, font_size=10, font_weight='bold', edge_color='gray')
    
    # Draw edge labels for the weights
    edge_labels = nx.get_edge_attributes(G, 'weight')
    nx.draw_networkx_edge_labels(G, pos, edge_labels=edge_labels)
    
    plt.title("Signed Graph of SRC -> TGT Votes")
    plt.show()



# Create the signed graph
G = create_signed_graph(df_cleaned)

In [95]:
G.number_of_edges()

31876

### GML Format to Gephi

In [96]:
nx.write_gml(G, 'cleaned_graph.gml') 

### Graph Characteristics


In [51]:
G = create_signed_graph(df)

number of nodes

In [52]:
G.number_of_nodes()

2986

### Number of Admins and NonAdmins

In [53]:
# Count the number of nodes with 'status' as 'admin'
admin_count = sum(1 for node, data in G.nodes(data=True) if data['status'] == 'admin')

print(f"Number of admin nodes: {admin_count}")


Number of admin nodes: 2238


In [54]:
# Count the number of nodes with 'status' as 'nonAdmin'
non_admin_count = sum(1 for node, data in G.nodes(data=True) if data['status'] == 'nonAdmin')

print(f"Number of nonAdmin nodes: {non_admin_count}")



Number of nonAdmin nodes: 748


### Number of Edges

In [55]:
G.number_of_edges()

31876

In [58]:
# Get the first 10 edges' txt attributes
edge_txts = [data['txt'] for _, _, data in list(G.edges(data=True))[:10]]

# Print each txt value on a separate line
for txt in edge_txts:
    print()
    print(txt)



'''Support''' as co-nom.

<del>'''Neutral'''</del>. '''Moral Support'''. I've only run across RockMagnetist once in my travels in checking/editing articles on this Wikipedia, and RockMagnetist seems like a good candidate for this based on the description above. However, since I have to base my votes on the actual experience I have had with the user as an editor, I cannot sway my vote either way. Either way, I am leaning more towards a support, but since I vote based on interaction, I have neither any positive nor negative thoughts going through my head, so I vote "Neutral".

'''Neutral'''. As I vote based on actual interaction with an editor, since I have never interacted with this editor at all in the past, I have to vote "Neutral".

'''Oppose'''. Well, I was going to vote "Neutral" since I base my votes on interaction with the editor/user; however, after reading [[User:Reaper Eternal|Reaper Eternal]]'s opposition statement, I can definitely agree that I do not want an administrator 

In [67]:
import pandas as pd

# Assuming df is your DataFrame containing the data

# Group by 'TGT' and 'YEA' (year) and count the occurrences
target_years = df.groupby(['TGT', 'YEA']).size().reset_index(name='count')

# Now group by 'TGT' and get the list of distinct years each target appears in
target_years_list = target_years.groupby('TGT')['YEA'].apply(list)

# Filter for targets that appear in more than one year
targets_in_multiple_years = target_years_list[target_years_list.apply(len) > 1]

# Output the result
print("Targets that appear in multiple different years, with the years:")
print(targets_in_multiple_years)


Targets that appear in multiple different years, with the years:
TGT
28bytes               [2010, 2011]
Ankitbhatt            [2010, 2011]
Armbrust              [2010, 2011]
BuickCenturyDriver    [2010, 2011]
Calabe1992            [2011, 2012]
Curtis23              [2010, 2011]
DeltaQuad             [2010, 2011]
Dusti                 [2010, 2011]
Floydian              [2011, 2012]
GSorby                [2011, 2012]
GiantSnowman          [2011, 2012]
Guoguo12              [2010, 2011]
HJ Mitchell           [2010, 2011]
Hahc21                [2012, 2013]
Ironholds             [2010, 2011]
ItsZippy              [2011, 2012]
Ktr101                [2010, 2012]
Lord Roem             [2012, 2013]
Marcus Qwertyus       [2010, 2012]
My76Strat             [2011, 2012]
Rami R                [2010, 2011]
Reenem                [2010, 2011]
Rehman                [2010, 2011]
Richardcavell         [2010, 2011]
Richwales             [2010, 2011]
Secret                [2010, 2013]
Slon02               

In [70]:
import pandas as pd

# Assuming df is your DataFrame containing the data

# First, filter non-admins (RES = -1)
non_admins = df[df['RES'] == -1]

# Now filter admins (RES = 1)
admins = df[df['RES'] == 1]

# Merge both dataframes on 'SRC' to find non-admins that later become admins
merged = pd.merge(non_admins[['TGT', 'YEA']], admins[['TGT', 'YEA']], on='TGT', how='inner', suffixes=('_nonAdmin', '_admin'))

# Now, check if the admin year is later than the non-admin year
merged = merged[merged['YEA_admin'] > merged['YEA_nonAdmin']]

# Output the result
print("Non-admins that became admins in a later year:")
print(merged)


Non-admins that became admins in a later year:
              TGT  YEA_nonAdmin  YEA_admin
0       Lord Roem          2012       2013
1       Lord Roem          2012       2013
2       Lord Roem          2012       2013
3       Lord Roem          2012       2013
4       Lord Roem          2012       2013
...           ...           ...        ...
155752     Slon02          2010       2011
155753     Slon02          2010       2011
155754     Slon02          2010       2011
155755     Slon02          2010       2011
155756     Slon02          2010       2011

[101053 rows x 3 columns]
