In [3]:
import pandas as pd
import numpy as np
import networkx as nx

import spacy
from spacy import displacy

import matplotlib.pyplot as plt


### Preprocessing

In [8]:
# Path of the raw RfA text dump that process_file_to_dataframe() is called with.
file_path = 'Data/ground/wiki-RfA.txt'

# Destination CSV; later cells both write to and re-read this same path.
output_file = 'Data/wiki_RfA_2010_2013.csv'


def process_file_to_dataframe(file_path):
    """
    Parse a blank-line-separated 'KEY:VALUE' text file into a DataFrame.

    Records are separated by an empty line. Inside a record every line that
    contains a colon is split on the FIRST colon only, so values may contain
    colons themselves (e.g. a timestamp such as '23:13, 19 April 2013').

    Parameters:
        file_path (str): Path of the UTF-8 encoded text file to read.

    Returns:
        pd.DataFrame: One row per record, one column per key; all values are
        kept as whitespace-stripped strings. Records that contain no
        'KEY:VALUE' line at all (extra blank lines, an empty file) are
        skipped instead of being emitted as all-NaN rows.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    data = []
    for entry in content.strip().split('\n\n'):
        entry_dict = {}
        for line in entry.split('\n'):
            if ":" in line:
                key, value = line.split(":", 1)
                entry_dict[key.strip()] = value.strip()
        # An entry without any field would only add a meaningless empty row.
        if entry_dict:
            data.append(entry_dict)

    return pd.DataFrame(data)

# Process and filter the data by year (2010 to 2013)
def filter_data_by_year(df, start_year=2010, end_year=2013):
    """
    Return the rows whose 'YEA' value lies in [start_year, end_year].

    Parameters:
        df (pd.DataFrame): Frame with a 'YEA' column (strings or numbers).
        start_year (int): First year to keep, inclusive. Defaults to 2010.
        end_year (int): Last year to keep, inclusive. Defaults to 2013.

    Returns:
        pd.DataFrame: An independent copy of the matching rows in which 'YEA'
        has been converted to numeric. Values that cannot be parsed become
        NaN and are therefore excluded. The input frame is left untouched.
    """
    # errors='coerce' turns non-numeric years into NaN instead of raising.
    years = pd.to_numeric(df['YEA'], errors='coerce')
    in_range = (years >= start_year) & (years <= end_year)

    # assign() + copy() keep the caller's frame unmodified and hand back a
    # frame that can be written to without SettingWithCopyWarning.
    return df.assign(YEA=years).loc[in_range].copy()

# Write the filtered data to a CSV file
def write_csv_file(df, output_file):
    """Save ``df`` to ``output_file`` as UTF-8 CSV, leaving out the index column."""
    df.to_csv(path_or_buf=output_file, encoding='utf-8', index=False)


In [2]:
# Main processing pipeline.
# Needs the imports cell and the helper-definition cell to have been run first
# (the helpers use `pd`, and `file_path` / `output_file` are defined there).
df = process_file_to_dataframe(file_path)  # Parse the raw text file into a DataFrame
filtered_df = filter_data_by_year(df)      # Keep only the year window defined by filter_data_by_year
write_csv_file(filtered_df, output_file)   # Write the filtered rows to output_file

print(f"CSV file with data from 2010 to 2013 has been written to: {output_file}")

NameError: name 'pd' is not defined

In [46]:
# Same path as `output_file`: the CSV produced by the preprocessing pipeline.
rfaSet = 'Data/wiki_RfA_2010_2013.csv'


# Load the CSV and list its columns as a quick sanity check.
df = pd.read_csv(rfaSet)
print("Columns in DataFrame:", df.columns)


Columns in DataFrame: Index(['SRC', 'TGT', 'VOT', 'RES', 'YEA', 'DAT', 'TXT'], dtype='object')


In [47]:
# Display the DataFrame loaded in the previous cell (notebook rich repr).
df

Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,"23:13, 19 April 2013",'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,"01:04, 20 April 2013",'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,"23:43, 19 April 2013",'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,"00:11, 20 April 2013",'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,"00:56, 20 April 2013","'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
32567,Atama,ZooPro,-1,-1,2010,"18:17, 22 February 2010","'''Oppose''' - Per Polargeo, and per [http://e..."
32568,Bradjamesbrown,ZooPro,-1,-1,2010,"18:18, 22 February 2010",'''Oppose''' per SilkTork's diff above. Assert...
32569,Ottawa4ever,ZooPro,0,-1,2010,"18:11, 22 February 2010","'''Neutral''' Not to pile on, neutral. I canno..."
32570,Tryptofish,ZooPro,0,-1,2010,"17:58, 22 February 2010",'''Neutral''' I've interacted with this editor...


#### Treating time DAT

Removing edges that have no valid timestamp.

In [48]:
# Parse 'DAT' with the raw 'HH:MM, DD Month YYYY' layout; values that do not match
# this format become NaT (errors='coerce'). This overwrites df['DAT'] in place.
df['DAT'] = pd.to_datetime(df['DAT'], format='%H:%M, %d %B %Y', errors='coerce')

# Select the rows whose timestamp could not be parsed
nat_rows = df[df['DAT'].isna()]

# Print those rows for inspection
print(nat_rows)


               SRC                TGT  VOT  RES   YEA DAT  \
707            NaN        Jason Quinn    0    1  2013 NaT   
708            NaN        Jason Quinn    0    1  2013 NaT   
793            NaN            Legoktm    1    1  2013 NaT   
969    Majoreditor          Lord Roem    1    1  2013 NaT   
1126           NaN      Mattythewhite   -1    1  2013 NaT   
...            ...                ...  ...  ...   ...  ..   
32310      Davidwr           Venomcuz   -1   -1  2010 NaT   
32346       Begoon      White Shadows   -1   -1  2010 NaT   
32350          NaN      White Shadows   -1   -1  2010 NaT   
32394          NaN         WikiCopter   -1   -1  2010 NaT   
32522          NaN  William S. Saturn   -1   -1  2010 NaT   

                                                     TXT  
707                                                  NaN  
708                                                  NaN  
793                                                  NaN  
969    '''Support'''. The candi

Out of 32,571 edges, 262 have an invalid or missing (NaN) timestamp; we remove them for a cleaner analysis and integration of the network.

#### Removing the edges without a valid DAT timestamp - RUN ONLY ONCE

In [49]:

# Read the CSV file
df = pd.read_csv(rfaSet)

# Parse 'DAT'. The raw dump uses 'HH:MM, DD Month YYYY', but this cell writes its
# result back to the same CSV, where pandas stores timestamps as
# 'YYYY-MM-DD HH:MM:SS'. Accepting both layouts makes the cell safe to re-run:
# with the raw format alone, a second run would turn every value into NaT and
# drop every row. Values matching neither layout still become NaT.
raw_dat = df['DAT']
parsed_raw = pd.to_datetime(raw_dat, format='%H:%M, %d %B %Y', errors='coerce')
parsed_iso = pd.to_datetime(raw_dat, format='%Y-%m-%d %H:%M:%S', errors='coerce')
df['DAT'] = parsed_raw.fillna(parsed_iso)

# Count the rows whose timestamp is missing or invalid (before removal)
na_count = df['DAT'].isna().sum()
print(f"Number of rows with NA in DAT: {na_count}")

# Remove rows where DAT is NaT (missing or invalid datetime)
df_cleaned = df.dropna(subset=['DAT'])

# Same count after removal; expected to be 0
na_count = df_cleaned['DAT'].isna().sum()
print(f"Number of rows with NA in DAT: {na_count}")

# Overwrite the CSV with the cleaned rows
write_csv_file(df_cleaned, output_file)

#preprocessing DONE


Number of rows with NA in DAT: 262
Number of rows with NA in DAT: 0


##### PREPROCESSING DONE - JUST USE THE CLEANED CSV FILE FROM NOW ON

In [50]:
# Reload the cleaned CSV from disk. read_csv is called without date parsing here,
# so 'DAT' comes back as plain text rather than as datetime values.
df = pd.read_csv("./Data/wiki_RfA_2010_2013.csv")
# Display the reloaded frame
df

Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,2013-04-19 23:13:00,'''Support''' as co-nom.
1,Cuchullain,BDD,1,1,2013,2013-04-20 01:04:00,'''Support''' as nominator.--
2,INeverCry,BDD,1,1,2013,2013-04-19 23:43:00,'''Support''' per noms.
3,Cncmaster,BDD,1,1,2013,2013-04-20 00:11:00,'''Support''' per noms. BDD is a strong contri...
4,Miniapolis,BDD,1,1,2013,2013-04-20 00:56:00,"'''Support''', with great pleasure. I work wit..."
...,...,...,...,...,...,...,...
32305,Atama,ZooPro,-1,-1,2010,2010-02-22 18:17:00,"'''Oppose''' - Per Polargeo, and per [http://e..."
32306,Bradjamesbrown,ZooPro,-1,-1,2010,2010-02-22 18:18:00,'''Oppose''' per SilkTork's diff above. Assert...
32307,Ottawa4ever,ZooPro,0,-1,2010,2010-02-22 18:11:00,"'''Neutral''' Not to pile on, neutral. I canno..."
32308,Tryptofish,ZooPro,0,-1,2010,2010-02-22 17:58:00,'''Neutral''' I've interacted with this editor...


In [52]:
# Delete the literal vote words 'Support', 'Oppose' and 'Neutral' from every
# comment (case-sensitive, plain-text match), then trim surrounding whitespace.
df['TXT'] = (
    df['TXT']
    .str.replace('Support', '', regex=False)
    .str.replace('Oppose', '', regex=False)
    .str.replace('Neutral', '', regex=False)
    .str.strip()
)

# Show the updated frame
df


Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,2013-04-19 23:13:00,'''''' as co-nom.
1,Cuchullain,BDD,1,1,2013,2013-04-20 01:04:00,'''''' as nominator.--
2,INeverCry,BDD,1,1,2013,2013-04-19 23:43:00,'''''' per noms.
3,Cncmaster,BDD,1,1,2013,2013-04-20 00:11:00,'''''' per noms. BDD is a strong contributor w...
4,Miniapolis,BDD,1,1,2013,2013-04-20 00:56:00,"'''''', with great pleasure. I work with BDD a..."
...,...,...,...,...,...,...,...
32305,Atama,ZooPro,-1,-1,2010,2010-02-22 18:17:00,"'''''' - Per Polargeo, and per [http://en.wiki..."
32306,Bradjamesbrown,ZooPro,-1,-1,2010,2010-02-22 18:18:00,'''''' per SilkTork's diff above. Asserting [[...
32307,Ottawa4ever,ZooPro,0,-1,2010,2010-02-22 18:11:00,"'''''' Not to pile on, neutral. I cannot suppo..."
32308,Tryptofish,ZooPro,0,-1,2010,2010-02-22 17:58:00,'''''' I've interacted with this editor at var...


#### Remove 0-label (neutral) edges: they cannot be interpreted well under social connection theories

In [55]:
# Keep only the signed votes (VOT != 0) and renumber the index from 0
df = df[df['VOT'] != 0].reset_index(drop=True)

# Report how many rows are left
print(f"Remaining rows: {len(df)}")

# Persist the result to the CSV, then display it
write_csv_file(df, output_file)
df


Remaining rows: 30252


Unnamed: 0,SRC,TGT,VOT,RES,YEA,DAT,TXT
0,Steel1943,BDD,1,1,2013,2013-04-19 23:13:00,'''''' as co-nom.
1,Cuchullain,BDD,1,1,2013,2013-04-20 01:04:00,'''''' as nominator.--
2,INeverCry,BDD,1,1,2013,2013-04-19 23:43:00,'''''' per noms.
3,Cncmaster,BDD,1,1,2013,2013-04-20 00:11:00,'''''' per noms. BDD is a strong contributor w...
4,Miniapolis,BDD,1,1,2013,2013-04-20 00:56:00,"'''''', with great pleasure. I work with BDD a..."
...,...,...,...,...,...,...,...
30247,Smithers7,ZooPro,-1,-1,2010,2010-02-22 16:06:00,'''[[User:Smithers7/RfA|]]''' - ZooPro mention...
30248,SilkTork,ZooPro,-1,-1,2010,2010-02-22 17:18:00,"'''''' because of the concerns already raised,..."
30249,GlassCobra,ZooPro,-1,-1,2010,2010-02-22 18:14:00,'''''' per Tanthalas and SilkTork.
30250,Atama,ZooPro,-1,-1,2010,2010-02-22 18:17:00,"'''''' - Per Polargeo, and per [http://en.wiki..."


In [57]:

# Assuming df is the DataFrame that you've already filtered and processed
# Create a signed graph
def create_signed_graph(df):
    """
    Build a directed multigraph of votes from a DataFrame.

    Every row whose SRC and TGT are both present becomes one edge SRC -> TGT.
    A MultiDiGraph is used so that several votes between the same pair of
    users are kept as parallel edges instead of overwriting each other.

    Parameters:
        df (pd.DataFrame): Frame with the columns 'SRC', 'TGT', 'VOT', 'TXT',
            'RES' and 'DAT'.

    Returns:
        G (nx.MultiDiGraph): Graph whose edges carry the attributes
            'label' (the VOT value, stored as a label rather than a weight),
            'txt'   (the TXT value),
            'admin' ('admin' when RES == 1, otherwise 'nonAdmin'),
            'DAT'   (the DAT value).
    """
    G = nx.MultiDiGraph()

    # Iterate column-wise instead of df.iterrows(): no per-row Series is built.
    columns = (df['SRC'], df['TGT'], df['VOT'], df['TXT'], df['RES'], df['DAT'])
    for src, tgt, vot, txt, res, dat in zip(*columns):
        # A missing endpoint (NaN) would otherwise become a bogus 'nan' node
        # that lumps all unidentified users together, so such rows are skipped.
        if pd.isna(src) or pd.isna(tgt):
            continue

        # RES == 1 is mapped to 'admin'; any other value to 'nonAdmin'.
        admin_status = "admin" if res == 1 else "nonAdmin"

        # add_edge creates missing endpoint nodes itself, so no add_node calls
        # are needed beforehand.
        edge_attrs = {
            'label': vot,
            'txt': txt,
            'admin': admin_status,
            'DAT': dat,
        }
        G.add_edge(src, tgt, **edge_attrs)

    return G

# Build the signed vote graph from the current `df` (one edge per row).
G = create_signed_graph(df)

In [59]:
# Export file for the graph, written relative to the current working directory.
output_gexf_file = 'signed_graph.gexf'

# Serialize G, including its edge attributes, in GEXF format.
nx.write_gexf(G, output_gexf_file)

In [13]:
#!/usr/bin/env python
# coding: utf-8

import logging
logging.basicConfig(level=logging.INFO, format='%(levelname)s: %(message)s')

import re
import unicodedata
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Ensure NLTK stopwords and lemmatizer resources are downloaded if needed
# import nltk
# nltk.download('stopwords')
# nltk.download('wordnet')

#-------------------------------------------------------
# Additional text cleaning configuration
#-------------------------------------------------------
# Domain/project-specific tokens to drop in addition to the NLTK English list
custom_stops = {
    'per', 'wp', 'notnow', 'nom', 'user', 'admin', 'candidate', 'wikipedia',
    'rfaa', 'rfa', 'en', 'http', 'org', 'com', 'www', 'index', 'content',
    'page', 'talk', 'oldid', 'diff'
}

# Final stopword set used by clean_text(): NLTK English stopwords + custom ones
stop_words = set(stopwords.words('english')) | custom_stops

# Shared lemmatizer instance used by clean_text()
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """
    Normalize one RfA comment into a space-separated string of lemmas.

    Steps, in order: NFKC unicode normalization; removal of wiki markup
    (runs of apostrophes, [[links]], [http ...] links, raw URLs, {{templates}},
    <html> tags); replacement of every character that is not an ASCII letter,
    digit or whitespace by a space; lowercasing; tokenization; removal of
    stopwords and one-character tokens; lemmatization; and a second removal
    of stopwords / one-character tokens on the lemmas.

    The second filter matters because lemmatization can turn a token that
    passed the first filter into a stopword (e.g. 'admins' -> 'admin') or
    into a single character.

    Parameters:
        text: The raw comment. Anything that is not a ``str`` (e.g. NaN)
            yields an empty string.

    Returns:
        str: The cleaned tokens joined by single spaces.

    Note:
        Relies on the module-level ``stop_words`` and ``lemmatizer``.
        ``word_tokenize`` additionally requires NLTK's punkt tokenizer data
        (punkt / punkt_tab) to be installed.
    """
    if not isinstance(text, str):
        return ""

    # Normalize unicode
    text = unicodedata.normalize('NFKC', text)

    # Applied in this exact order: markup is stripped first so that the final
    # catch-all pattern does not break brackets/braces apart prematurely.
    substitutions = (
        (r"'{2,}", ''),              # sequences of apostrophes (bold/italic)
        (r"\[\[.*?\]\]", ' '),       # double-bracket wiki links
        (r"\[http.*?\]", ' '),       # http links in brackets
        (r"http\S+", ' '),           # raw URLs
        (r"\{\{.*?\}\}", ' '),       # templates
        (r"<.*?>", ' '),             # HTML tags
        (r"[^a-zA-Z0-9\s]", ' '),    # anything non-alphanumeric (except space)
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    text = text.lower().strip()

    def _keep(token):
        # Drop stopwords and one-character tokens.
        return token not in stop_words and len(token) > 1

    tokens = [token for token in word_tokenize(text) if _keep(token)]
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]

    # Filter again: a lemma may itself be a stopword or a single character.
    return ' '.join(lemma for lemma in lemmas if _keep(lemma))

#-------------------------------------------------------
# Preprocessing Functions
#-------------------------------------------------------

def process_file_to_dataframe(file_path):
    """
    Read the wiki-RfA raw text file and parse it into a structured DataFrame.

    Entries are separated by an empty line and consist of 'KEY:VALUE' lines.
    Each line is split on its first colon only, so a value may contain
    further colons (e.g. '23:13, 19 April 2013').

    Parameters:
        file_path (str): Path of the UTF-8 encoded raw file.

    Returns:
        pd.DataFrame: One row per entry, one column per key, every value a
        whitespace-stripped string. Entries without any 'KEY:VALUE' line
        (caused by extra blank lines or an empty file) are skipped so they
        do not show up as all-NaN rows.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    data = []
    # Split by empty lines to separate entries
    for entry in content.strip().split('\n\n'):
        entry_dict = {}
        for line in entry.split('\n'):
            if ":" in line:
                key, value = line.split(":", 1)
                entry_dict[key.strip()] = value.strip()
        # Ignore entries that yielded no field at all.
        if entry_dict:
            data.append(entry_dict)

    df = pd.DataFrame(data)
    return df


def filter_data_by_year(df, start_year=2010, end_year=2013):
    """
    Filter the DataFrame for entries between the specified years (inclusive).

    Parameters:
        df (pd.DataFrame): Frame with a 'YEA' column (strings or numbers).
        start_year (int): First year to keep, inclusive.
        end_year (int): Last year to keep, inclusive.

    Returns:
        pd.DataFrame: An independent copy of the matching rows, with 'YEA'
        converted to numeric. Unparseable years become NaN and are excluded.
        The frame passed in is not modified.
    """
    # Convert on a separate Series so the caller's 'YEA' column stays as it was.
    years = pd.to_numeric(df['YEA'], errors='coerce')
    in_range = (years >= start_year) & (years <= end_year)

    # .copy() detaches the result from `df`, so later column assignments on it
    # do not raise SettingWithCopyWarning.
    filtered_df = df.assign(YEA=years).loc[in_range].copy()
    return filtered_df


def clean_datetime(df, datetime_col='DAT', datetime_format='%H:%M, %d %B %Y'):
    """
    Parse a text column into datetimes and drop the rows that fail to parse.

    Parameters:
        df (pd.DataFrame): Frame containing ``datetime_col``.
        datetime_col (str): Name of the column to parse. Defaults to 'DAT'.
        datetime_format (str): strptime-style format the values must match.

    Returns:
        pd.DataFrame: A new frame in which ``datetime_col`` holds datetime
        values and every row with a missing or unparseable value has been
        removed. The frame passed in is not modified.
    """
    # errors='coerce' maps every value that does not match the format to NaT.
    parsed = pd.to_datetime(df[datetime_col], format=datetime_format, errors='coerce')

    na_count = parsed.isna().sum()
    if na_count > 0:
        logging.warning(f"Found {na_count} rows with invalid datetime in '{datetime_col}'. Removing them.")

    # Work on a copy: `df` is often a filtered slice of another frame, and
    # writing into it would mutate the caller's data and trigger
    # SettingWithCopyWarning.
    result = df.copy()
    result[datetime_col] = parsed
    return result.dropna(subset=[datetime_col])


def remove_keywords_from_text(df, text_col='TXT', keywords=None):
    """
    Delete every literal occurrence of the given keywords from a text column.

    Matching is plain-text and case-sensitive; afterwards leading and trailing
    whitespace is stripped. The column is updated in place on ``df`` and the
    same frame is returned.

    Parameters:
        df (pd.DataFrame): Frame containing ``text_col``.
        text_col (str): Name of the text column. Defaults to 'TXT'.
        keywords (list[str] | None): Substrings to delete; ``None`` means
            'Support', 'Oppose' and 'Neutral'.

    Returns:
        pd.DataFrame: ``df`` itself, with ``text_col`` rewritten.
    """
    targets = ['Support', 'Oppose', 'Neutral'] if keywords is None else keywords

    stripped = df[text_col]
    for word in targets:
        stripped = stripped.str.replace(word, '', regex=False)

    df[text_col] = stripped.str.strip()
    return df


def remove_zero_votes(df, vote_col='VOT'):
    """
    Remove the rows whose vote equals 0.

    The comparison is made on a numeric view of the column, so it also works
    when the votes are still strings ('0', '1', '-1'), as they are right
    after parsing the raw text file. Comparing those strings with the integer
    0 directly would never match and no row would be removed.

    Parameters:
        df (pd.DataFrame): Frame containing ``vote_col``.
        vote_col (str): Name of the vote column. Defaults to 'VOT'.

    Returns:
        pd.DataFrame: An independent copy of the rows whose vote is not 0.
        The vote column keeps its original values/dtype. Rows whose vote is
        missing or not numeric are kept.
    """
    initial_count = len(df)

    # Non-numeric values become NaN; NaN != 0 is True, so those rows stay.
    votes = pd.to_numeric(df[vote_col], errors='coerce')
    # .copy() so callers can add columns without SettingWithCopyWarning.
    df = df[votes != 0].copy()

    removed_count = initial_count - len(df)
    if removed_count > 0:
        logging.info(f"Removed {removed_count} rows with zero votes (VOT=0).")
    return df


def write_csv_file(df, output_file):
    """
    Save ``df`` as a UTF-8 encoded CSV (index column omitted) and log the path.

    Parameters:
        df (pd.DataFrame): Frame to save.
        output_file (str): Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=output_file, encoding='utf-8', index=False)
    message = f"CSV file successfully written to {output_file}"
    logging.info(message)


#-------------------------------------------------------
# Main Execution
#-------------------------------------------------------

if __name__ == '__main__':
    # End-to-end preprocessing: raw text dump -> filtered, cleaned CSV.
    file_path = 'Data/ground/wiki-RfA.txt'
    output_file = 'Data/wiki_RfA_2005_2013.csv'
    # Inclusive year window; it matches the years in the output file name and
    # is the single source for both the filter call and its log message.
    start_year, end_year = 2005, 2013

    # Step 1: Parse raw file
    logging.info("Processing raw file into DataFrame...")
    df = process_file_to_dataframe(file_path)

    # Step 2: Filter by year
    logging.info(f"Filtering data by year ({start_year}-{end_year})...")
    df = filter_data_by_year(df, start_year, end_year)

    # Step 3: Clean DAT column
    logging.info("Cleaning DAT column...")
    df = clean_datetime(df, datetime_col='DAT', datetime_format='%H:%M, %d %B %Y')

    # Step 4: Remove 'Support', 'Oppose', 'Neutral' from TXT
    logging.info("Removing specific keywords from TXT...")
    df = remove_keywords_from_text(df, text_col='TXT', keywords=['Support', 'Oppose', 'Neutral'])

    # Step 5: Remove zero-vote rows
    logging.info("Removing zero-vote rows...")
    df = remove_zero_votes(df, vote_col='VOT')

    # Step 6: Apply the advanced text cleaning to a new column 'cleaned_TXT'.
    # assign() builds a new frame instead of writing into what may be a
    # filtered slice, which avoids SettingWithCopyWarning.
    logging.info("Applying advanced text cleaning to 'TXT' column...")
    df = df.assign(cleaned_TXT=df['TXT'].apply(clean_text))

    # Step 7: Write final cleaned DataFrame to CSV
    write_csv_file(df, output_file)

    # Optionally show a sample
    logging.info("Preprocessing complete. Here is a sample of the cleaned DataFrame:")
    logging.info(df[['TXT', 'cleaned_TXT']].head(10))


User HJ Mitchell ran in 3 different years: 2011, 2010, 2009.
User Ironholds ran in 4 different years: 2011, 2010, 2009, 2008.
User Everyking ran in 5 different years: 2010, 2009, 2008, 2007, 2006.
User SarekOfVulcan ran in 2 different years: 2011, 2008.
User The Thing That Should Not Be ran in 1 year: 2010.
User Connormah ran in 1 year: 2010.
User SarahStierch ran in 1 year: 2012.
User Lord Roem ran in 2 different years: 2013, 2012.
User Drmies ran in 1 year: 2011.
User Σ ran in 1 year: 2012.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['Year'] = user_data['DAT'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['Year'] = user_data['DAT'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  user_data['Year'] = user_data['DAT'].dt.year
A value is trying to be set on a copy of a slice from a DataFrame.
Try