# WeFarm data set - Farmer ID

This notebook processes the WeFarm data set and creates a leaderboard of the top contributors.

## Definitions

In [9]:
# Notebook configuration: project directories, input file, and CSV column names.
from pathlib import Path

# All directories are resolved relative to the current working directory.
ROOT_DIR = Path().resolve()
DATA_DIR = ROOT_DIR.joinpath('data')
RESULTS_DIR = ROOT_DIR.joinpath('results')
DATA_FILE = DATA_DIR.joinpath('b0cd514b-b9cc-4972-a0c2-c91726e6d825.csv')

# CSV layout. The spelling of CSV_DELIMETER is kept as-is because the
# graph-building cell refers to the constant by this name.
CSV_DELIMETER = ','
QID_COL = 'question_id'
ATTRS = ('question_language', 'question_topic')
FROM_COL = 'question_user_id'
TO_COL = 'response_user_id'

## Confirm directories and file

In [2]:
# Make sure the data and results directories exist, then list what each contains.
for directory in (DATA_DIR, RESULTS_DIR):
    directory.mkdir(exist_ok=True)

    # iterdir() yields entries in arbitrary order; sort so the listing is
    # stable from run to run.
    print(f"Files in '{directory}':")
    for file in sorted(directory.iterdir()):
        print(f"\t{file.name}")
    print()

# Report the input file once, after both directories have been listed
# (not once per directory), and flag it if it is missing.
print(f"Data file: {DATA_FILE}")
if not DATA_FILE.is_file():
    print(f"WARNING: data file not found: {DATA_FILE}")

Files in 'C:\Users\jbhan\OneDrive\Documents\Visual Studio 2019\Projects\Python\datakit-smallholder-farmers-fall-2025\Challenge 3_Community Leaders\jayr\data':
	b0cd514b-b9cc-4972-a0c2-c91726e6d825.csv
	meeting_notes.txt
	Producers Direct DataKit Brief (external).md

Files in 'C:\Users\jbhan\OneDrive\Documents\Visual Studio 2019\Projects\Python\datakit-smallholder-farmers-fall-2025\Challenge 3_Community Leaders\jayr\results':



## Create a MultiDiGraph of the questions

In [11]:
import csv
from src.graph import MultiDiGraph

# Build the graph: one edge per CSV row, pointing from the user who asked
# the question (FROM_COL) to the user who responded (TO_COL).
graph = MultiDiGraph()

# newline='' is the mode the csv module documents for reading, so that
# line endings embedded inside quoted fields reach the parser untouched.
with open(DATA_FILE, newline='', encoding='utf-8') as f:
    reader = csv.DictReader(f, delimiter=CSV_DELIMETER)

    for row in reader:
        from_node = row[FROM_COL]
        to_node = row[TO_COL]

        # The question id is passed as the third argument of add_edge; how
        # MultiDiGraph uses it is defined in src.graph.
        # NOTE(review): the ATTRS columns are not attached to the edge here.
        edge_id = row[QID_COL]

        graph.add_edge(from_node, to_node, edge_id)

## Confirm the graph was created

In [13]:
# Sanity check: report how many edges the graph holds after loading.
edges = graph.get_all_edges()
edge_total = len(edges)
print(edge_total)


20304843


## Find the most active askers and answerers

In [None]:
# Pair every node with its in-degree, then show the ten largest.
# Edges are added from question_user_id to response_user_id, so (assuming
# in_degree counts incoming edges) this ranks users by responses given.
in_degrees = [(user, graph.in_degree(user)) for user in graph.nodes]
nodes_by_indegree = list(in_degrees)
nodes_by_indegree.sort(key=lambda entry: entry[1], reverse=True)
print(nodes_by_indegree[:10])

In [None]:
# Pair every node with its out-degree, then show the ten largest.
# Edges are added from question_user_id to response_user_id, so (assuming
# out_degree counts outgoing edges) this ranks users by responses received
# to the questions they asked.
out_degrees = [(user, graph.out_degree(user)) for user in graph.nodes]
nodes_by_outdegree = list(out_degrees)
nodes_by_outdegree.sort(key=lambda entry: entry[1], reverse=True)
print(nodes_by_outdegree[:10])

## Data Visualization

In [None]:
from src.visualization import setup_plot_style, plot_distributions, plot_correlation_heatmap

# Set up visualization
%matplotlib inline
setup_plot_style()

#  visualization
plot_distributions(df)
plot_correlation_heatmap(df)

## Save Results

In [None]:
# Example: Save processed data
# processed_path = RESULTS_DIR / 'processed_data.csv'
# df.to_csv(processed_path, index=False)
# print(f'Saved processed data to {processed_path}')