In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Read the data from the csv files
x_data = pd.read_csv('Data/x.csv')
edge_data = pd.read_csv('Data/edge_index.csv')

In [3]:
# Merge the two datasets based on the 'index' column
data = pd.merge(x_data, edge_data, left_on='index', right_on='index_x')
data.drop(['index_x', 'index_y'], axis=1, inplace=True)

In [4]:
# Feature engineering
# Calculate the degree of each node
degree = data['index'].value_counts().rename('degree')
data = data.merge(degree.to_frame(), left_on='index', right_index=True)

In [5]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(data.drop('LABEL', axis=1), data['LABEL'], stratify=data['LABEL'], test_size=0.3, random_state=42)

In [6]:
# Train the model
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42)
rf.fit(X_train, y_train)

In [7]:
# Evaluate the model
y_pred = rf.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)

Mean Squared Error: 0.0


In [8]:
# Interpret the model
# You can analyze the feature importances to gain insights into the natural communities of Fetch users in Wisconsin
importances = pd.Series(rf.feature_importances_, index=X_train.columns)
print('Feature Importances:', importances)

Feature Importances: Unnamed: 0_x    0.0
index           0.0
Unnamed: 0_y    0.0
degree          0.0
dtype: float64
