# Graph Data
Performed detailed data preparation and processing on graph (edge-list) data

In [None]:
!pip install h2o


Collecting h2o
  Downloading h2o-3.44.0.1.tar.gz (257.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m257.4/257.4 MB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: h2o
  Building wheel for h2o (setup.py) ... [?25l[?25hdone
  Created wheel for h2o: filename=h2o-3.44.0.1-py2.py3-none-any.whl size=257484150 sha256=503b68e4eb801cff2c6c79d11e697750041de76c1d7594d5987d8d9b3a20a68e
  Stored in directory: /root/.cache/pip/wheels/d9/9b/ca/7345b72d17e1e17da37239d70631c3214ec9e541b0c9e700e2
Successfully built h2o
Installing collected packages: h2o
Successfully installed h2o-3.44.0.1


## Step 1: EDA

In [1]:
import networkx as nx
import pandas as pd

# Path of the uploaded edge-list CSV
edgelist_path = '/content/edgelist.csv'

# Read the edge list into a DataFrame
edgelist_df = pd.read_csv(edgelist_path)

# Preview the first five rows
edgelist_df.head(n=5)


Unnamed: 0,isForked,isTopContributor,repo_id,dev_id
0,False,True,0,0
1,False,True,1,0
2,False,True,2,0
3,False,True,3,0
4,True,,4,2


In [2]:
# Data-quality summary: each column's dtype and its count of null entries
data_types = edgelist_df.dtypes
missing_values = edgelist_df.isna().sum()

(missing_values, data_types)


(isForked             0
 isTopContributor    24
 repo_id              0
 dev_id               0
 dtype: int64,
 isForked              bool
 isTopContributor    object
 repo_id              int64
 dev_id               int64
 dtype: object)

## Step 2: Data Processing

In [3]:
# Impute missing values in 'isTopContributor' with the most frequent value,
# then convert the column to Boolean data type.
#
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected column: that form is chained
# assignment, which raises a FutureWarning in pandas 2.x and does not update
# `edgelist_df` at all once Copy-on-Write is enabled (in which case the
# remaining NaNs would silently become True in the bool cast).
most_frequent_value = edgelist_df['isTopContributor'].mode()[0]
edgelist_df['isTopContributor'] = (
    edgelist_df['isTopContributor']
    .fillna(most_frequent_value)
    .astype('bool')
)

# Check for missing values and data types again to confirm changes
missing_values_after = edgelist_df.isnull().sum()
data_types_after = edgelist_df.dtypes

missing_values_after, data_types_after


(isForked            0
 isTopContributor    0
 repo_id             0
 dev_id              0
 dtype: int64,
 isForked             bool
 isTopContributor     bool
 repo_id             int64
 dev_id              int64
 dtype: object)

## Step 3: Feature Extraction

In [5]:
# Create a graph from the edge list: every row links its 'repo_id' to its
# 'dev_id', with 'isForked' and 'isTopContributor' kept as edge attributes.
# NOTE(review): both id columns are plain integers used directly as node
# labels, so a repo and a developer that share an id end up as the same node
# (the first row, repo_id=0 / dev_id=0, would be a self-loop). Confirm that
# the two columns really share one id namespace; if not, the labels need to
# be disambiguated before computing centralities.
G = nx.from_pandas_edgelist(edgelist_df, 'repo_id', 'dev_id', ['isForked', 'isTopContributor'])

# Calculate graph-based features (each call returns a dict keyed by node)
degree_centrality = nx.degree_centrality(G)
betweenness_centrality = nx.betweenness_centrality(G)
closeness_centrality = nx.closeness_centrality(G)

# Create a DataFrame for these features.
# Each score is looked up by node rather than taken from the dicts' .values(),
# so every row is guaranteed to describe a single node even if the three
# dicts were ever to iterate in different orders.
nodes = list(G.nodes)
features_df = pd.DataFrame({
    'Node': nodes,
    'DegreeCentrality': [degree_centrality[node] for node in nodes],
    'BetweennessCentrality': [betweenness_centrality[node] for node in nodes],
    'ClosenessCentrality': [closeness_centrality[node] for node in nodes]
})

# Show the first few rows of the features DataFrame
features_df.head()


Unnamed: 0,Node,DegreeCentrality,BetweennessCentrality,ClosenessCentrality
0,0,0.001693,0.104013,0.155527
1,1,0.000339,0.0,0.1346
2,2,0.001016,0.103095,0.140592
3,3,0.00237,0.358064,0.174993
4,4,0.001355,0.12955,0.130249


## Step 4: Clustering & Anomaly detection

In [6]:
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# The three centrality columns are the inputs to both estimators below.
# The slice is taken before 'Cluster' / 'Anomaly' are added to features_df.
centrality_columns = ['DegreeCentrality', 'BetweennessCentrality', 'ClosenessCentrality']
centrality_features = features_df[centrality_columns]

# Step 4: Clustering using K-means.
# n_init is passed explicitly because scikit-learn 1.4 changed its default
# from 10 to 'auto'; pinning it keeps the cluster labels the same across
# scikit-learn versions and avoids the FutureWarning on 1.2/1.3.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
features_df['Cluster'] = kmeans.fit_predict(centrality_features)

# Anomaly Detection using Isolation Forest.
# fit_predict labels outliers as -1 and inliers as 1; contamination=0.1 sets
# the expected share of outliers.
iso_forest = IsolationForest(contamination=0.1, random_state=42)
features_df['Anomaly'] = iso_forest.fit_predict(centrality_features)




## Step 5: Model Building & AutoML

In [7]:
# Placeholder target: 'IsInfluential' is drawn at random (seeded) from {0, 1},
# one label per row, so it carries no information about the node. Accuracy
# close to 0.5 is therefore the expected outcome of the model below.
np.random.seed(42)
n_nodes = len(features_df)
features_df['IsInfluential'] = np.random.choice([0, 1], size=n_nodes)

# Target vector and feature matrix, then an 80/20 train/test split
y = features_df['IsInfluential']
X = features_df[['DegreeCentrality', 'BetweennessCentrality', 'ClosenessCentrality', 'Cluster', 'Anomaly']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a random forest on the training split and predict the held-out rows
rf_classifier = RandomForestClassifier(random_state=42).fit(X_train, y_train)
y_pred = rf_classifier.predict(X_test)

# Fraction of held-out rows predicted correctly
accuracy = accuracy_score(y_test, y_pred)

# Display the labelled feature table next to the hold-out accuracy
features_df.head(), accuracy

(   Node  DegreeCentrality  BetweennessCentrality  ClosenessCentrality  \
 0     0          0.001693               0.104013             0.155527   
 1     1          0.000339               0.000000             0.134600   
 2     2          0.001016               0.103095             0.140592   
 3     3          0.002370               0.358064             0.174993   
 4     4          0.001355               0.129550             0.130249   
 
    Cluster  Anomaly  IsInfluential  
 0        1       -1              0  
 1        2       -1              1  
 2        1       -1              0  
 3        1       -1              0  
 4        1       -1              0  ,
 0.49915397631133673)