## Dummy example - Working smoothly

In [None]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=769217ee1e83a63c3426292f12302f40ce9c0432f83a132d69ac072aacc4814d
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [None]:
!pip install graphframes

Collecting graphframes
  Downloading graphframes-0.6-py2.py3-none-any.whl (18 kB)
Collecting nose (from graphframes)
  Downloading nose-1.3.7-py3-none-any.whl (154 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.7/154.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: nose, graphframes
Successfully installed graphframes-0.6 nose-1.3.7


In [None]:
from pyspark.sql import SparkSession
from graphframes import GraphFrame

# Start (or reuse) a Spark session for the demo. The GraphFrames JVM package is
# requested through `spark.jars.packages` at session creation.
# NOTE(review): the coordinate string looks like a Spark 3.0 / Scala 2.12 build --
# confirm it is compatible with the PySpark version installed above.
spark = (
    SparkSession.builder
    .appName("GraphFramesExample")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12")
    .getOrCreate()
)

In [None]:
# Build a three-vertex demo graph, display it, and run one simple vertex filter.
# Exceptions are intentionally NOT caught: a failing step should surface its full
# traceback and stop "Run All" rather than be reduced to a one-line printed message.
try:
    # Vertex table: GraphFrame requires an "id" column.
    vertices = spark.createDataFrame(
        [
            ("A", "Alice", 34),
            ("B", "Bob", 36),
            ("C", "Charlie", 30),
        ],
        ["id", "name", "age"],
    )

    # Edge table: GraphFrame requires "src" and "dst" columns.
    edges = spark.createDataFrame(
        [
            ("A", "B", "friend"),
            ("B", "C", "follow"),
            ("C", "A", "friend"),
        ],
        ["src", "dst", "relationship"],
    )

    graph = GraphFrame(vertices, edges)

    # Print vertices and edges to verify they are correctly loaded
    print("Vertices:")
    graph.vertices.show()

    print("Edges:")
    graph.edges.show()

    # Query: all vertices where age > 30
    filtered_vertices = graph.vertices.filter("age > 30")
    print("Vertices with age > 30:")
    filtered_vertices.show()
finally:
    # Always release the demo session, even if a step above failed.
    spark.stop()



Vertices:
+---+-------+---+
| id|   name|age|
+---+-------+---+
|  A|  Alice| 34|
|  B|    Bob| 36|
|  C|Charlie| 30|
+---+-------+---+

Edges:
+---+---+------------+
|src|dst|relationship|
+---+---+------------+
|  A|  B|      friend|
|  B|  C|      follow|
|  C|  A|      friend|
+---+---+------------+

Vertices with age > 30:
+---+-----+---+
| id| name|age|
+---+-----+---+
|  A|Alice| 34|
|  B|  Bob| 36|
+---+-----+---+



In [None]:
## Trying out something new

## Cliques - K:3

In [None]:
!pip3 install findspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Installing collected packages: findspark
Successfully installed findspark-2.0.1


In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from graphframes import GraphFrame

# Initialize (or reuse) the Spark session with the GraphFrames package
spark = (
    SparkSession.builder
    .appName("FindCliques")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12")
    .getOrCreate()
)

# Load the edge-list CSV file into a DataFrame
edgelist_df = spark.read.csv("/content/weighted_edges.csv", header=True, inferSchema=True)

# Assumes the CSV has columns 'Source', 'Target', and 'Weight'.
# The vertex set is every id that appears as either endpoint of an edge.
vertices_df = (
    edgelist_df.select(col("Source").alias("id"))
    .union(edgelist_df.select(col("Target").alias("id")))
    .distinct()
)
edges_df = edgelist_df.select(col("Source").alias("src"), col("Target").alias("dst"), col("Weight"))

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)

# Find 3-cliques (triangles) with GraphFrames motif finding (this is a pattern
# match, not a maximal-clique algorithm).
# Motif finding does not require differently named vertices to be distinct, so if
# the edge list contains self-loops the raw pattern also matches degenerate rows in
# which one vertex fills two slots. The filter keeps only genuine 3-vertex matches.
# NOTE(review): edges are matched as directed (a->b, a->c, b->c) -- confirm this is
# the intended orientation for this edge list.
cliques = (
    g.find("(a)-[]->(b); (a)-[]->(c); (b)-[]->(c)")
    .filter("a.id != b.id AND a.id != c.id AND b.id != c.id")
)

# Show the cliques
cliques.show()



+-----+------+------+
|    a|     b|     c|
+-----+------+------+
|{148}|{2876}|{5127}|
|{148}|{2876}|{3890}|
|{148}|{2876}|{4998}|
|{148}|{2876}|{3614}|
|{148}|{2876}|{3848}|
|{148}|{2876}|{4145}|
|{148}|{3890}|{5127}|
|{148}|{3890}|{4145}|
|{148}|{3890}|{4022}|
|{148}|{2054}|{5127}|
|{148}|{2054}|{2876}|
|{148}|{2054}|{3890}|
|{148}|{2054}|{4998}|
|{148}|{2054}|{2537}|
|{148}|{2054}|{3614}|
|{148}|{2054}|{3848}|
|{148}|{2054}|{2813}|
|{148}|{2054}|{4145}|
|{148}|{2054}|{2308}|
|{148}|{2054}|{4022}|
+-----+------+------+
only showing top 20 rows



In [None]:
# Total number of rows in `cliques`. count() is a Spark action, so it evaluates the
# whole motif query, not just the rows that show() previews.
cliques.count()

8867535

## Cliques - K:4

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from graphframes import GraphFrame

# Initialize (or reuse) the Spark session with the GraphFrames package
spark = (
    SparkSession.builder
    .appName("FindCliques")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12")
    .getOrCreate()
)

# Load the edge-list CSV file into a DataFrame
edgelist_df = spark.read.csv("/content/weighted_edges.csv", header=True, inferSchema=True)

# Assumes the CSV has columns 'Source', 'Target', and 'Weight'.
# The vertex set is every id that appears as either endpoint of an edge.
vertices_df = (
    edgelist_df.select(col("Source").alias("id"))
    .union(edgelist_df.select(col("Target").alias("id")))
    .distinct()
)
edges_df = edgelist_df.select(col("Source").alias("src"), col("Target").alias("dst"), col("Weight"))

# Create a GraphFrame
g = GraphFrame(vertices_df, edges_df)

# Find 4-cliques with GraphFrames motif finding (this is a pattern match, not a
# maximal-clique algorithm): all six edges among a, b, c, d must be present.
# Motif finding does not require differently named vertices to be distinct, so
# self-loops in the edge list make the raw pattern match degenerate rows such as
# c == d. The filter requires all four vertices to be pairwise different.
# NOTE(review): edges are matched as directed (lower-named vertex -> higher-named
# vertex) -- confirm this is the intended orientation for this edge list.
cliques = (
    g.find("(a)-[]->(b); (a)-[]->(c); (a)-[]->(d); (b)-[]->(c); (b)-[]->(d); (c)-[]->(d)")
    .filter(
        "a.id != b.id AND a.id != c.id AND a.id != d.id "
        "AND b.id != c.id AND b.id != d.id AND c.id != d.id"
    )
)

# Show the cliques
cliques.show()

+-----+------+------+------+
|    a|     b|     c|     d|
+-----+------+------+------+
|{148}|{2876}|{3890}|{5127}|
|{148}|{2876}|{3890}|{4145}|
|{148}|{2876}|{3614}|{3890}|
|{148}|{2876}|{3614}|{4998}|
|{148}|{2876}|{3614}|{3848}|
|{148}|{2876}|{3614}|{4145}|
|{148}|{2876}|{3848}|{5127}|
|{148}|{2876}|{3848}|{3890}|
|{148}|{2876}|{3848}|{4998}|
|{148}|{2876}|{3848}|{3848}|
|{148}|{2876}|{3848}|{4145}|
|{148}|{2876}|{4145}|{4998}|
|{148}|{2876}|{4145}|{4145}|
|{148}|{3890}|{4145}|{4145}|
|{148}|{3890}|{4022}|{4145}|
|{148}|{2054}|{2876}|{5127}|
|{148}|{2054}|{2876}|{3890}|
|{148}|{2054}|{2876}|{4998}|
|{148}|{2054}|{2876}|{3614}|
|{148}|{2054}|{2876}|{3848}|
+-----+------+------+------+
only showing top 20 rows



In [None]:
# Total number of rows in `cliques` for the 4-vertex pattern. count() is a Spark
# action, so it evaluates the whole motif query.
cliques.count()

## Cliques - K:3 (repeated)



In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from graphframes import GraphFrame

# Initialize (or reuse) the Spark session with the GraphFrames package
spark = (
    SparkSession.builder
    .appName("FindCliques")
    .config("spark.jars.packages", "graphframes:graphframes:0.8.2-spark3.0-s_2.12")
    .getOrCreate()
)

# Load the edge-list CSV file into a DataFrame
edgelist_df = spark.read.csv("/content/weighted_edges.csv", header=True, inferSchema=True)

# Assumes the CSV has columns 'Source', 'Target', and 'Weight'.
# The vertex set is every id that appears as either endpoint of an edge.
vertices_df = (
    edgelist_df.select(col("Source").alias("id"))
    .union(edgelist_df.select(col("Target").alias("id")))
    .distinct()
)
edges_df = edgelist_df.select(col("Source").alias("src"), col("Target").alias("dst"), col("Weight"))

# Create a GraphFrame; `g` is also the graph the Label Propagation cells operate on.
g = GraphFrame(vertices_df, edges_df)

# Find 3-cliques (triangles) with GraphFrames motif finding (this is a pattern
# match, not a maximal-clique algorithm).
# Motif finding does not require differently named vertices to be distinct, so if
# the edge list contains self-loops the raw pattern also matches degenerate rows in
# which one vertex fills two slots. The filter keeps only genuine 3-vertex matches.
# NOTE(review): edges are matched as directed (a->b, a->c, b->c) -- confirm this is
# the intended orientation for this edge list.
cliques = (
    g.find("(a)-[]->(b); (a)-[]->(c); (b)-[]->(c)")
    .filter("a.id != b.id AND a.id != c.id AND b.id != c.id")
)

# Show the cliques
cliques.show()



+-----+------+------+
|    a|     b|     c|
+-----+------+------+
|{148}|{2876}|{5127}|
|{148}|{2876}|{3890}|
|{148}|{2876}|{4998}|
|{148}|{2876}|{3614}|
|{148}|{2876}|{3848}|
|{148}|{2876}|{4145}|
|{148}|{3890}|{5127}|
|{148}|{3890}|{4145}|
|{148}|{3890}|{4022}|
|{148}|{2054}|{5127}|
|{148}|{2054}|{2876}|
|{148}|{2054}|{3890}|
|{148}|{2054}|{4998}|
|{148}|{2054}|{2537}|
|{148}|{2054}|{3614}|
|{148}|{2054}|{3848}|
|{148}|{2054}|{2813}|
|{148}|{2054}|{4145}|
|{148}|{2054}|{2308}|
|{148}|{2054}|{4022}|
+-----+------+------+
only showing top 20 rows



## Label Propagation

In [None]:
# Run label propagation on `g` for community detection.
# The result is persisted at creation because several later actions (count, show,
# filter) reuse it; without caching each action would re-run the propagation.
# NOTE(review): maxIter=1 performs a single propagation step -- confirm that one
# iteration is enough for the labels to be meaningful.
communities = g.labelPropagation(maxIter=1).persist()



In [None]:
# Number of rows in the label-propagation result.
communities.count()

5255

In [None]:
# Mark the label assignments for caching, then preview the first 10 rows.
communities.persist()
communities.show(10)

+----+-----+
|  id|label|
+----+-----+
|3558|  182|
|1084| 3322|
|4685| 2766|
|4904| 1160|
|3702| 3044|
|4551| 1115|
|3007| 4551|
| 667| 3026|
|1053| 3753|
|1894|  182|
+----+-----+
only showing top 10 rows



In [None]:
# Community label to inspect.
selected_label = 182

# Keep only the rows whose propagated label equals the chosen one.
same_community = communities.where(communities["label"] == selected_label)

# Number of nodes in that community.
same_community.count()

59

In [None]:
# Preview the first 10 members of the selected community.
# `same_community` is defined by the preceding cell, which must have been executed
# in the current kernel before this one runs.
same_community.show(10)

NameError: name 'same_community' is not defined