<a href="https://colab.research.google.com/github/honyango/test/blob/main/DISTRIBUTION_MAPPING_REDCUCING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Install Java, Spark, and FindSpark
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz -O spark-3.3.2-bin-hadoop3.tgz

# Verify downloaded file size
import os
file_size = os.path.getsize('spark-3.3.2-bin-hadoop3.tgz')
print(f"Downloaded file size: {file_size} bytes")
# Expected size for spark-3.3.2-bin-hadoop3.tgz is around 250-300 MB

!tar xf spark-3.3.2-bin-hadoop3.tgz
!pip install -q findspark

# Set environment variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.3.2-bin-hadoop3"

import findspark
findspark.init()

Downloaded file size: 299360284 bytes


In [3]:
from pyspark.sql import SparkSession

# Build the SparkSession step by step; both memory options are set to "12g"
# before the session is created.
session_builder = SparkSession.builder.appName("ColabSparkSetup")
for memory_option in ("spark.driver.memory", "spark.executor.memory"):
    session_builder = session_builder.config(memory_option, "12g")
spark = session_builder.getOrCreate()

print("Spark session created successfully!")
# Leave the session as the last expression so the notebook renders its summary.
spark

Spark session created successfully!


In [4]:
# Upload manually
from google.colab import files
# Prompt for an upload only when the CSV is not on disk yet. Uploading a file
# that already exists makes Colab save it under a new name such as
# "linkedin_job_postings (1).csv", which the read below never picks up.
# Delete the existing file first if it has to be replaced.
if os.path.exists("linkedin_job_postings.csv"):
    uploaded = {}  # nothing new was uploaded on this run
else:
    uploaded = files.upload()

# Example: read CSV into Spark DataFrame
df = spark.read.csv("linkedin_job_postings.csv", header=True, inferSchema=True)
df.show(5)

Saving linkedin_job_postings.csv to linkedin_job_postings (1).csv
+--------------------+--------------------+-----------+-------+---------------+--------------------+--------------------+--------------------+----------+-----------+--------------+--------------------+----------+--------+
|            job_link| last_processed_time|got_summary|got_ner|is_being_worked|           job_title|             company|        job_location|first_seen|search_city|search_country|     search_position| job_level|job_type|
+--------------------+--------------------+-----------+-------+---------------+--------------------+--------------------+--------------------+----------+-----------+--------------+--------------------+----------+--------+
|https://www.linke...|2024-01-21 07:12:...|          t|      t|              f|Account Executive...|                  BD|       San Diego, CA|2024-01-15|   Coronado| United States|         Color Maker|Mid senior|  Onsite|
|https://www.linke...|2024-01-21 07:39:...|   

In [None]:
## DATA DISTRIBUTION

In [5]:
import pandas as pd
import os

# Read the full postings file into memory with pandas
df = pd.read_csv("linkedin_job_postings.csv")

# Emulate a distributed store by cutting the rows into 4 slices
num_chunks = 4
chunk_size = len(df) // num_chunks

# Folder that receives one CSV per slice
os.makedirs("data_chunks", exist_ok=True)

# Write every slice to its own file. The last slice is left open-ended so it
# also takes the remainder rows when the row count is not divisible by num_chunks.
for i in range(num_chunks):
    start = chunk_size * i
    if i == num_chunks - 1:
        end = None
    else:
        end = start + chunk_size
    df.iloc[slice(start, end)].to_csv(f"data_chunks/chunk_{i}.csv", index=False)


## MAPPING

The mapper reads one chunk file and counts the words that appear in its job titles.

In [9]:
def mapper(filename, column="job_title"):
    """
    Reads a CSV chunk file and returns a Counter with word counts from one text column.

    A word is a whitespace-separated, lower-cased token of a non-missing value.
    Punctuation is not stripped, so a standalone "-" is counted as a word.

    Args:
        filename: Path of the CSV chunk to read.
        column: Name of the column to tokenize. Defaults to "job_title".

    Returns:
        collections.Counter mapping each token to the number of times it
        occurs in the chunk.

    Raises:
        ValueError: Raised by pandas when `column` is not present in the file.
    """
    import collections

    # Load only the column being counted and force it to text. Without the
    # explicit dtype, a chunk whose values all look numeric is parsed as
    # numbers and `.lower()` fails on them.
    titles = pd.read_csv(filename, usecols=[column], dtype={column: str})[column]

    local_counter = collections.Counter()
    for title in titles.dropna():
        # Counter.update adds one per token, same as incrementing inside a loop.
        local_counter.update(title.lower().split())

    return local_counter

## REDUCING

The reducer adds the per-chunk word counts together into a single global count.

In [7]:
def reducer(counters):
    """
    Merge partial word counts into one overall tally.

    Takes an iterable of Counters (one per mapped chunk) and returns a new
    Counter in which the counts for each word have been added together.
    The input Counters are left unchanged.
    """
    from collections import Counter

    totals = Counter()
    for partial_counts in counters:
        # update() adds to the counts already stored instead of replacing them
        totals.update(partial_counts)
    return totals


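## MAP AND REDUCE ON CHUNKS

Run the mapper on every chunk file, combine the partial counts with the reducer, and list the ten most frequent words.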

In [10]:
# Map phase: run the mapper once per chunk file and keep each partial count
map_outputs = [
    mapper(f"data_chunks/chunk_{i}.csv")
    for i in range(num_chunks)
]

# Reduce phase: merge the per-chunk counters into one global counter
final_result = reducer(map_outputs)

# Report the ten most frequent words in job titles
print("Top 10 most common words in job titles:")
top_words = final_result.most_common(10)
for word, count in top_words:
    print(f"{word}: {count}")


Top 10 most common words in job titles:
-: 438255
manager: 236887
nurse: 107375
senior: 97811
engineer: 73740
assistant: 71064
rn: 66192
registered: 65121
supervisor: 58468
sales: 55536


In [11]:
# The map and reduce phases already ran in the previous cell and stored their
# output in `final_result`. Reuse it here instead of re-reading every chunk
# file just to print the same ranking again.

# Show top 10 most common words in job titles
print("Top 10 most common words in job titles:")
for word, count in final_result.most_common(10):
    print(f"{word}: {count}")

Top 10 most common words in job titles:
-: 438255
manager: 236887
nurse: 107375
senior: 97811
engineer: 73740
assistant: 71064
rn: 66192
registered: 65121
supervisor: 58468
sales: 55536
