In [0]:
from pyspark.sql.types import *
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
import pandas as pd
from pyspark.ml.feature import CountVectorizer, Tokenizer, StringIndexer, VectorAssembler, OneHotEncoder, Word2Vec, HashingTF, IndexToString
from pyspark.ml.linalg import SparseVector, Vectors, DenseVector
import numpy as np
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql.window import Window
import math

# Obtain the SparkSession used by every cell below (getOrCreate reuses an existing session when there is one).
spark = SparkSession.builder.getOrCreate()

In [0]:
# Load the two parquet inputs used by this notebook.
# NOTE(review): both paths are hard-coded; consider hoisting them into a configuration cell.
profiles = spark.read.parquet('/dbfs/linkedin_people_train_data')
companies = spark.read.parquet('/dbfs/linkedin_train_data')

#### Job Titles and Locations

In [0]:
# Flatten the nested fields needed for job analysis:
#   company_name <- current_company.name
#   job_title    <- title of the first entry in the experience array
current_company_name = f.col('current_company').getField('name')
first_experience_title = f.col('experience')[0].getField('title')

jobs = profiles.select(
    'name',
    'id',
    'city',
    'country_code',
    current_company_name.alias('company_name'),
    first_experience_title.alias('job_title'),
    'position',
)

In [0]:
# Preview the selected job columns.
# NOTE(review): confirm what the positional argument 5 does here; if a 5-row preview is
# intended, `jobs.limit(5).display()` states it explicitly.
jobs.display(5)

#### Clustering job titles into meta job titles

In [0]:
# Seed labels for the meta job titles. Each label is embedded with the same
# Word2Vec model as the job titles and then serves as a fixed "centroid".
# NOTE(review): Tokenizer lowercases its input, so a label is looked up as e.g.
# 'datascience'; a label whose token never occurs in the training titles has no
# learned vector — confirm every label is actually present in the vocabulary.
centroids_data = [
    ('Leadership',), ('Product',), ('Engineering',), ('DataScience',), ('Operations',),
    ('Marketing',), ('Sales',), ('Design',), ('Support',), ('Finance',),
    ('Resources',), ('Research',), ('Healthcare',), ('Education',), ('Security',),
    ('Logistics',), ('Legal',), ('Quality',), ('Management',), ('Content',)
]

centroids_df = spark.createDataFrame(centroids_data, ['processed_title'])

# Preprocess job titles: take `job_title`, fall back to `position` when it is
# null, lowercase the result, and drop rows where both are null.
job_titles_df = jobs.select(
    f.lower(f.coalesce(f.col('job_title'), f.col('position'))).alias('processed_title')
).dropna()

tokenizer = Tokenizer(inputCol="processed_title", outputCol="tokened_title")
w2v = Word2Vec(inputCol="tokened_title", outputCol="vector", vectorSize=200, minCount=1)

# Build the pipeline
pipeline = Pipeline(stages=[tokenizer, w2v])

# Train the pipeline model on every profile row (duplicates included, so the
# training corpus is unchanged).
model_vectorize = pipeline.fit(job_titles_df)

# Create embeddings for job titles and centroids
jobs_with_vectors = model_vectorize.transform(job_titles_df)
centroids_with_vectors = model_vectorize.transform(centroids_df)

# Keep only the columns the similarity step needs. Both frames carry a
# `tokened_title` column, which would otherwise be duplicated (and ambiguous)
# after the join. Titles are de-duplicated because the downstream ranking keeps
# a single row per job title anyway; identical titles get identical vectors, so
# pairing each duplicate with all 20 centroids is wasted work.
jobs_temp = (
    jobs_with_vectors
    .select(
        f.col('processed_title').alias('job_title'),
        f.col('vector').alias('job_vector'),
    )
    .dropDuplicates(['job_title'])
)

centroids_temp = centroids_with_vectors.select(
    f.col('processed_title').alias('meta_job'),
    f.col('vector').alias('centroid_vector'),
)

# Every job title is paired with every centroid; crossJoin makes the Cartesian
# product explicit instead of relying on a condition-less join().
joined = jobs_temp.crossJoin(centroids_temp)

In [0]:
# Define a function to calculate cosine similarity
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Both arguments must expose a ``dot`` method.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)`` as a float, or ``None`` when either
        argument is ``None`` or either vector has zero magnitude.
    """
    if v1 is None or v2 is None:
        return None

    numerator = float(v1.dot(v2))
    # Euclidean length of each vector, in argument order.
    magnitudes = [math.sqrt(vec.dot(vec)) for vec in (v1, v2)]

    # A zero-length vector has no direction, so the similarity is undefined.
    if any(length == 0 for length in magnitudes):
        return None

    return numerator / (magnitudes[0] * magnitudes[1])

# Register the function as a UDF.
# The return type must be DoubleType: cosine_similarity returns a float, and the
# score is later sorted in descending order to pick the closest centroid. With a
# StringType declaration the column is not numeric, so that sort cannot rank
# the scores by value.
cosine_similarity_udf = f.udf(cosine_similarity, DoubleType())

# Add a new column holding the similarity of each (job, centroid) pair
joined = joined.withColumn(
    "cosine_similarity",
    cosine_similarity_udf(f.col("job_vector"), f.col("centroid_vector"))
)

In [0]:
# Show every (job title, centroid) pair together with its cosine similarity
joined.display()

In [0]:
# Within each job title, order the candidate centroids from most to least similar.
most_similar_first = f.col("cosine_similarity").desc()
window_spec = Window.partitionBy("job_title").orderBy(most_similar_first)

# Number the candidates 1..n per job title following that order.
ranked_df = joined.withColumn("rank", f.row_number().over(window_spec))

# Rank 1 is the closest centroid for the title.
closest_centroids = ranked_df.where(f.col("rank") == 1)

# Keep the title, its assigned meta job and the winning score.
result_df = closest_centroids.select(
    "job_title",
    f.col("meta_job").alias("closest_centroid"),
    "cosine_similarity",
)

In [0]:
# Display each job title with its closest centroid and the corresponding similarity
result_df.display()

In [0]:
# Split `city` on ", " and keep the second piece as `state`.
city_parts = f.split(f.col("city"), ", ")
profiles_with_state = profiles.withColumn("state", city_parts.getItem(1))

In [0]:
# List the distinct, non-null values of `state`
states_df = profiles_with_state.select("state").distinct().na.drop()
states_df.display()

In [0]:
# Re-derive `state` from `city` (same split as in the earlier cell) and expose the
# frame under a second name.
state_column = f.split(f.col('city'), ', ')[1]
profiles_with_state = profiles.withColumn('state', state_column)
profiles_with_count = profiles_with_state

In [0]:
# Persist the job-title classification, replacing any previous output at this path.
# NOTE(review): the destination is a hard-coded, user-specific workspace path.
classified_jobs_path = "/Workspace/Users/lihi.kaspi@campus.technion.ac.il/classified_jobs.parquet"
result_df.write.mode("overwrite").parquet(classified_jobs_path)

## Companies

In [0]:
# Lookup table from a fine-grained industry name (key) to one of 12 coarse
# "meta industry" labels (value): Manufacturing, Financial and Investment,
# Services, Miscellaneous, Healthcare and Medical, Government and Public Policy,
# Technology, Retail and Consumer Goods, Education and Training,
# Transportation and Logistics, Real Estate and Construction,
# Media and Entertainment.
meta_industries_12 = {
    'Furniture and Home Furnishings Manufacturing': 'Manufacturing',
    'Investment Banking': 'Financial and Investment',
    'Architecture and Planning': 'Services',
    'Wholesale': 'Services',
    'Travel Arrangements': 'Services',
    'Ranching': 'Miscellaneous',
    'Hospitals and Health Care': 'Healthcare and Medical',
    'Book and Periodical Publishing': 'Services',
    'Printing Services': 'Services',
    'Professional Training and Coaching': 'Services',
    'Computers and Electronics Manufacturing': 'Manufacturing',
    'Shipbuilding': 'Manufacturing',
    'Public Policy Offices': 'Government and Public Policy',
    'Software Development': 'Technology',
    'Outsourcing and Offshoring Consulting': 'Services',
    'Retail Groceries': 'Retail and Consumer Goods',
    'Education Administration Programs': 'Education and Training',
    'Plastics Manufacturing': 'Manufacturing',
    'Renewable Energy Semiconductor Manufacturing': 'Manufacturing',
    'Computer Networking Products': 'Technology',
    'Events Services': 'Services',
    'Information Services': 'Services',
    'Food and Beverage Services': 'Services',
    'Semiconductor Manufacturing': 'Manufacturing',
    'Business Consulting and Services': 'Services',
    'Insurance': 'Services',
    'Financial Services': 'Services',
    'Wireless Services': 'Services',
    'Computer Hardware Manufacturing': 'Technology',
    'Public Safety': 'Services',
    'Maritime Transportation': 'Transportation and Logistics',
    'Tobacco Manufacturing': 'Manufacturing',
    'Writing and Editing': 'Services',
    'Veterinary Services': 'Services',
    'Staffing and Recruiting': 'Services',
    'Accounting': 'Services',
    'International Affairs': 'Government and Public Policy',
    'Spectator Sports': 'Miscellaneous',
    'Glass, Ceramics and Concrete Manufacturing': 'Manufacturing',
    'Chemical Manufacturing': 'Manufacturing',
    'Mining': 'Miscellaneous',
    'E-Learning Providers': 'Technology',
    'Security and Investigations': 'Services',
    'Translation and Localization': 'Services',
    'Automation Machinery Manufacturing': 'Technology',
    'Computer and Network Security': 'Technology',
    'Political Organizations': 'Government and Public Policy',
    'Environmental Services': 'Government and Public Policy',
    'Oil and Gas': 'Miscellaneous',
    'Real Estate': 'Real Estate and Construction',
    'Think Tanks': 'Government and Public Policy',
    'Executive Offices': 'Miscellaneous',
    'Law Practice': 'Services',
    'Nanotechnology Research': 'Miscellaneous',
    'International Trade and Development': 'Government and Public Policy',
    'Personal Care Product Manufacturing': 'Manufacturing',
    'Philanthropic Fundraising Services': 'Services',
    'Entertainment Providers': 'Media and Entertainment',
    'Market Research': 'Media and Entertainment',
    'Movies, Videos, and Sound': 'Media and Entertainment',
    'Sporting Goods Manufacturing': 'Manufacturing',
    'Graphic Design': 'Services',
    'Technology, Information and Internet': 'Technology',
    'IT Services and IT Consulting': 'Technology',
    'Retail Office Equipment': 'Retail and Consumer Goods',
    'Wholesale Import and Export': 'Services',
    'Capital Markets': 'Financial and Investment',
    'Law Enforcement': 'Services',
    'Freight and Package Transportation': 'Transportation and Logistics',
    'Industrial Machinery Manufacturing': 'Manufacturing',
    'Non-profit Organizations': 'Miscellaneous',
    'Retail Art Supplies': 'Retail and Consumer Goods',
    'Animation and Post-production': 'Media and Entertainment',
    'Transportation, Logistics, Supply Chain and Storage': 'Transportation and Logistics',
    'Aviation and Aerospace Component Manufacturing': 'Transportation and Logistics',
    'Fundraising': 'Financial and Investment',
    'Railroad Equipment Manufacturing': 'Transportation and Logistics',
    'Construction': 'Real Estate and Construction',
    'Investment Management': 'Financial and Investment',
    'Utilities': 'Miscellaneous',
    'Retail Luxury Goods and Jewelry': 'Retail and Consumer Goods',
    'Warehousing and Storage': 'Transportation and Logistics',
    'Media Production': 'Media and Entertainment',
    'Gambling Facilities and Casinos': 'Media and Entertainment',
    'Defense and Space Manufacturing': 'Manufacturing',
    'Facilities Services': 'Services',
    'Government Relations Services': 'Government and Public Policy',
    'Advertising Services': 'Media and Entertainment',
    'Paper and Forest Product Manufacturing': 'Manufacturing',
    'Packaging and Containers Manufacturing': 'Manufacturing',
    'Telecommunications': 'Technology',
    'Medical Equipment Manufacturing': 'Healthcare and Medical',
    'Beverage Manufacturing': 'Manufacturing',
    'Restaurants': 'Retail and Consumer Goods',
    'Leasing Non-residential Real Estate': 'Real Estate and Construction',
    'Newspaper Publishing': 'Media and Entertainment',
    'Armed Forces': 'Miscellaneous',
    'Appliances, Electrical, and Electronics Manufacturing': 'Manufacturing',
    'Hospitality': 'Services',
    'Pharmaceutical Manufacturing': 'Healthcare and Medical',
    'Research Services': 'Services',
    'Retail Apparel and Fashion': 'Retail and Consumer Goods',
    'Photography': 'Media and Entertainment',
    'Wellness and Fitness Services': 'Services',
    'Truck Transportation': 'Transportation and Logistics',
    'Consumer Services': 'Services',
    'Wholesale Building Materials': 'Services',
    'Human Resources Services': 'Services',
    'Airlines and Aviation': 'Transportation and Logistics',
    'Machinery Manufacturing': 'Manufacturing',
    'Individual and Family Services': 'Services',
    'Motor Vehicle Manufacturing': 'Manufacturing',
    'Performing Arts': 'Media and Entertainment',
    'Museums, Historical Sites, and Zoos': 'Media and Entertainment',
    'Broadcast Media Production and Distribution': 'Media and Entertainment',
    'Banking': 'Financial and Investment',
    'Recreational Facilities': 'Miscellaneous',
    'Government Administration': 'Government and Public Policy',
    'Public Relations and Communications Services': 'Media and Entertainment',
    'Fisheries': 'Miscellaneous',
    'Medical Practices': 'Healthcare and Medical',
    'Religious Institutions': 'Miscellaneous',
    'Online Audio and Video Media': 'Media and Entertainment',
    'Artists and Writers': 'Miscellaneous',
    'Biotechnology Research': 'Healthcare and Medical',
    'Legal Services': 'Services',
    'Retail': 'Retail and Consumer Goods',
    'Civil Engineering': 'Services',
    'Libraries': 'Miscellaneous',
    'Alternative Dispute Resolution': 'Miscellaneous',
    'Manufacturing': 'Miscellaneous',
    'Design Services': 'Services',
    'Dairy Product Manufacturing': 'Manufacturing',
    'Higher Education': 'Education and Training',
    'Civic and Social Organizations': 'Miscellaneous',
    'Textile Manufacturing': 'Manufacturing',
    'Venture Capital and Private Equity Principals': 'Financial and Investment',
    'Mental Health Care': 'Healthcare and Medical',
    'Musicians': 'Media and Entertainment',
    'Farming': 'Miscellaneous',
    'Computer Games': 'Media and Entertainment',
    'Strategic Management Services': 'Services',
    'Food and Beverage Manufacturing': 'Manufacturing',
    'Primary and Secondary Education': 'Education and Training',
    'Alternative Medicine': 'Healthcare and Medical',
    'Legislative Offices': 'Services',
    'Administration of Justice': 'Services',
    'Mobile Gaming Apps': 'Media and Entertainment'
}

In [0]:
def _to_meta_industry(industry):
    """Map a fine-grained industry name to its meta industry label.

    Returns None (a null in the resulting column) for an industry that is not a
    key of ``meta_industries_12``. Indexing the dict directly would raise
    KeyError inside the Spark worker and abort the whole job on the first
    unmapped value.
    """
    return meta_industries_12.get(industry)


# String-returning UDF wrapping the lookup above.
meta_industry = f.udf(_to_meta_industry, StringType())

# Only companies with a known industry can be classified.
companies = companies.filter(companies.industries.isNotNull())
companies = companies.withColumn('meta_industry', meta_industry(f.col('industries')))

In [0]:
# Print the number of rows currently in `companies`
print(companies.count())

In [0]:
# Show each original industry next to the meta industry assigned to it
companies.select("industries", "meta_industry").display()