In [None]:
# Mount Google Drive into the Colab runtime; force_remount=True re-mounts even
# if a mount already exists.
from google.colab import drive
drive.mount("/content/drive", force_remount=True)
# "\ " is not a recognised Python escape, so the backslash stays in the string.
# NOTE(review): presumably it is there to escape the space when $FOLDERNAME is
# expanded by the %cd magic below — confirm before changing the literal.
FOLDERNAME = "Colab\ Notebooks/fetch-data"
# Change the working directory so the relative paths used later resolve inside
# the project folder.
%cd drive/MyDrive/$FOLDERNAME

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import re
import json, ast

In [None]:
# Run configuration. Company and role are derived from the underscore-separated
# tokens of the input file name: token 1 is the company, token 2 (up to the
# first dot) is the role.
FILENAME = 'merged_google_swe.csv'
SPLIT_FILE_NAME = FILENAME.split('_')
COMPANY = SPLIT_FILE_NAME[1].capitalize()
ROLE = SPLIT_FILE_NAME[2].partition('.')[0]
# Directory holding the raw CSV for this company.
PATH = f'raw/{COMPANY}/'
# Destination of the cleaned, BERT-ready CSV.
OUTPUT = f'cleaned/BERT/cleaned_bert_{COMPANY.lower()}_{ROLE}.csv'

In [None]:
# Integer class id assigned to each role code; looked up with ROLE to label
# every row of the loaded file.
LABELS = dict(da=0, ds=1, pm=2, swe=3)

In [None]:
def parse_json_list(x):
    """Parse a CSV cell holding a serialized list into a Python list.

    The text is first decoded as JSON; if that fails it is evaluated as a
    Python literal (e.g. single-quoted dicts), which never executes code.

    Parameters
    ----------
    x : object
        Raw cell value. Anything that is not a string (NaN, None, numbers,
        already-parsed containers) is treated as "no data".

    Returns
    -------
    list
        The decoded list, or ``[]`` when the cell is empty, cannot be decoded,
        or decodes to something that is not a list/tuple.
    """
    # Type check first: it covers NaN/None and avoids calling pd.isna on a
    # list, whose array result cannot be used in a boolean test.
    if not isinstance(x, str):
        return []
    s = x.strip()
    if s in ('', '[]'):
        return []
    try:
        parsed = json.loads(s)
    except json.JSONDecodeError:
        try:
            parsed = ast.literal_eval(s)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval can raise any of these on malformed input.
            return []
    # Callers iterate the result as a list of records, so a scalar, dict or
    # string payload is treated as "no data" rather than passed through.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Load the raw CSV, decoding every list-valued column with parse_json_list at
# read time, tag all rows with this file's role label, and keep one row per id.
df = (
    pd.read_csv(
        PATH + FILENAME,
        converters=dict.fromkeys(
            [
                'education',
                'experience',
                'certifications',
                'projects',
                'publications',
                'courses',
            ],
            parse_json_list,
        ),
    )
    .assign(label=LABELS[ROLE])
    .drop_duplicates(subset='id')
)

# Experience (exp)

In [None]:
import json
import pandas as pd

# `df` is the frame loaded earlier; `COMPANY` is the target company string.
cur_comp = COMPANY.lower()

# Normalise the employer name: missing -> '', lower-cased, surrounding
# whitespace removed.
df['current_company_name'] = (
    df['current_company_name']
      .fillna('')
      .astype(str)
      .str.lower()
      .str.strip()
)
# Keep only rows whose employer contains the company keyword. regex=False makes
# this a literal substring test, so a keyword containing regex metacharacters
# (".", "+", "(" ...) cannot over-match or raise.
df = df[
    df['current_company_name'].str.contains(cur_comp, regex=False)
].reset_index(drop=True)

# Decode any experience cell that is still a raw JSON string; cells that are
# not strings are left untouched.
df['experience'] = df['experience'].apply(
    lambda s: json.loads(s) if isinstance(s, str) else s
)

# Flatten function to turn each past‐experience entry into text
def _field_values(record, keys):
    """Return the stripped string form of every truthy `keys` value in `record`."""
    return [str(value).strip() for value in map(record.get, keys) if value]


def flatten_experience(entries, entry_sep=" [EXP] "):
    """Render a list of experience records as one text string.

    Each record contributes one segment built from its top-level fields,
    followed by one segment per nested position. Fields inside a segment are
    joined with " | "; segments are joined with `entry_sep`.

    Parameters
    ----------
    entries : list[dict] | None
        Experience records; ``None`` or an empty list yields ``""``.
    entry_sep : str
        Separator placed between segments.

    Returns
    -------
    str
        The flattened text (empty when nothing is populated).
    """
    sentences = []
    for entry in entries or []:
        # top-level fields
        parts = _field_values(entry, ['title', 'location', 'duration', 'description'])
        if parts:
            sentences.append(" | ".join(parts))
        # nested positions; `or []` also covers a key that is present but
        # null, which a .get() default would not.
        for pos in entry.get('positions') or []:
            pos_parts = _field_values(
                pos,
                ['subtitle', 'meta', 'title', 'start_date', 'end_date', 'description', 'location'],
            )
            if pos_parts:
                sentences.append(" | ".join(pos_parts))
    return entry_sep.join(sentences)

# Filter out any entries that belong to the current company
def filter_past(entries, cur_comp, current_company_name):
    """Return the experience entries that are NOT at the current company.

    Company names are compared after lower-casing and trimming. Any name that
    contains `cur_comp` is collapsed to `cur_comp`, and this is applied to both
    the entry's company and `current_company_name`, so "google cloud" and
    "google" are treated as the same employer.

    Parameters
    ----------
    entries : list[dict] | None
        Experience records; each may carry a 'company' value (missing or null
        is treated as an empty name).
    cur_comp : str
        Lower-case keyword identifying the target company.
    current_company_name : str
        The profile's current employer.

    Returns
    -------
    list[dict]
        Entries whose normalised company differs from the normalised current
        employer, in their original order.
    """
    def _canonical(name):
        # Null-safe normalisation, then collapse variants of the target company.
        norm = str(name or '').lower().strip()
        if cur_comp and cur_comp in norm:
            return cur_comp
        return norm

    cc = _canonical(current_company_name)
    return [
        entry
        for entry in entries or []
        if _canonical(entry.get('company')) != cc
    ]

# Build `exp_text` row by row: drop the entries at the current employer, then
# flatten what is left into a single string.
df['exp_text'] = df.apply(
    lambda profile: flatten_experience(
        filter_past(
            profile['experience'],
            cur_comp,
            profile['current_company_name'],
        )
    ),
    axis=1,
)

# Preview the first rows of the columns involved
print(df.loc[:, ['id', 'current_company_name', 'exp_text']].head())

                  id current_company_name  \
0  robert-b-57b395b8               google   
1      joseph-kready               google   
2       swe-samantha               google   
3         nasamuffin               google   
4       rasya-ramesh               google   

                                            exp_text  
0  Software Engineer | Buffalo/Niagara, New York ...  
1  ANR Application Developer | Little Rock, Arkan...  
2                                                     
3  Maintainer - IPMI Stack | github.com/openbmc [...  
4  Teaching Assistant | Philadelphia, Pennsylvani...  


# Education (edu)

In [None]:
# Decode any education cell that is still a raw JSON string; cells that are
# not strings pass through unchanged.
df['education'] = df['education'].apply(
    lambda cell: cell if not isinstance(cell, str) else json.loads(cell)
)

# Flatten education list-of-dicts
def flatten_education(entries, entry_sep=" [EDU] "):
    """Render a list of education records as one text string.

    For each record the truthy values of title, degree, field, description,
    start_year and end_year are stringified, stripped and joined with " | ".
    Records with no populated field are skipped; the remaining segments are
    joined with `entry_sep`. ``None`` or an empty list yields ``""``.
    """
    fields = ('title', 'degree', 'field', 'description', 'start_year', 'end_year')
    rendered = []
    for record in entries or []:
        values = [str(value).strip() for value in map(record.get, fields) if value]
        if values:
            rendered.append(" | ".join(values))
    return entry_sep.join(rendered)

# Flatten each profile's education records into the `edu_text` column
df['edu_text'] = df['education'].map(flatten_education)

Unnamed: 0,id,edu_text
0,robert-b-57b395b8,Canisius College | Bachelor of Science (B.S.) ...
1,joseph-kready,The University of Texas at Austin | Master of ...
2,swe-samantha,Princeton University | Bachelor of Science in ...
3,nasamuffin,Northeastern University | Bachelor’s Degree | ...
4,rasya-ramesh,University of Pennsylvania | Master of Science...


# Projects (proj)

In [None]:
# Decode any projects cell that is still a raw JSON string; cells that are
# not strings pass through unchanged.
df['projects'] = df['projects'].apply(
    lambda cell: cell if not isinstance(cell, str) else json.loads(cell)
)

# Flatten projects list-of-dicts
def flatten_projects(entries, entry_sep=" [PRO] "):
    """Render a list of project records as one text string.

    Each record becomes a " | "-joined segment of its truthy title,
    start_date, end_date and description values (stringified and stripped).
    Records with nothing populated are dropped; segments are joined with
    `entry_sep`. ``None`` or an empty list yields ``""``.
    """
    def render(project):
        # Stripped text of each populated field, in display order.
        raw_values = (
            project.get(field)
            for field in ('title', 'start_date', 'end_date', 'description')
        )
        return [str(raw).strip() for raw in raw_values if raw]

    rows = (render(project) for project in entries or [])
    return entry_sep.join(" | ".join(row) for row in rows if row)

# Flatten each profile's project records into the `proj_text` column
df['proj_text'] = df['projects'].map(flatten_projects)

''

# Publications (pub)

In [None]:
# Spot-check one publications cell (row position 14)
df['publications'].iat[14]

[]

In [None]:
# Decode any publications cell that is still a raw JSON string; cells that are
# not strings pass through unchanged.
df['publications'] = df['publications'].apply(
    lambda cell: cell if not isinstance(cell, str) else json.loads(cell)
)

def flatten_publications(entries, entry_sep=" [PUB] "):
    """Render a list of publication records as one text string.

    The truthy title, subtitle and description values of each record are
    stringified, stripped and joined with " | "; records with none of them
    are skipped. Segments are joined with `entry_sep`; ``None`` or an empty
    list yields ``""``.
    """
    wanted = ['title', 'subtitle', 'description']
    lines = []
    for pub in (entries if entries else []):
        # filter(None, ...) keeps only truthy field values.
        present = filter(None, (pub.get(field) for field in wanted))
        cleaned = [str(item).strip() for item in present]
        if len(cleaned) > 0:
            lines.append(" | ".join(cleaned))
    return entry_sep.join(lines)

# Flatten each profile's publication records into the `pub_text` column
df['pub_text'] = df['publications'].map(flatten_publications)

''

# Certifications

In [None]:
# Decode any certifications cell that is still a raw JSON string; cells that
# are not strings pass through unchanged.
df['certifications'] = df['certifications'].apply(
    lambda cell: cell if not isinstance(cell, str) else json.loads(cell)
)

def flatten_certifications(entries, entry_sep=" [CER] "):
    """Render a list of certification records as one text string.

    Each record contributes its truthy subtitle and title (in that order),
    stringified, stripped and joined with " | ". Records with neither field
    are dropped; segments are joined with `entry_sep`. ``None`` or an empty
    list yields ``""``.
    """
    keys = ('subtitle', 'title')
    per_cert = (
        [str(value).strip() for value in map(cert.get, keys) if value]
        for cert in entries or []
    )
    return entry_sep.join(" | ".join(chunk) for chunk in per_cert if chunk)

# Flatten each profile's certification records into the `cert_text` column
df['cert_text'] = df['certifications'].map(flatten_certifications)

''

# Courses

In [None]:
# Decode any courses cell that is still a raw JSON string; cells that are not
# strings pass through unchanged.
df['courses'] = df['courses'].apply(
    lambda cell: cell if not isinstance(cell, str) else json.loads(cell)
)

def flatten_courses(entries, entry_sep=" [COU] "):
    """Render a list of course records as one text string.

    Only the 'title' field is used: every truthy title is stringified and
    stripped, and the titles are joined with `entry_sep`. Records without a
    title are skipped; ``None`` or an empty list yields ``""``.
    """
    titles = []
    for course in entries or []:
        title = course.get('title')
        if not title:
            continue
        titles.append(str(title).strip())
    return entry_sep.join(titles)

# Flatten each profile's course records into the `course_text` column
df['course_text'] = df['courses'].map(flatten_courses)

''

In [None]:
SEP = "[SEP]"

# Assemble the model input: every section is written as "<Label>: <content>"
# and terminated by SEP. All seven labels now carry the ": " delimiter;
# previously Publications/Certifications/Courses lacked it, so the label ran
# straight into the first word of the content.
df['text'] = (
    "About: " + df['about'].fillna("") + SEP +
    "Experience: " + df['exp_text'].fillna("") + SEP +
    "Education: " + df['edu_text'].fillna("") + SEP +
    "Projects: " + df['proj_text'].fillna("") + SEP +
    "Publications: " + df['pub_text'].fillna("") + SEP +
    "Certifications: " + df['cert_text'].fillna("") + SEP +
    "Courses: " + df['course_text'].fillna("") + SEP
)
# NOTE(review): `texts` is not used in this cell — presumably consumed by a
# later cell; confirm before removing.
texts = df['text'].fillna("").tolist()
new_df = df[['text', 'label']]
# Reuse the OUTPUT path from the configuration cell instead of rebuilding it
# from FILENAME, so the destination is defined in exactly one place and always
# keeps its .csv extension.
output = OUTPUT
new_df.to_csv(output)

Unnamed: 0,text,label
0,About: Hello! 👋 I'm a software engineer that's...,3
1,About: Software Engineer at Google focused on ...,3
2,About: [SEP]Experience: [SEP]Education: Prince...,3
3,About: Driven to self-improvement and personal...,3
4,About: 🎓 Recent Computer & Information Science...,3
...,...,...
1259,About: An adaptive learner passionate about pr...,3
1260,About: Software developer. I am a senior major...,3
1261,About: [SEP]Experience: Software Engineer | Se...,3
1262,About: Experienced technical leader with a dem...,3
