In [4]:
import gc
# Clear all variables to free up memory
for name in dir():
    if not name.startswith('_') and name not in ['gc']:
        del globals()[name]

gc.collect()
print("All variables cleared and garbage collected")
import pandas as pd
import os


os.chdir('/shared/share_scp/coresignal')

All variables cleared and garbage collected


## 1. Read 'raw' files 
These are the files created from Python CSVs into Pandas

In [5]:
universities_adopted_facebook = pd.read_stata('universities_geo_for_jorge.dta')
universities_adopted_facebook.columns
print("Number of duplicate university IDs:", universities_adopted_facebook.unitid.duplicated().sum())

Number of duplicate university IDs: 0


In [6]:
import glob
education_files = sorted(glob.glob('coresignal_member_education_*_linkedin_matches_processed*.pkl'))
print(f'Found {len(education_files)} education files')
coresignal_member_education_all = pd.concat(
    [pd.read_pickle(f) for f in education_files],
    ignore_index=True
)

coresignal_member_education_all['member_id'] = coresignal_member_education_all['member_id'].astype(int)
coresignal_member_education_all = coresignal_member_education_all.drop_duplicates()
member_ids = coresignal_member_education_all['member_id'].astype(int).unique()
print(f"Number of unique member IDs: {len(member_ids)}")

Found 203 education files
Number of unique member IDs: 9509389


In [7]:
#Scratch
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(coresignal_member_education_all.year_from.value_counts())

year_from
2008    3215179
2007    2846673
2006    2608104
2005    2413066
2004    2247552
2003    2067014
2002    1932208
2001    1737022
2000    1583153
1999     967064
1998     698163
1997     196670
1996      71260
1995      35371
1994      20999
1989      14311
1993      13924
1992      12002
1990       9109
1988       8979
1991       8787
1987       4365
1986       3253
1985       2594
1984       2049
1980       2041
1983       1877
1982       1821
1979       1697
1978       1612
1981       1601
1977        881
1976        811
1975        756
1974        688
1973        661
1972        600
1970        504
1971        445
1969        362
1968        282
1967        252
1966        165
1965        161
1964        137
1900        134
1960        101
1962         88
1963         80
1961         72
1959         43
1958         40
1952         36
1950         27
1956         26
1953         18
1954         18
1955         17
1949         13
1951         13
1901         12
1904         1

In [None]:
total_rows = coresignal_member_education_all.shape[0]
coresignal_member_education_all = coresignal_member_education_all[~coresignal_member_education_all['school_url'].str.endswith('linkedin.com/edu/school')]
print(f"Total rows before filtering: {total_rows:,}")
print(f"Total rows after filtering:  {coresignal_member_education_all.shape[0]:,}")

In [None]:
import glob
from concurrent.futures import ThreadPoolExecutor, as_completed
import multiprocessing
import time

def read_pickle_file(filepath):
    return pd.read_pickle(filepath)

# Find all processed pickle files matching the pattern
os.chdir('/shared/share_scp/coresignal') #make sure it is in the right directory

processed_files = sorted(glob.glob('coresignal_member_experience_*START*.pkl'))
print(f'Found {len(processed_files):,} files with experience.')

# Use parallel processing with manual progress tracking
max_workers = min(len(processed_files), multiprocessing.cpu_count())
print(f'Using {max_workers} workers for parallel processing.')

dataframes = []
start_time = time.time()

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks
    future_to_file = {executor.submit(read_pickle_file, file): file 
                     for file in processed_files}
    
    # Process completed tasks and show progress
    for i, future in enumerate(as_completed(future_to_file), 1):
        dataframes.append(future.result())
        
        # Print progress every 10 files or at the end
        if i % 10 == 0 or i == len(processed_files):
            elapsed = time.time() - start_time
            rate = i / elapsed if elapsed > 0 else 0
            remaining = len(processed_files) - i
            eta = remaining / rate if rate > 0 else 0
            
            print(f"Progress: {i:,}/{len(processed_files):,} files ({i/len(processed_files)*100:.1f}%) | "
                  f"Rate: {rate:.1f} files/sec | "
                  f"Time Left: {eta:.0f}s | "
                  f"Elapsed: {elapsed:.0f}s")

print("Concatenating dataframes...")
all_experience = pd.concat(dataframes, ignore_index=True)

print(f"Total rows in all experience data: {len(all_experience):,}")


In [None]:
all_experience.columns

In [None]:
import time

start_time = time.time()
print(f"Total rows in all experience data: {len(all_experience):,}")
print("Removing duplicates...")
key_columns = ['member_id', 'company_name', 'title', 'date_from', 'date_to']
all_experience = all_experience.drop_duplicates(subset=key_columns)
print(f"Total rows in all experience data (after removing duplicates): {len(all_experience):,}")
elapsed_time = time.time() - start_time
minutes = int(elapsed_time // 60)
seconds = int(elapsed_time % 60)
print(f"Time taken to remove duplicates: {minutes}m {seconds}s")

In [None]:
# # Apply the cleaning function to create a new 'major' column
# print("Applying major extraction to the full dataset...")
# coresignal_member_education_all['major_cleaned'] = coresignal_member_education_all['subtitle'].apply(extract_major)

# # Check the results
# print(f"\nOriginal subtitle vs cleaned major (sample of 20):")
# sample_df = coresignal_member_education_all[['subtitle', 'major_cleaned']].dropna().sample(20, random_state=42)
# for idx, row in sample_df.iterrows():
#     print(f"'{row['subtitle']}' -> '{row['major_cleaned']}'")

# # Check most common majors after cleaning
# print(f"\nTop 20 most common cleaned majors:")
# major_counts = coresignal_member_education_all['major_cleaned'].value_counts()
# print(major_counts.head(20))

# print(f"\nTotal non-null majors after cleaning: {coresignal_member_education_all['major_cleaned'].notna().sum()}")
# print(f"Percentage of records with valid major: {coresignal_member_education_all['major_cleaned'].notna().mean()*100:.1f}%")

In [None]:
# Filter for bachelor's degrees or undergrad
coresignal_member_education = coresignal_member_education_all[
    coresignal_member_education_all['subtitle'].str.lower().str.contains(r'bachelor\'?s?|undergrad|\sb\.a\.|b\.s\.', na=False)
]
print(f"Filtered from {len(coresignal_member_education_all):,} to {len(coresignal_member_education):,} records for bachelor's or undergrad degrees.")

print(f"Deleting coresignal_member_education_all object")

del coresignal_member_education_all
gc.collect()

In [None]:
total_rows_all_experience = all_experience.shape[0]
all_experience = all_experience[all_experience['member_id'].isin(coresignal_member_education['member_id'])]
print(f"Filtered all_experience from {total_rows_all_experience:,} to {all_experience.shape[0]:,} rows based on member IDs in coresignal_member_education.")

In [None]:
coresignal_member_education.to_pickle('coresignal_member_education_AnalysisFile.pkl')
all_experience.to_pickle('all_experience_AnalysisFile.pkl')


## 2. Setup pandas files for research
Data loading from base files is done and now we add the things we are going to study

In [8]:
# Loads data from pickle files instead of re-creating it. 
# Uncomment as necessary

coresignal_member_education = pd.read_pickle('coresignal_member_education_AnalysisFile.pkl')
all_experience = pd.read_pickle('all_experience_AnalysisFile.pkl')


In [20]:
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(coresignal_member_education.year_to.value_counts().sort_index())

#Scratch

year_to
2002     478183
2003     539901
2004     607122
2005     674310
2006     712568
2007     773059
2008     851631
2009     927685
2010    1029517
2011     939599
2012     902845
2013     218357
2014      60181
2015      24304
2016      11856
2017       5886
2018       3459
2019       2247
2020       1898
2021        905
2022        489
2023        172
2024        152
2025         85
2026         39
2027         46
2028         27
2029          5
2030          4
2032          2
Name: count, dtype: Int64

In [116]:
coresignal_member_education.title.str.contains("name")

2           False
7           False
8           False
10          False
12          False
            ...  
23048459    False
23048460    False
23048464    False
23048471    False
23048488    False
Name: title, Length: 8766534, dtype: object

In [117]:
all_experience['job_from'] = all_experience['date_from'].str.extract(r'(\d{4})').astype(float)
all_experience['is_founder'] = all_experience['title'].str.contains('founder', case=False, na=False) | all_experience['title'].str.contains('owner', case=False, na=False)



### 2.1 Merge the education file and the universities file

In [118]:
graduates_with_education_job_level = pd.merge(all_experience, coresignal_member_education, on='member_id',suffixes=('_experience', '_education'))

for i in range(11):
    graduates_with_education_job_level[f'founder_in_year_{i}'] = (graduates_with_education_job_level['job_from'] == (graduates_with_education_job_level['year_to'] +i)) & graduates_with_education_job_level['is_founder']
    graduates_with_education_job_level[f'founder_in_year_{i}'] = graduates_with_education_job_level[f'founder_in_year_{i}'].fillna(False).astype(int)
    
graduates_with_education_job_level['worked_as_engineer'] = graduates_with_education_job_level['title_experience'].str.contains('engineer', case=False, na=False)
graduates_with_education_job_level['worked_in_sales'] = graduates_with_education_job_level['title_experience'].str.contains('sales', case=False, na=False)
graduates_with_education_job_level.shape

# 20_589_319 rows

(73436866, 39)

### 2.2 Add major to the graduate file

Create the graduate file, one line per graduate.

In [119]:
# Create binary indicators for founding within each year
for i in range(11):
    graduates_with_education_job_level[f'founded_within_{i}_years'] = (
        (graduates_with_education_job_level['job_from'] >= graduates_with_education_job_level['year_to']) & 
        (graduates_with_education_job_level['job_from'] <= graduates_with_education_job_level['year_to'] + i) & 
        graduates_with_education_job_level['is_founder']
    ).fillna(False).astype(int)

# Aggregate the data
graduates_person_level = graduates_with_education_job_level.groupby([
    'member_id', 'year_to', 'year_from', 'title_education', 'subtitle', 'unitid'
]).agg({
    **{f'founded_within_{i}_years': 'max' for i in range(11)},
    'job_from': 'count'
}).reset_index()

# Rename columns for clarity
graduates_person_level.rename(columns={
    'year_from': 'year_start_college',
    'year_to': 'year_end_college',
    'member_id': 'linkedin_member_id',
    'title_education': 'university_title',
    'subtitle': 'university_major_raw',
    'job_from': 'total_jobs'
}, inplace=True)

Keep only those graduates that obtained a bachelors based on the subtitle of the education row (this is a second cleaning)

In [120]:
# Remove rows where university_major_raw contains only generic degree terms without specific major
generic_patterns = [
    r'^bachelor\'?s?\s*degree$',
    r'^b\.?s\.?$',
    r'^b\.?a\.?$', 
    r'^bachelor\'?s?$',
    r'^degree$',
    r'^undergraduate$',
    r'^bachelor of science(\s*\(b\.?s\.?\))?$',
    r'^bachelor of arts(\s*\(b\.?a\.?\))?$',
    r'^bachelors$'
]

# Create a pattern that matches any of the generic patterns (case insensitive)
generic_pattern = '|'.join([f'({pattern})' for pattern in generic_patterns])

# Count rows before filtering
rows_before = len(graduates_person_level)

# Filter out rows with generic degree descriptions
graduates_person_level = graduates_person_level[
    ~graduates_person_level['university_major_raw'].str.lower().str.strip().str.match(generic_pattern, na=False)
]




rows_after = len(graduates_person_level)
print(f"Removed {rows_before - rows_after:,} rows with generic degree descriptions")
print(f"Remaining rows: {rows_after:,}")

Removed 122,916 rows with generic degree descriptions
Remaining rows: 4,323,188


In [121]:
# Clean university_major_raw by removing generic degree prefixes
import re

# Patterns to remove from the beginning of university_major_raw text
patterns_to_remove = [
    r'^bachelor of (applied\s+)?science(\s*\(b\.?s\.?\))?\s*,?\s*',
    r'^bachelor of arts(\s*\(b\.?a\.?\))?\s*,?\s*',
    r'^bachelor\'?s? degree\w?,']

# Combine all patterns
combined_pattern = '|'.join(patterns_to_remove)


# Apply cleaning (case insensitive)
graduates_person_level['university_major_clean'] = graduates_person_level['university_major_raw'].str.replace(
    combined_pattern, '', case=False, regex=True
).str.strip()

# Remove entries that are just short parenthetical notes (e.g., "(BA)", "(BS)")
graduates_person_level = graduates_person_level[~graduates_person_level['university_major_clean'].str.contains(r'^\(.{0:6}\)$')]
print(f"\nAfter cleaning - sample of university_major_raw:")



After cleaning - sample of university_major_raw:


In [122]:

graduates_person_level[['university_major_raw','university_major_clean']].sample(100).head(50)

Unnamed: 0,university_major_raw,university_major_clean
4042152,"Bachelor of Science - BS, Management Informati...","- BS, Management Information Systems, General"
2246918,"Bachelor's degree, Finance, General","Finance, General"
176232,"Bachelor of Science (BS), Business/Commerce, G...","Business/Commerce, General, Senior"
2780686,"Bachelors, History","Bachelors, History"
2311518,"Bachelor of Arts (BA), Psychology",Psychology
3848807,"Bachelor of Science - BS, Biological Sciences,...","- BS, Biological Sciences, Molecular & Cellula..."
2850613,"Bachelor of Arts (B.A.), History",History
3577044,"Bachelor's degree, Business Administration and...","Business Administration and Management, Genera..."
4120030,"Bachelor's degree, Business Administration and...","Business Administration and Management, General"
1120162,"Bachelor's Degree, Business Administration and...","Business Administration and Management, General"


### 2.3 Define the major categorization  keywords

In [123]:
x =  ['asian', 'hispanic', 'african','latin american','gender','feminist','asian american', 'african american','frech','russian','middle eastern','european','caribbean','women\'s','chicano','jewish']
studies_groups_social_science  = [g + ' studies' for g in x]

majors_categories = {
    "Engineering or Computer": {"keywords": ['engineering', 'computer', 'software', 'electronic', 'information systems', 'information technology', 'informatics', 'robotics', 'machine learning', 'artificial intelligence', 'cybersecurity',
                            'architecture', 'urban planning'],
                               "variable_name":"engineering_or_computer"},
    "Natural Science": {"keywords": ['biology', 'biological', 'chemistry', 'physics', 'environmental', 'geology', 'earth', 'astronomy', 'astrophysics', 'meteorology', 'biotechnology', 'biochemistry', 'biotech', 'biochem', 'neuroscience', 'marine', 'oceanography', 'ecology', 'genetics'],
                       "variable_name":"natural_science"},
    "Math": {"keywords": ['math', 'mathematics', 'statistics', 'statistical', 'stats', 'data science', 'analytics'],
             "variable_name":"math"},
    "Education": {"keywords": ['education', 'teacher', 'teaching', 'instructional', 'curriculum', 'pedagogy', 'educational','speech therapy'],
                  "variable_name":"education"},
    "Clinical Work": {"keywords": ['social work', 'pre-med', 'pharmacy', 'nursing', 'health', 'mental', 'therapy', 'clinical', 'counseling'],
                      "variable_name":"clinical_work"},
    "Law / Climinology": {"keywords": ['law', 'legal', 'criminology', 'criminal', 'justice', 'landscape'],
                          "variable_name":"law_climinology"},
    "Economics and Finance": {"keywords": ['economics', 'econ', 'finance', 'financial', 'banking', 'investment', 'econometrics'],
                              "variable_name":"economics_and_finance"},
    "Business (not Economics / Finance)": {"keywords": ['public relations','business', 'management', 'accounting', 'marketing', 'public relations', 'administration', 'advertising', 'human resources', 'operations', 'supply chain', 'organizational behavior'],
                                           "variable_name":"business_not_economics_finance"},
    "Social Science (not Economics)": {"keywords": (['social science', 'history', 'sociology', 'anthropology', 'international relations', 'political science', 'government',
                                                   'policy',  'ethnic', 'cultural', 'religion', 'philosophy','liberal art'] + 
                                studies_groups_social_science)
                                ,"variable_name":"social_science_not_economics"}
                               ,
    "Arts": {"keywords": ['fine art', 'design', 'graphic', 'music', 'theater', 'film', 'cinema', 'photography', 'fashion', 'visual', 'dance', 'performing'],
              "variable_name":"arts"},
    "Communications": {"keywords": ['communication', 'communications', 'media', 'journalism', 'broadcasting'],
                      "variable_name":"communications"},
    "English": {"keywords": ['english', 'literature', 'writing'],
                "variable_name":"english"},
    "Psychology": {"keywords": ['psychology'],
                   "variable_name":"psychology"}
}





In [124]:
subtitle =graduates_person_level['university_major_clean'].dropna().astype(str).sample(100, random_state=42).tolist()[1]

print(f"subtitle: {subtitle}")
subtitle_lower = subtitle.lower()
for major, keywords in majors_categories.items():
    for keyword in keywords:
        if keyword in subtitle_lower:
            print(f"major: {major}")


subtitle: Psychology


Run algorithm to categorize majors based on keywords, takes 10 mins or so

In [125]:
from tqdm import tqdm
tqdm.pandas()

# Assign major group to each observation in coresignal_member_education
def assign_major(subtitle, majors_categories):
    if pd.isna(subtitle):
        return None

    subtitle_lower = subtitle.lower()
    for major in majors_categories.keys():
        keywords = majors_categories[major]["keywords"]
        for keyword in keywords:
            if keyword in subtitle_lower:
                return (major, "major_" + majors_categories[major]["variable_name"])
    return ('Other', 'major_other')

print("Applying major assignment to the dataset...")

# Apply the assign_major function and extract results
major_results = graduates_person_level['university_major_clean'].progress_apply(
    lambda x: assign_major(x, majors_categories)
)

# Extract the major categories and variable names
major_categories = [result[0] if result else 'Other' for result in major_results]
major_variables = [result[1] if result else 'major_other' for result in major_results]

# Create university_major_categorized by joining multiple categories with semicolons
graduates_person_level['university_major_categorized'] = [
    '; '.join(sorted(set(cat.split('; ')))) if cat else 'Other' 
    for cat in major_categories
]

# Create dummy variables for each major category
for major_category, details in majors_categories.items():
    var_name = f"major_{details['variable_name']}"
    graduates_person_level[var_name] = [
        1 if var_name in str(major_var) else 0 
        for major_var in major_variables
    ]

# Add major_other dummy variable
graduates_person_level['major_other'] = [
    1 if major_var == 'major_other' else 0 
    for major_var in major_variables
]



Applying major assignment to the dataset...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4323188/4323188 [00:30<00:00, 142526.21it/s]


In [126]:
graduates_person_level.to_pickle('graduates_person_level_AnalysisFile.pkl')

In [127]:
# Print summary statistics for all major categories
print("Summary Statistics for Major Categories:")
print("=" * 50)

# Calculate percentages for each major category
total_graduates = len(graduates_person_level)
print(f"Total graduates: {total_graduates:,}")
print()

# Print statistics for each major category dummy variable
major_dummy_cols = [col for col in graduates_person_level.columns if col.startswith('major_')]
for col in major_dummy_cols:
    count = graduates_person_level[col].sum()
    percentage = (count / total_graduates) * 100
    print(f"{col:<35}: {count:>8,} ({percentage:>5.1f}%)")

print()
print("University Major Categorized Distribution:")
print("-" * 45)
major_dist = graduates_person_level['university_major_categorized'].value_counts()
for category, count in major_dist.items():
    percentage = (count / total_graduates) * 100
    print(f"{category:<35}: {count:>8,} ({percentage:>5.1f}%)")


Summary Statistics for Major Categories:
Total graduates: 4,323,188

major_engineering_or_computer      :  530,078 ( 12.3%)
major_natural_science              :  326,858 (  7.6%)
major_math                         :   51,694 (  1.2%)
major_education                    :  185,936 (  4.3%)
major_clinical_work                :  195,302 (  4.5%)
major_law_climinology              :  124,161 (  2.9%)
major_economics_and_finance        :  250,967 (  5.8%)
major_business_not_economics_finance: 1,065,164 ( 24.6%)
major_social_science_not_economics :  395,267 (  9.1%)
major_arts                         :  253,921 (  5.9%)
major_communications               :  222,077 (  5.1%)
major_english                      :  120,648 (  2.8%)
major_psychology                   :  201,879 (  4.7%)
major_other                        :  399,236 (  9.2%)

University Major Categorized Distribution:
---------------------------------------------
Business (not Economics / Finance) : 1,065,164 ( 24.6%)
Engineering o

In [128]:
""" from collections import Counter
import re

# Get all "Other" majors and extract the most common words
# Split all major strings into words and count frequency (4+ characters only)
all_words = []
for major in other_majors:
    # Convert to lowercase and extract words (letters only, 4+ characters)
    words = re.findall(r'\b[a-zA-Z]{4,}\b', major.lower())
    all_words.extend(words)

# Count word frequencies and get top 50
word_counts = Counter(all_words)
top_50_words = word_counts.most_common(50)

print("Top 50 most common words (4+ characters) in 'Other' category majors:")
for i, (word, count) in enumerate(top_50_words, 1):
    print(f"{i:2d}. {word:<20} ({count:,} occurrences)") """

' from collections import Counter\nimport re\n\n# Get all "Other" majors and extract the most common words\n# Split all major strings into words and count frequency (4+ characters only)\nall_words = []\nfor major in other_majors:\n    # Convert to lowercase and extract words (letters only, 4+ characters)\n    words = re.findall(r\'\x08[a-zA-Z]{4,}\x08\', major.lower())\n    all_words.extend(words)\n\n# Count word frequencies and get top 50\nword_counts = Counter(all_words)\ntop_50_words = word_counts.most_common(50)\n\nprint("Top 50 most common words (4+ characters) in \'Other\' category majors:")\nfor i, (word, count) in enumerate(top_50_words, 1):\n    print(f"{i:2d}. {word:<20} ({count:,} occurrences)") '

In [129]:
# Create a sample dataset showing 20 rows for each major category where the dummy variable is True
sample_data = []

# Get all major dummy variables
major_dummy_vars = [col for col in graduates_person_level.columns if col.startswith('major_')]

for major_var in major_dummy_vars:
    # Get 20 rows where this major dummy is True
    major_true_rows = graduates_person_level[graduates_person_level[major_var] == 1].sample(min(20, graduates_person_level[major_var].sum()), random_state=42)
    
    # Add the matched variable name to each row
    for _, row in major_true_rows.iterrows():
        sample_data.append({
            'matched_major_variable': major_var,
            'university_major_raw': row['university_major_raw'],
            'university_major_clean': row['university_major_clean']
        })

# Create the dataset
major_samples_df = pd.DataFrame(sample_data)
print(f"Created dataset with {len(major_samples_df)} rows showing major category matches")
print(f"Number of unique major variables: {major_samples_df['matched_major_variable'].nunique()}")

# Display first few rows
# Create a sample dataset showing 20 rows for each major category where the dummy variable is True
sample_data = []

# Get all major dummy variables
major_dummy_vars = [col for col in graduates_person_level.columns if col.startswith('major_')]

for major_var in major_dummy_vars:
    # Get 20 rows where this major dummy is True
    major_true_rows = graduates_person_level[graduates_person_level[major_var] == 1].sample(min(20, graduates_person_level[major_var].sum()), random_state=42)
    
    # Add the matched variable name to each row
    for _, row in major_true_rows.iterrows():
        sample_data.append({
            'matched_major_variable': major_var,
            'university_major_raw': row['university_major_raw']
        })

# Create the dataset
major_samples_df = pd.DataFrame(sample_data)
print(f"Created dataset with {len(major_samples_df)} rows showing major category matches")
print(f"Number of unique major variables: {major_samples_df['matched_major_variable'].nunique()}")


# Display the complete dataset with all rows
print("Displaying all rows in major_samples_df:")
with pd.option_context('display.max_rows', None, 'display.max_colwidth', None):
    display(major_samples_df)

Created dataset with 280 rows showing major category matches
Number of unique major variables: 14
Created dataset with 280 rows showing major category matches
Number of unique major variables: 14
Displaying all rows in major_samples_df:


Unnamed: 0,matched_major_variable,university_major_raw
0,major_engineering_or_computer,"Bachelor of Applied Science (B.A.Sc.), Computer Science"
1,major_engineering_or_computer,"Bachelor of Architecture (BArch), Architecture"
2,major_engineering_or_computer,"Bachelor's degree, Mechanical Engineering"
3,major_engineering_or_computer,"Bachelor of Science (BS), Geological/Geophysical Engineering"
4,major_engineering_or_computer,"Bachelor of Science, Aeronautical Engineering"
5,major_engineering_or_computer,"Bachelor of Science (BS), Information Technology"
6,major_engineering_or_computer,"B.S., Computer Science"
7,major_engineering_or_computer,"Bachelor Of Science, Computer Information Systems"
8,major_engineering_or_computer,"Bachelor of Science - BS, Electronic Media"
9,major_engineering_or_computer,"Bachelors, Civil Engineering"


In [130]:

#
print("Sampling 10 rows showing subtitle and major-related columns:")
major_cols = [col for col in graduates_person_level.columns if 'major' in col.lower()]
#graduates_person_level.sample(10)[major_cols]
with pd.option_context('display.max_colwidth', None):
    display(graduates_person_level[graduates_person_level.university_major_categorized.str.contains("Social Sc")].sample(50)[['university_major_raw','university_major_categorized'] ])

Sampling 10 rows showing subtitle and major-related columns:


Unnamed: 0,university_major_raw,university_major_categorized
3115282,"Bachelor of Liberal Arts, History and Political Science",Social Science (not Economics)
2806611,"Bachelor's degree, International Relations and Affairs",Social Science (not Economics)
4249924,"Bachelor of Arts (B.A.), Political Science and Government",Social Science (not Economics)
51460,"Bachelor's degree, Philosophy",Social Science (not Economics)
551553,"Bachelor of Arts (B.A.), Political Science",Social Science (not Economics)
1534917,"Bachelors of Arts, History",Social Science (not Economics)
1468494,"Bachelor's degree, Political Science and Government, Writing Intensive English",Social Science (not Economics)
3705291,"Bachelor of Arts (BA), Psychology, Sociology",Social Science (not Economics)
1323468,"Bachelor of Arts, Studio Art / Art History Double Major, cum laude",Social Science (not Economics)
3981021,"Bachelor's degree, Sociology",Social Science (not Economics)


In [131]:
graduates_person_level.sample(5000)[['university_major_raw','university_major_clean','university_major_categorized']].to_csv('graduates_person_level_majors_sample5000.csv', index=False)

In [132]:
# -----------------------------------------------------------
# Each row is one person
# University that they went to --> well matched into a single ID using Runjing's data.

# Start and end year and month of bachelor's degree
     # would be cool to show collaboration across groups. 

# Do they start a firm within 3, 5, or 10 years

# -----------------------------------------------------------
# Separate dataset of founding events: person id, firm id, founding date, location of firm. 

print("Storing graduates_person_level", flush=True)
#graduates_person_level = graduates_person_level[~graduates_person_level.linkedin_member_id.duplicated() ]
graduates_person_level.to_stata("graduates_person_level_091182025.dta",  version=118)




Storing graduates_person_level


In [18]:

#Scratch
#graduates_person_level = pd.read_stata("graduates_person_level_091182025.dta")


with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(graduates_person_level.year_start_college.value_counts().sort_index())

# Add: subtitle to a cleaned major
# Founded within years: finer from 1 to 10 years, every year -- make count per year, not cumulative.

year_start_college
1900        12
1901         1
1903         1
1904         4
1911         1
1915         1
1919         1
1921         1
1933         1
1939         1
1943         1
1950         2
1952         1
1954         1
1959         4
1960         4
1961         6
1962         2
1963         5
1964         9
1965        14
1966        14
1967        24
1968        34
1969        36
1970        34
1971        33
1972        55
1973        44
1974        47
1975        66
1976        73
1977        87
1978       184
1979       214
1980       167
1981       129
1982       148
1983       182
1984       189
1985       239
1986       358
1987       538
1988      1442
1989      2371
1990      1070
1991      1102
1992      1404
1993      1604
1994      2505
1995      4720
1996     11197
1997     41091
1998    160292
1999    210484
2000    286483
2001    298977
2002    332170
2003    366117
2004    414563
2005    453517
2006    505436
2007    561101
2008    662574
Name: count, dtype: i

In [134]:
founder_events.company_url.sample(20)
# Tag the LLCs, Partnership, or Corporation
# re-title variables to make it more clear.


# third file
# Need the company industry --- is there a company id to match from?

## Early employee? 

NameError: name 'founder_events' is not defined

In [None]:

print("Storing founder_events", flush=True)
founder_events = graduates_with_education_job_level[graduates_with_education_job_level['is_founder']].copy()
# Convert id column to string to avoid Stata export error
#founder_events['id'] = founder_events['id'].astype(str)
#founder_events.to_stata("founder_events.dta",   version=118)


# Convert all columns to string to avoid Stata export issues
for col in founder_events.columns:
    founder_events[col] = founder_events[col].astype(str)

founder_events.dtypes

founder_events.to_stata("founder_events.dta",  version=118)

Storing founder_events


In [None]:
founder_events

SyntaxError: invalid syntax (138444837.py, line 1)

## Testing internally of the graduate file


In [None]:
import seaborn as sns
from sklearn.linear_model import LinearRegression
import numpy as np

import matplotlib.pyplot as plt

# Load the graduates_person_level data if not already loaded
#graduates_person_level = pd.read_pickle('graduates_person_level_AnalysisFile.pkl')

# Prepare the data for regression
# Create dummy variables for college-year fixed effects
graduates_person_level['college_year'] = graduates_person_level['unitid'].astype(str) + '_' + graduates_person_level['year_end_college'].astype(str)

# Get dummies for college-year fixed effects
college_year_dummies = pd.get_dummies(graduates_person_level['college_year'], prefix='college_year')

# Prepare X variables (graduation year + college-year fixed effects)
X = pd.concat([
    graduates_person_level[['year_end_college']],
    college_year_dummies
], axis=1)

# Use founded_within_10_years as dependent variable
y = graduates_person_level['founded_within_10_years']

# Create interaction terms for major categories with graduation year
major_dummy_vars = [col for col in graduates_person_level.columns if col.startswith('major_')]

# Create interaction terms (graduation year * major dummy)
for major_var in major_dummy_vars:
    graduates_person_level[f'{major_var}_x_year'] = graduates_person_level[major_var] * graduates_person_level['year_end_college']

# Add interaction terms to X variables
interaction_cols = [col for col in graduates_person_level.columns if col.endswith('_x_year')]
X = pd.concat([
    graduates_person_level[['year_end_college']],
    graduates_person_level[major_dummy_vars],
    graduates_person_level[interaction_cols],
    college_year_dummies
], axis=1)

# Remove rows with missing values
mask = ~(X.isna().any(axis=1) | y.isna())
X_clean = X[mask]
y_clean = y[mask]

print(f"Running regression with {len(X_clean):,} observations")
print(f"Number of features: {X_clean.shape[1]}")

# Fit the regression
reg = LinearRegression()
reg.fit(X_clean, y_clean)

# Get coefficient for year_end_college
year_coef = reg.coef_[0]
print(f"Coefficient for year_end_college: {year_coef:.6f}")

# Plot the coefficient
plt.figure(figsize=(10, 6))
plt.bar(['Year of Graduation'], [year_coef], color='steelblue', alpha=0.7)
plt.title('Effect of Graduation Year on Founding a Company Within 10 Years')
plt.ylabel('Coefficient')
plt.xlabel('Variable')
plt.grid(axis='y', alpha=0.3)

# Add value label on bar
plt.text(0, year_coef + (abs(year_coef) * 0.1), f'{year_coef:.6f}', 
         ha='center', va='bottom' if year_coef > 0 else 'top')

plt.tight_layout()
plt.show()

# Print regression summary
print(f"\nRegression Results:")
print(f"R-squared: {reg.score(X_clean, y_clean):.6f}")
print(f"Number of observations: {len(X_clean):,}")
print(f"Mean of dependent variable: {y_clean.mean():.6f}")

In [138]:
graduates_person_level.columns

Index(['linkedin_member_id', 'year_end_college', 'year_start_college',
       'university_title', 'university_major_raw', 'unitid',
       'founded_within_0_years', 'founded_within_1_years',
       'founded_within_2_years', 'founded_within_3_years',
       'founded_within_4_years', 'founded_within_5_years',
       'founded_within_6_years', 'founded_within_7_years',
       'founded_within_8_years', 'founded_within_9_years',
       'founded_within_10_years', 'total_jobs', 'university_major_clean',
       'university_major_categorized', 'major_engineering_or_computer',
       'major_natural_science', 'major_math', 'major_education',
       'major_clinical_work', 'major_law_climinology',
       'major_economics_and_finance', 'major_business_not_economics_finance',
       'major_social_science_not_economics', 'major_arts',
       'major_communications', 'major_english', 'major_psychology',
       'major_other', 'college_year'],
      dtype='object')

In [None]:
import pandas as pd
from linearmodels.iv import AbsorbingLS

graduates_person_level['college_year'] = graduates_person_level['unitid'].astype(str) + '_' + graduates_person_level['year_end_college'].astype(str)

# Make FEs categorical to save memory
df = graduates_person_level.assign(
    fe1=lambda d: d.unitid.astype('category'),    
    fe_x=lambda d: d.year_end_college.astype('int16')    
).dropna(subset=['founded_within_10_years','year_end_college','fe1','fe_x'])

# Prepare dependent and independent variables
dependent = df['founded_within_10_years']
exog = pd.get_dummies(graduates_person_level['year_end_college'], prefix='xx')
absorb = df[['unitid']]   # Fixed effects variable



# Create and fit the model
mod = AbsorbingLS(dependent, exog, absorb=absorb)
res = mod.fit(
    cov_type='clustered',
    clusters=df[['fe1']]     # one- or multi-way clustering
)
print(res.summary)


                            Absorbing LS Estimation Summary                            
Dep. Variable:     founded_within_10_years   R-squared:                          0.0656
Estimator:                    Absorbing LS   Adj. R-squared:                     0.0656
No. Observations:                  4323188   F-statistic:                     1.355e+04
Date:                     Thu, Sep 18 2025   P-value (F-stat):                   0.0000
Time:                             11:55:55   Distribution:                     chi2(30)
Cov. Estimator:                  clustered   R-squared (No Effects):             0.0094
                                             Variables Absorbed:                 1.0000
                             Parameter Estimates                              
            Parameter  Std. Err.     T-stat    P-value    Lower CI    Upper CI
------------------------------------------------------------------------------
xx_2002        0.0563     0.0017     33.267     0.0000     

In [None]:

universities_adopted_facebook.fb_date.value_counts()
#graduates_person_level.columns
#universities_adopted_facebook.columns

fb_date
2005-05-07    119
2005-05-15     93
2005-04-18     76
2005-04-08     63
2005-03-02     60
2004-11-15     55
2004-10-13     52
2004-09-07     41
2005-01-14     39
2004-11-24     37
2004-10-27     28
2005-02-03     28
2004-09-24     10
2004-10-18      9
2004-08-08      8
2004-08-21      8
2004-04-25      5
2004-04-30      5
2004-04-19      4
2004-08-22      3
2004-04-04      3
2004-04-11      3
2004-10-15      3
2004-05-20      2
2004-08-04      2
2004-03-14      2
2004-03-21      2
2004-03-07      2
2004-06-25      2
2004-02-26      1
2004-04-27      1
2004-06-23      1
2004-02-29      1
2004-02-04      1
2004-10-10      1
2005-01-12      1
2004-05-02      1
2004-02-25      1
2005-05-26      1
2005-04-29      1
Name: count, dtype: int64

# Setup the industry file

In [93]:

# Read the company data CSV file efficiently
print("Reading coresignal_company.csv file...")
print("Note: This is a large file (11GB), loading may take several minutes...")

coresignal_company = pd.read_csv('coresignal_company.csv')
print(f"Company data loaded: {len(coresignal_company):,} rows, {len(coresignal_company.columns)} columns")
print("Columns in company data:", coresignal_company.columns.tolist())

Reading coresignal_company.csv file...
Note: This is a large file (11GB), loading may take several minutes...


  coresignal_company = pd.read_csv('coresignal_company.csv')


Company data loaded: 15,505,245 rows, 34 columns
Columns in company data: ['id', 'url', 'hash', 'name', 'website', 'size', 'industry', 'description', 'followers', 'founded', 'headquarters_city', 'headquarters_country', 'headquarters_state', 'headquarters_street1', 'headquarters_street2', 'headquarters_zip', 'logo_url', 'created', 'last_updated', 'last_response_code', 'type', 'headquarters_new_address', 'employees_count', 'headquarters_country_restored', 'headquarters_country_parsed', 'company_shorthand_name', 'company_shorthand_name_hash', 'canonical_url', 'canonical_hash', 'canonical_shorthand_name', 'canonical_shorthand_name_hash', 'deleted', 'last_updated_ux', 'source_id']


In [None]:
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import re

# First, let's examine the data we're working with
print("Founder events shape:", founder_events.shape)
print("Company data shape:", coresignal_company.shape)

# Check the company_url column in founder_events
print("\nCompany URL info in founder_events:")
print(f"Total founder events: {len(founder_events):,}")
print(f"Non-null company URLs: {founder_events['company_url'].notna().sum():,}")
print(f"Missing company URLs: {founder_events['company_url'].isna().sum():,}")

# Check if there's a URL column in coresignal_company
print("\nColumns in coresignal_company that might contain URLs:")
url_columns = [col for col in coresignal_company.columns if 'url' in col.lower() or 'link' in col.lower()]
print(url_columns)

# Also check for company name columns
name_columns = [col for col in coresignal_company.columns if 'name' in col.lower() or 'company' in col.lower()]
print("\nColumns in coresignal_company that might contain company names:")
print(name_columns)

NameError: name 'coresignal_company' is not defined

TO-DOS (TODOS) AS FROM THE MEETING ON 9/18

Update all data form graduates file to be from the year 2000 or earlier.


