# Baseline Models


*   TF-IDF vectors to represent the texts.
*   Use several machine learning models.




In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm import tqdm

In [None]:
user_set = pd.read_csv("/content/drive/MyDrive/clean data/user_set.csv")
job_set = pd.read_csv("/content/drive/MyDrive/clean data/job_set_cleaned.csv")
work_history = pd.read_csv("/content/drive/MyDrive/clean data/work_history.csv")
dataset = pd.read_csv("/content/drive/MyDrive/clean data/dataset.csv")

In [None]:
dataset

Unnamed: 0,UserID,JobID,label
0,13,821691,1
1,13,622150,0
2,64,666073,1
3,64,281940,1
4,64,337025,1
...,...,...,...
240909,1472066,743091,0
240910,1472066,539643,0
240911,1472066,230868,0
240912,1472066,643000,0


In [None]:
job_set

Unnamed: 0,JobID,WindowID,Title,Description,Requirements,City,State,Country,Zip5,StartDate,EndDate
0,6,6,Administrative Assistant,Opportunity: Arbor Group is a full service in...,Required Skills/Education: 3 years work...,Bala Cynwyd,PA,US,19004.0,2012-05-31 17:27:01.63,2012-06-29 23:59:00
1,177,6,Maintenance Tech,Maintenance Tech - HVAC Certified. For lrg apt...,Please refer to the Job Description to view th...,Orlando,FL,US,32808.0,2012-05-13 01:16:58.923,2012-06-12 23:59:59
2,179,6,Podium Speaker,VACATION VILLAGE RESORTS PODIUM SPEAKER Are y...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-05-12 09:00:53.553,2012-06-11 23:59:59
3,181,6,Sales Professionals,VACATION VILLAGE RESORTS Seeking Sales Profess...,Please refer to the Job Description to view th...,Orlando,FL,US,,2012-05-12 09:00:55.86,2012-06-11 23:59:59
4,184,6,ADMINISTRATIVE ASSISTANT,ADMINISTRATIVE ASSISTANT —— For an exec...,Please refer to the Job Description to view th...,Sanford,FL,US,32773.0,2012-05-14 10:22:27.877,2012-06-13 23:59:59
...,...,...,...,...,...,...,...,...,...,...,...
115686,1116286,6,Billing Clerk,BILLING CLERK Now hiring a full-time Billing ...,See job description,Clarksville,TN,US,37040.0,2012-05-14 10:48:36.04,2012-06-13 23:59:59
115687,1116287,6,CNA - Certified Nurse Assistant,CNA - Certified Nurse Assistant Now seeking ca...,See job description.,Mount Juliet,TN,US,37122.0,2012-05-16 10:30:03.527,2012-06-15 23:59:00
115688,1116298,6,RN for Hand Surgeon,"Des Moines Orthopedic Surgeons, . a premier Or...",Candidates must have 1 – 3 years clinic and . ...,Des Moines,IA,US,,2012-05-17 17:05:34.687,2012-06-16 23:59:59
115689,1116299,6,Sales - Product Support Representative,Liftech Equipment Companies is a leading Indus...,Qualified candidates must have the ability t...,Lancaster,NY,US,14086.0,2012-06-01 17:00:07.793,2012-06-30 23:59:00


In [None]:
work_history

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
0,13,6,Test,1,Pennsylvania Mentor
1,13,6,Test,2,Student Worker
2,13,6,Test,3,Internship in Adoption Unit
3,13,6,Test,5,Student Worker - Continuing Education
4,13,6,Test,6,Sales Associate
...,...,...,...,...,...
112289,1471948,6,Train,3,Assistant (P/T)
112290,1471948,6,Train,4,Phone Sales
112291,1472019,6,Train,1,Supply Admin Clerk/ Combat Marksmanship
112292,1472066,6,Train,1,Manager


# 1. TF-IDF vectors for text representation

In [None]:
# about 1 min
job_set = job_set.fillna(" ")
job_set["word"] = job_set.Title + job_set.Description + job_set.Requirements
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=5, max_features=100, stop_words='english')
tfidf_matrix = tf.fit_transform(job_set['word'])

# 2. Filter out users with more than 10 applications

In [None]:
temp = sorted(dict(dataset.UserID.value_counts()).items(), key=lambda x: x[1], reverse=True)
exclude_user_id = [i[0] for i in temp if i [1]>=10]
len(exclude_user_id)

6766

In [None]:
dataset = dataset[~dataset.UserID.isin(exclude_user_id)]

In [None]:
user_id = dataset.UserID.unique()
work_history = work_history[work_history.UserID.isin(user_id)]
user_set = user_set[user_set.UserID.isin(user_id)]
user_set.reset_index(drop=True, inplace=True)

In [None]:
work_history = work_history.drop(columns=["Sequence"]).drop_duplicates()

In [None]:
word_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, max_features=50, stop_words='english')
word_history_tf_matrix = word_history_tf.fit_transform(work_history.groupby("UserID").JobTitle.sum().values)

# 3. Deal with the user set and the job set

In [None]:
user_set = user_set.drop(columns=["Country","ZipCode","Major","GraduationDate","WindowID"])

# In user_set

1. label encoding for DegreeType
2. one-hot encoding for State
3. binary labels for Currently Employed/ManagedOthers

In [None]:
# user_set = pd.get_dummies(user_set, columns=["State"])
user_set.replace({"CurrentlyEmployed":{"Yes":1,"No":0}}, inplace=True)
user_set.replace({"ManagedOthers":{"Yes":1,"No":0}}, inplace=True)
user_set.replace({"DegreeType":{"None":0,"High School":1, "Vocational":2, "Associate's":3, "Bachelor's":4, "Master's":5, "PhD":6}},
                 inplace=True)

In [None]:
user_set

Unnamed: 0,UserID,Split,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,13,Test,Philadelphia,PA,4,6,5.0,1,0,0
1,64,Train,Columbus,OH,5,3,22.0,1,0,0
2,101,Train,Brick,NJ,1,1,2.0,0,1,4
3,133,Train,Wilmington,DE,4,6,9.0,1,1,6
4,182,Train,Lenexa,KS,1,3,5.0,1,1,10
...,...,...,...,...,...,...,...,...,...,...
18745,1471625,Train,Indianapolis,IN,4,4,4.0,1,1,10
18746,1471661,Train,Shartlesville,PA,4,1,3.0,0,0,0
18747,1471838,Train,Peoria,AZ,5,3,8.0,1,0,0
18748,1471948,Train,Glendale,AZ,1,4,6.0,0,0,0


In [None]:
city = []
state = []
groups = dataset.groupby("UserID")
for idx, group in tqdm(groups):
    user_city = user_set[user_set.UserID==idx]["City"].values
    user_state = user_set[user_set.UserID==idx]["State"].values
    job_id_list = group.JobID.values
    job_city = job_set[job_set.JobID.isin(job_id_list)]["City"].values
    job_state = job_set[job_set.JobID.isin(job_id_list)]["State"].values
    city.extend([0 if i!=user_city else 1 for i in job_city])
    state.extend([0 if i!=user_state else 1 for i in job_state])
dataset["City"] = city
dataset["State"] = state

100%|██████████| 18750/18750 [02:15<00:00, 138.44it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["City"] = city
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset["State"] = state


In [None]:
user_set.to_csv("user_set_cleaned.csv", index=False)
dataset.to_csv("dataset_cleaned.csv", index=False)
work_history.to_csv("work_history_cleaned.csv", index=False)

In [None]:
user_set

Unnamed: 0,UserID,Split,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,13,Test,Philadelphia,PA,4,6,5.0,1,0,0
1,64,Train,Columbus,OH,5,3,22.0,1,0,0
2,101,Train,Brick,NJ,1,1,2.0,0,1,4
3,133,Train,Wilmington,DE,4,6,9.0,1,1,6
4,182,Train,Lenexa,KS,1,3,5.0,1,1,10
...,...,...,...,...,...,...,...,...,...,...
18745,1471625,Train,Indianapolis,IN,4,4,4.0,1,1,10
18746,1471661,Train,Shartlesville,PA,4,1,3.0,0,0,0
18747,1471838,Train,Peoria,AZ,5,3,8.0,1,0,0
18748,1471948,Train,Glendale,AZ,1,4,6.0,0,0,0


In [None]:
work_history

Unnamed: 0,UserID,WindowID,Split,JobTitle
0,13,6,Test,Pennsylvania Mentor
1,13,6,Test,Student Worker
2,13,6,Test,Internship in Adoption Unit
3,13,6,Test,Student Worker - Continuing Education
4,13,6,Test,Sales Associate
...,...,...,...,...
112287,1471948,6,Train,Court Judicial Specialist (F/T)
112288,1471948,6,Train,Cashier (P/T)
112289,1471948,6,Train,Assistant (P/T)
112290,1471948,6,Train,Phone Sales


In [None]:
dataset

Unnamed: 0,UserID,JobID,label,City,State
0,13,821691,1,0,0
1,13,622150,0,0,1
2,64,666073,1,0,0
3,64,281940,1,1,1
4,64,337025,1,1,1
...,...,...,...,...,...
240891,1471948,839717,1,0,0
240892,1471948,999612,0,0,0
240893,1471948,901538,0,1,1
240894,1472019,762334,1,0,0


# 4. Build the training set and testing set

In [None]:
train_user = user_set[user_set.Split=="Train"].UserID.values
test_user = user_set[user_set.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [None]:
groups = train_data.groupby("UserID")
X_train = np.zeros((1,158))
Y_train = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed",
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_train = np.concatenate((X_train, feature), axis=0)
    Y_train.extend(group.label.values.tolist())

100%|██████████| 18490/18490 [06:29<00:00, 47.48it/s]


In [None]:
X_train.shape, len(Y_train)

((70681, 158), 70680)

In [None]:
groups = test_data.groupby("UserID")
X_test = np.zeros((1,158))
Y_test = []
for u_id, group in tqdm(groups):
    user = user_set[user_set.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed",
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_test = np.concatenate((X_test, feature), axis=0)
    Y_test.extend(group.label.values.tolist())

100%|██████████| 260/260 [00:03<00:00, 78.14it/s] 


In [None]:
X_test.shape, len(Y_test)

((1055, 158), 1054)

In [None]:
np.save("X_train.npy",X_train[1:,])
np.save("Y_train.npy",np.array(Y_train))
np.save("X_test.npy",X_test[1:,])
np.save("Y_test.npy",np.array(Y_test))

In [None]:
X_train

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 5.        , ..., 0.11197664, 0.16158408,
        0.        ],
       [1.        , 1.        , 5.        , ..., 0.        , 0.05925864,
        0.0451224 ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.22285741, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.13949157, 0.06709618,
        0.05109028],
       [0.        , 1.        , 1.        , ..., 0.41873483, 0.        ,
        0.        ]])

In [None]:
df = pd.read_excel('/dfMerged_ID.xlsx')

# Add education information

(Source: U.S. Census Bureau):

High school graduation (including equivalency certificate): 89%

Bachelor's degree or above: 36%

Master's degree: 13.1%

Doctorate or professional degree: 4.2%

In [None]:
DegreeType = ["High School", "Vocational", "Associate's", "Bachelor's", "Master's", "PhD"]

In [None]:
p = [0.89 - 0.131 - 0.042 - 0.6, 0.138, 0.6 - 0.131 - 0.042, 0.36 - 0.131 - 0.042, 0.131 - 0.042, 0.042]

In [None]:
for org in df['Org'].unique():
    if pd.isna(org):
        df.loc[df['Org'].isna(), 'DegreeType'] = np.random.choice(DegreeType, p=p)
    else:
        df.loc[df['Org'] == org, 'DegreeType'] = np.random.choice(DegreeType, p=p)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name_x,Last_Name_x,YOB,Start_Time,End_Time,Change_Reason,Job_Role_ID_x,Job_Role_Level_x,...,Education_Level,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,Job_Role_Level,ManagedOthers,ManagedHowMany,City,State,DegreeType
0,0,100000290,Kevin,Torres,1996.0,2016-05-25,2016-09-20,Failure to Return from Leave,33434.0,2L,...,Associate's,199,0.323066,0,2.0,0,0,Martinsville,VA,Bachelor's
1,565,100000452,Cassandra,Munoz,1991.0,2019-03-11,2023-06-17,,,,...,Associate's,85,4.268309,1,,0,0,Martinsville,VA,Bachelor's
2,1130,100000452,Cassandra,Munoz,1991.0,2016-06-01,2019-04-01,Personal Reasons,15827.0,2L,...,Associate's,85,2.830938,0,2.0,0,0,Martinsville,VA,Bachelor's
3,1695,100001056,Victoria,Mcknight,1994.0,2016-07-01,2016-10-23,Job Abandonment,21044.0,2L,...,Associate's,10,0.312115,0,2.0,0,0,Martinsville,VA,Bachelor's
4,2260,100001637,Wendy,Forbes,1995.0,2016-07-11,2018-04-26,Personal Reasons,12552.0,2L,...,Associate's,17,1.790554,0,2.0,0,0,Martinsville,VA,Bachelor's


In [None]:
df['WorkHistoryCount'] = df.groupby(['First_Name_x', 'Last_Name_x'])['Job_Location_Country'].transform('count')
df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name_x,Last_Name_x,YOB,Start_Time,End_Time,Change_Reason,Job_Role_ID_x,Job_Role_Level_x,...,Job_Role_Description,Job_ZipCode,Job_Location_City,Job_Location_State,Job_Location_Country,Job_Location_y,Department,Org,Education_Level,WorkHistoryCount
0,0,100000290,Kevin,Torres,1996.0,2016-05-25,2016-09-20,Failure to Return from Leave,33434.0,2L,...,,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,199
1,565,100000452,Cassandra,Munoz,1991.0,2019-03-11,,,,,...,,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,85
2,1130,100000452,Cassandra,Munoz,1991.0,2016-06-01,2019-04-01,Personal Reasons,15827.0,2L,...,,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,85
3,1695,100001056,Victoria,Mcknight,1994.0,2016-07-01,2016-10-23,Job Abandonment,21044.0,2L,...,,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,10
4,2260,100001637,Wendy,Forbes,1995.0,2016-07-11,2018-04-26,Personal Reasons,12552.0,2L,...,,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,17


In [None]:
from datetime import datetime

In [None]:
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
df['End_Time'] = pd.to_datetime(df['End_Time'])

# fill ：NaN to today
df['End_Time'].fillna(pd.to_datetime(datetime.today().strftime('%Y-%m-%d')), inplace=True)

# count TotalYearsExperience
df['TotalYearsExperience'] = (df['End_Time'] - df['Start_Time']).dt.days / 365.25

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name_x,Last_Name_x,YOB,Start_Time,End_Time,Change_Reason,Job_Role_ID_x,Job_Role_Level_x,...,Job_ZipCode,Job_Location_City,Job_Location_State,Job_Location_Country,Job_Location_y,Department,Org,Education_Level,WorkHistoryCount,TotalYearsExperience
0,0,100000290,Kevin,Torres,1996.0,2016-05-25,2016-09-20,Failure to Return from Leave,33434.0,2L,...,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,199,0.323066
1,565,100000452,Cassandra,Munoz,1991.0,2019-03-11,2023-06-17,,,,...,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,85,4.268309
2,1130,100000452,Cassandra,Munoz,1991.0,2016-06-01,2019-04-01,Personal Reasons,15827.0,2L,...,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,85,2.830938
3,1695,100001056,Victoria,Mcknight,1994.0,2016-07-01,2016-10-23,Job Abandonment,21044.0,2L,...,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,10,0.312115
4,2260,100001637,Wendy,Forbes,1995.0,2016-07-11,2018-04-26,Personal Reasons,12552.0,2L,...,"Martinsville, VA","Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,17,1.790554


In [None]:
df['CurrentlyEmployed'] = (df['End_Time'] == pd.to_datetime(datetime.today().strftime('%Y-%m-%d'))).astype(int)
df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name_x,Last_Name_x,YOB,Start_Time,End_Time,Change_Reason,Job_Role_ID_x,Job_Role_Level_x,...,Job_Location_City,Job_Location_State,Job_Location_Country,Job_Location_y,Department,Org,Education_Level,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed
0,0,100000290,Kevin,Torres,1996.0,2016-05-25,2016-09-20,Failure to Return from Leave,33434.0,2L,...,"Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,199,0.323066,0
1,565,100000452,Cassandra,Munoz,1991.0,2019-03-11,2023-06-17,,,,...,"Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,85,4.268309,1
2,1130,100000452,Cassandra,Munoz,1991.0,2016-06-01,2019-04-01,Personal Reasons,15827.0,2L,...,"Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,85,2.830938,0
3,1695,100001056,Victoria,Mcknight,1994.0,2016-07-01,2016-10-23,Job Abandonment,21044.0,2L,...,"Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,10,0.312115,0
4,2260,100001637,Wendy,Forbes,1995.0,2016-07-11,2018-04-26,Personal Reasons,12552.0,2L,...,"Martinsville, VA",VA,USA,"Martinsville, VA",,Sales,Associate's,17,1.790554,0


In [None]:
df.Job_Role_Level_x.unique()

array(['2L', nan, '2AI', ' ', '3L', '2HA', '5YB', '5HB', '2N', '5YR',
       '4L', '2J', '4D', '1J', '5Y', '5YC', '2H', '1HB', '2HC', '1L',
       '1N', '2HB', '4G', '2IS', '5LV', '4S', '6Z', '3N', '3J', '5Z',
       '4T', '6YR', '4E', '2HJ', '5ZD', '2HE', '4HA', '6YB', '4C', '4AA',
       '5HA', '5ZP', '5E', '6G', '5G', '5D', '5ZC', '6YI', '5HD', '6Y',
       '5I', '5AB', '5T', '5CN', '5TN', '5HC', '1HA', '5AD', '5S', '6YE',
       '6AQ', '6YM', '5ZE', '4I', '5YD', '2AB', '10W', '6D', '6AK', '5ZB',
       '3IS', '5JV', '4ZD', '6T', '6E', '5M', '6I', '6DM', '6HA', '4M',
       '4R', '6M', '10V', '5C', '6TM', '1IS', '2', '6YD', '5GP', '6A',
       '10X', '6GM', '6R', '6EM', '6AD', '6YC', '6C', '2AC', '6S', '5AF',
       '5HH', '6AJ', '3AA', '5EP', '6YT', '6LM', '6AI', '4AB', '5AE',
       '6MM', '6YW', '6HG', '5EN', '11X', '6YA', '6AM', '5R', '6IM',
       '5ZA', '5A', '6RM', '6SM', '4ZE', '5TP', '6JM', '12X', '5GN',
       '6CM', '6AH'], dtype=object)

# When Job_Role_Level_x is greater than or equal to 3, ManagedOthers is 1, and others are 0.

1. ManagedHowMany is 10 when Job_Role_Level_x is equal to 3
2. ManagedHowMany is 50 when Job_Role_Level_x is equal to 4
3. ManagedHowMany is 100 when Job_Role_Level_x is equal to 5
4. ManagedHowMany is 500 when Job_Role_Level_x is equal to 6

In [None]:
df['Job_Role_Level'] = df['Job_Role_Level_x'].str.extract('(\d+)').astype(float)

# 创建 ManagedOthers 列
df['ManagedOthers'] = df['Job_Role_Level'].apply(lambda x: 1 if x >= 3 else 0)

# 创建 ManagedHowMany 列
def calculate_managed_how_many(level):
    if level == 3:
        return 10
    elif level == 4:
        return 50
    elif level == 5:
        return 100
    elif level == 6:
        return 500
    else:
        return 0

df['ManagedHowMany'] = df['Job_Role_Level'].apply(calculate_managed_how_many)

df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name_x,Last_Name_x,YOB,Start_Time,End_Time,Change_Reason,Job_Role_ID_x,Job_Role_Level_x,...,Job_Location_y,Department,Org,Education_Level,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,Job_Role_Level,ManagedOthers,ManagedHowMany
0,0,100000290,Kevin,Torres,1996.0,2016-05-25,2016-09-20,Failure to Return from Leave,33434.0,2L,...,"Martinsville, VA",,Sales,Associate's,199,0.323066,0,2.0,0,0
1,565,100000452,Cassandra,Munoz,1991.0,2019-03-11,2023-06-17,,,,...,"Martinsville, VA",,Sales,Associate's,85,4.268309,1,,0,0
2,1130,100000452,Cassandra,Munoz,1991.0,2016-06-01,2019-04-01,Personal Reasons,15827.0,2L,...,"Martinsville, VA",,Sales,Associate's,85,2.830938,0,2.0,0,0
3,1695,100001056,Victoria,Mcknight,1994.0,2016-07-01,2016-10-23,Job Abandonment,21044.0,2L,...,"Martinsville, VA",,Sales,Associate's,10,0.312115,0,2.0,0,0
4,2260,100001637,Wendy,Forbes,1995.0,2016-07-11,2018-04-26,Personal Reasons,12552.0,2L,...,"Martinsville, VA",,Sales,Associate's,17,1.790554,0,2.0,0,0


In [None]:
df[['City', 'State']] = df['Job_Location_y'].str.split(', ', expand=True)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,Employee_ID,First_Name_x,Last_Name_x,YOB,Start_Time,End_Time,Change_Reason,Job_Role_ID_x,Job_Role_Level_x,...,Org,Education_Level,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,Job_Role_Level,ManagedOthers,ManagedHowMany,City,State
0,0,100000290,Kevin,Torres,1996.0,2016-05-25,2016-09-20,Failure to Return from Leave,33434.0,2L,...,Sales,Associate's,199,0.323066,0,2.0,0,0,Martinsville,VA
1,565,100000452,Cassandra,Munoz,1991.0,2019-03-11,2023-06-17,,,,...,Sales,Associate's,85,4.268309,1,,0,0,Martinsville,VA
2,1130,100000452,Cassandra,Munoz,1991.0,2016-06-01,2019-04-01,Personal Reasons,15827.0,2L,...,Sales,Associate's,85,2.830938,0,2.0,0,0,Martinsville,VA
3,1695,100001056,Victoria,Mcknight,1994.0,2016-07-01,2016-10-23,Job Abandonment,21044.0,2L,...,Sales,Associate's,10,0.312115,0,2.0,0,0,Martinsville,VA
4,2260,100001637,Wendy,Forbes,1995.0,2016-07-11,2018-04-26,Personal Reasons,12552.0,2L,...,Sales,Associate's,17,1.790554,0,2.0,0,0,Martinsville,VA


In [None]:
df.replace({"DegreeType":{"None":0,"High School":1, "Vocational":2, "Associate's":3, "Bachelor's":4, "Master's":5, "PhD":6}},
                 inplace=True)

In [None]:
my_user_set = df[['Employee_ID', 'City', 'State', 'DegreeType', 'WorkHistoryCount', 'TotalYearsExperience', 'CurrentlyEmployed', 'ManagedOthers', 'ManagedHowMany']]
my_user_set.head()

Unnamed: 0,Employee_ID,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany
0,100000290,Martinsville,VA,4,199,0.323066,0,0,0
1,100000452,Martinsville,VA,4,85,4.268309,1,0,0
2,100000452,Martinsville,VA,4,85,2.830938,0,0,0
3,100001056,Martinsville,VA,4,10,0.312115,0,0,0
4,100001637,Martinsville,VA,4,17,1.790554,0,0,0


In [None]:
np.random.seed(0)
my_user_set['Split'] = np.where(np.random.rand(len(df)) < 0.8, 'Train', 'Test')
my_user_set.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_user_set['Split'] = np.where(np.random.rand(len(df)) < 0.8, 'Train', 'Test')


Unnamed: 0,Employee_ID,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,Split
0,100000290,Martinsville,VA,4,199,0.323066,0,0,0,Train
1,100000452,Martinsville,VA,4,85,4.268309,1,0,0,Train
2,100000452,Martinsville,VA,4,85,2.830938,0,0,0,Train
3,100001056,Martinsville,VA,4,10,0.312115,0,0,0,Train
4,100001637,Martinsville,VA,4,17,1.790554,0,0,0,Train


In [None]:
np.random.seed(0)
my_user_set['Employee_ID'] = np.random.choice(dataset['UserID'], size=len(my_user_set))

my_user_set.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  my_user_set['Employee_ID'] = np.random.choice(dataset['UserID'], size=len(my_user_set))


Unnamed: 0,Employee_ID,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,Split
0,1396448,Martinsville,VA,4,199,0.323066,0,0,0,Train
1,850840,Martinsville,VA,4,85,4.268309,1,0,0,Train
2,831165,Martinsville,VA,4,85,2.830938,0,0,0,Train
3,905123,Martinsville,VA,4,10,0.312115,0,0,0,Train
4,432903,Martinsville,VA,4,17,1.790554,0,0,0,Train


In [None]:
my_user_set = my_user_set.rename(columns={'Employee_ID': 'UserID'})
my_user_set.head()

Unnamed: 0,UserID,City,State,DegreeType,WorkHistoryCount,TotalYearsExperience,CurrentlyEmployed,ManagedOthers,ManagedHowMany,Split
0,1396448,Martinsville,VA,4,199,0.323066,0,0,0,Train
1,850840,Martinsville,VA,4,85,4.268309,1,0,0,Train
2,831165,Martinsville,VA,4,85,2.830938,0,0,0,Train
3,905123,Martinsville,VA,4,10,0.312115,0,0,0,Train
4,432903,Martinsville,VA,4,17,1.790554,0,0,0,Train


In [None]:
user_ids = user_set['UserID'].unique()
weights = user_set['UserID'].value_counts().loc[user_ids].values
weights = weights / np.sum(weights) # normalize weights so they sum to 1

my_user_set['UserID'] = np.random.choice(user_ids, size=len(my_user_set), p=weights)

In [None]:
reduced_df = my_user_set.groupby('UserID').first()
reduced_df = reduced_df.reset_index()

In [None]:
train_user = reduced_df[reduced_df.Split=="Train"].UserID.values
test_user = reduced_df[reduced_df.Split=="Test"].UserID.values
train_data = dataset[dataset.UserID.isin(train_user)]
test_data = dataset[dataset.UserID.isin(test_user)]

In [None]:
user_set.shape

(18750, 10)

In [None]:
dataset.shape

(71734, 5)

In [None]:
groups = train_data.groupby("UserID")
X_train = np.zeros((1,158))
Y_train = []
for u_id, group in tqdm(groups):
    user = reduced_df[reduced_df.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed",
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_train = np.concatenate((X_train, feature), axis=0)
    Y_train.extend(group.label.values.tolist())

100%|██████████| 14794/14794 [05:37<00:00, 43.81it/s]


In [None]:
X_train.shape, len(Y_train)


((56677, 158), 56676)

In [None]:
groups = test_data.groupby("UserID")
X_test = np.zeros((1,158))
Y_test = []
for u_id, group in tqdm(groups):
    user = reduced_df[reduced_df.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed",
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]
    user_feature = np.concatenate((user.values, word_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values
    jobs = job_set[job_set.JobID.isin(job_id_list)]
    j_idx = jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_test = np.concatenate((X_test, feature), axis=0)
    Y_test.extend(group.label.values.tolist())

100%|██████████| 3705/3705 [00:43<00:00, 84.26it/s]


In [None]:
X_test.shape, len(Y_test)

((14065, 158), 14064)

In [None]:
reduced_df.to_csv("our_reduced_df.csv", index=False)

In [None]:
np.save("X_train.npy",X_train[1:,])
np.save("Y_train.npy",np.array(Y_train))
np.save("X_test.npy",X_test[1:,])
np.save("Y_test.npy",np.array(Y_test))

# 5. Construct models

In [None]:
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import accuracy_score, roc_auc_score, recall_score, precision_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
def show_result(y_true, y_prediction):
    report = classification_report(y_true,y_prediction,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    col_1.append("overall")
    col_2.append(precision_score(y_true, y_prediction))
    col_3.append(recall_score(y_true, y_prediction))
    col_4.append(f1_score(y_true, y_prediction))
    col_5.append(roc_auc_score(y_true, y_prediction))
    result = pd.DataFrame()
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    print(result)

In [None]:
X_train = np.load("X_train.npy")
X_test = np.load("X_test.npy")
Y_train = np.load("Y_train.npy")
Y_texs = np.load("Y_test.npy")

In [None]:
lr = LinearRegression()
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
y_pred = [0 if i<0.5 else 1 for i in y_pred]
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5112    0.5082    0.5097      7032
1             1    0.5110    0.5139    0.5125      7032
2      accuracy                        0.5111     14064
3     macro avg    0.5111    0.5111    0.5111     14064
4  weighted avg    0.5111    0.5111    0.5111     14064
5       overall  0.511029  0.513936  0.512479  0.511092


In [None]:
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5124    0.5107    0.5115      7032
1             1    0.5123    0.5141    0.5132      7032
2      accuracy                        0.5124     14064
3     macro avg    0.5124    0.5124    0.5124     14064
4  weighted avg    0.5124    0.5124    0.5124     14064
5       overall   0.51233  0.514078  0.513203  0.512372


In [None]:
nb = GaussianNB()
nb.fit(X_train, Y_train)
y_pred = nb.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5100    0.5118    0.5109      7032
1             1    0.5101    0.5082    0.5092      7032
2      accuracy                        0.5100     14064
3     macro avg    0.5100    0.5100    0.5100     14064
4  weighted avg    0.5100    0.5100    0.5100     14064
5       overall  0.510061  0.508248  0.509153  0.510026


In [None]:
dt = DecisionTreeClassifier()
dt.fit(X_train, Y_train)
y_pred = dt.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.5957    0.6035    0.5996      7032
1             1    0.5983    0.5904    0.5943      7032
2      accuracy                        0.5970     14064
3     macro avg    0.5970    0.5970    0.5970     14064
4  weighted avg    0.5970    0.5970    0.5970     14064
5       overall  0.598271  0.590444  0.594332  0.596985


In [None]:
rf = RandomForestClassifier()
rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.6189    0.6389    0.6287      7032
1             1    0.6268    0.6065    0.6165      7032
2      accuracy                        0.6227     14064
3     macro avg    0.6229    0.6227    0.6226     14064
4  weighted avg    0.6229    0.6227    0.6226     14064
5       overall  0.626837  0.606513  0.616508  0.622725
