In [46]:
import pandas as pd
import numpy as np
import os 
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder
import numpy as np
import joblib
from sklearn.ensemble import RandomForestClassifier

In [15]:
file_list = os.listdir("project-20-files")
dataframes = {}
for file_name in file_list:
    file_path = os.path.join("project-20-files", file_name) 
    df_name = os.path.splitext(file_name)[0]  
    dataframes[df_name] = pd.read_csv(file_path)

In [16]:
learn_dataframes = {key: df for key, df in dataframes.items() if "learn" in key.lower()}

merged_training = None

for name, df in learn_dataframes.items():
    if merged_training is None:
        # Use the first DataFrame as the base
        merged_training = df
    else:
        # Merge with the next DataFrame
        merged_training = pd.merge(merged_training, df, on="pkey", how="outer")

test_dataframes = {key: df for key, df in dataframes.items() if "test" in key.lower()}

merged_test = None

for name, df in test_dataframes.items():
    if merged_test is None:
        # Use the first DataFrame as the base
        merged_test = df
    else:
        # Merge with the next DataFrame
        merged_test = pd.merge(merged_test, df, on="pkey", how="outer")

In [17]:
# Filter dataframes that contain "city" in their key
city_dataframes = {key: df for key, df in dataframes.items() if "city" in key.lower()}

# Initialize the merged DataFrame
merged_city = None

# Loop through the filtered DataFrames and merge them on 'INSEE'
for name, df in city_dataframes.items():
    if merged_city is None:
        # Use the first DataFrame as the base
        merged_city = df
    else:
        # Merge with the next DataFrame on the 'INSEE' column
        merged_city = pd.merge(merged_city, df, on="INSEE", how="outer")

# Resulting merged DataFrame: merged_city


In [18]:
dep = pd.read_csv("project-20-files/departments.csv")
reg = pd.read_csv("project-20-files/regions.csv")

In [19]:
def preprocessing(data):
    data = pd.merge(data,merged_city,on = "INSEE", how = "inner")
    data = pd.merge(data,dep, on="dep")
    
    
    # not yet eligeable for retirement
    target_activity_types_y = ["TACT1-1", "TACT1-2", "TACT2-2", "TACT2-4", "TACT2-5"]

    # Define the specific columns for retired individuals
    retired_col = ["Previous_occupation_42", "previous_emp_type", "previous_dep", "retirement_pay"]

    # Apply logic to fill NaN values in 'retirement_age' only if activity_type is in target_activity_types_y
    data['retirement_age'] = data.apply(
    lambda row: 'not_relevant' if pd.isna(row['retirement_age']) and row['activity_type'] in target_activity_types_y 
        else row['retirement_age'],
        axis=1)

    # Select columns ending with '_y'
    columns_y = [col for col in data.columns if col.endswith('_y')]

    # Combine the lists of columns to include the retired columns as well
    all_target_columns = columns_y + retired_col
    
    # Loop through the combined list of columns (both '_y' and retired columns)
    for col in all_target_columns:
        # Check if the column exists in data
        if col in data.columns:
            # Check if the column is numerical
            if pd.api.types.is_numeric_dtype(data[col]):
                # For numerical columns, set 0 for matching activity types, keep existing values otherwise
                data[col] = data.apply(
                    lambda row: 0 if row["activity_type"] in target_activity_types_y and pd.isna(row[col]) else row[col],
                    axis=1
                )
            else:
                # For non-numerical columns, set 'not_relevant' for matching activity types
                data[col] = data.apply(
                    lambda row: "not_relevant" if row["activity_type"] in target_activity_types_y and pd.isna(row[col]) else row[col],
                    axis=1
                )
    
    
    
    ## There is a special group of 251 observations that are considered retired but that with most likelihood never worked.
    ## which very likely is people that has never worked due to some reason (old housewifes, institutionalized for instance)
    # this would explain lack of data regarding previous work
    
    
    csp_codes = ["csp_8_5", "csp_8_6"]
    
    # Function to apply the changes
    def fill_missing_values(row):
        # Check if Occupation_42 is one of the csp codes and the columns are NaN
        if row["Occupation_42"] in csp_codes:
            if pd.isna(row["Previous_occupation_42"]):
                row["Previous_occupation_42"] = "never worked"
            if pd.isna(row["previous_emp_type"]):
                row["previous_emp_type"] = "never worked"
            if pd.isna(row["retirement_age"]):
                row["retirement_age"] = "never worked"
        return row
    
    # Apply the function to the DataFrame
    data = data.apply(fill_missing_values, axis=1)
    
    
    
    
    ##
    ## Not working and will therefore not have any values for working related questions. 
    target_activity_types = ["TACT2-1", "TACT1-2","TACT2-2", "TACT2-4", "TACT2-5"]
    
    data.loc[data["activity_type"].isin(target_activity_types) & data["PAY"].isna(), "PAY"] = 0
    data.loc[data["activity_type"].isin(target_activity_types) & data["emp_type"].isna(), "emp_type"] = "not_relevant"
    
    # Select columns ending with '_x'
    columns_x = [col for col in data.columns if col.endswith('_x')]
    
    # Loop through the selected columns
    for col in columns_x:
        # Check if the column is numerical
        if pd.api.types.is_numeric_dtype(data[col]):
            # For numerical columns, set 0 for matching activity types, keep existing values otherwise
            data[col] = data.apply(
                lambda row: 0 if row["activity_type"] in target_activity_types and pd.isna(row[col]) else row[col],
                axis=1
            )
        else:
            # For non-numerical columns, set 'not_relevant' for matching activity types
            data[col] = data.apply(
                lambda row: "not_relevant" if row["activity_type"] in target_activity_types and pd.isna(row[col]) else row[col],
                axis=1
            )
   
    
    ## Sport people are either 
    data.loc[:, "SPORTS"] = data.loc[:, "SPORTS"].fillna("not_registered")


    ###
    data.loc[data['emp_type'].str.startswith('ec-2-1', na=False), 'employer_type_x'] = "ct_6"
    data.loc[data['emp_type'].str.startswith('ec-2-2', na=False), 'employer_type_x'] = "ct_9"
    data.loc[data['emp_type'].str.startswith('ec-2-3', na=False), 'employer_type_x'] = "ct_9"
    
    data.loc[data['previous_emp_type'].str.startswith('ec-2-1', na=False), 'employer_type_y'] = "ct_6"
    data.loc[data['previous_emp_type'].str.startswith('ec-2-2', na=False), 'employer_type_y'] = "ct_9"
    data.loc[data['previous_emp_type'].str.startswith('ec-2-3', na=False), 'employer_type_y'] = "ct_9"


    ###
    data.loc[data['previous_emp_type'].str.startswith('ec-2', na=False), 'work_condition_x'] = "N"
    data.loc[data['previous_emp_type'].str.startswith('ec-2', na=False), 'work_condition_y'] = "N"
    
    ###
    data.loc[data['work_desc_x'].str.startswith('231a', na=False), 'employee_count_x'] = "tr_6"
    data.loc[data['emp_type'].str.startswith('ec-2-1', na=False), 'employee_count_x'] = "tr_1"
    
    data.loc[data['work_desc_y'].str.startswith('231a', na=False), 'employee_count_y'] = "tr_6"
    data.loc[data['previous_emp_type'].str.startswith('ec-2-1', na=False), 'employee_count_y'] = "tr_1"
    
    ###
    data.loc[data['emp_type'].str.startswith('ec-1-6', na=False), 'Type_of_contract_x'] = "CDI"
    data.loc[data['emp_type'].str.startswith('ec-2', na=False), 'Type_of_contract_x'] = "No contract"
    
    data.loc[data['previous_emp_type'].str.startswith('ec-1-6', na=False), 'Type_of_contract_y'] = "CDI"
    data.loc[data['previous_emp_type'].str.startswith('ec-2', na=False), 'Type_of_contract_y'] = "No contract"


    ###
    data.loc[data['emp_type'].str.startswith('ec-2', na=False), 'JOB_CATEGORY_x'] = "not employee"
    data.loc[data['emp_type'].str.startswith('ec-1-6', na=False), 'JOB_CATEGORY_x'] = "O"
    
    data.loc[data['previous_emp_type'].str.startswith('ec-2', na=False), 'JOB_CATEGORY_y'] = "not employee"
    data.loc[data['previous_emp_type'].str.startswith('ec-1-6', na=False), 'JOB_CATEGORY_y'] = "O"

    ###
    data.loc[data['emp_type'].str.startswith('ec-2', na=False), 'PAY'] = 0

    data.loc[data['previous_emp_type'].str.startswith('ec-2', na=False), 'retirement_pay'] = 0


    ###
    data['working_but_nocon_income'] = data['emp_type'].apply(
        lambda x: 1 if str(x).startswith('ec-2') else 0)
        
    
    #priests
    data.loc[
        (data['Occupation_42'] == 'csp_4_4') & data['PAY'].isna(), 
        ['PAY', 'working_but_nocon_income']
    ] = [0, 1]
    
    data.loc[
        (data['Occupation_42'] == 'csp_4_4') & data['employer_type_x'].isna(), "employer_type_x"] = "ct_2" 
    
    #priests retired
    data.loc[
        (data['Previous_occupation_42'] == 'csp_4_4') & data['PAY'].isna(), 
        ['PAY', 'working_but_nocon_income']
    ] = [0, 1]
    
    data.loc[
        (data['Occupation_42'] == 'csp_4_4') & data['employer_type_x'].isna(), "employer_type_x"] = "ct_2" 
    
    
    data = data.drop(["JOB_CATEGORY_x", "JOB_CATEGORY_y"], axis=1)
    
    # Creating a new column 'fixed_contract' based on conditions applied to 'emp_type'
    def assign_fixed_contract(emp_type):
        if emp_type.startswith('ec-1') and emp_type != 'ec-1-6':
            return 'yes'
        elif emp_type == 'ec-1-6':
            return 'no contract'
        else:
            return 'no'
    
    data['fixed_contract'] = data['emp_type'].apply(assign_fixed_contract)
    
    return data 

In [20]:
merged_training = preprocessing(merged_training)

In [56]:
merged_training_sampled = merged_training.sample(frac=0.1, random_state=42)

In [58]:
# Copy the original dataset
df = merged_training_sampled.copy()
label_mapping = {'I': 0, 'N':1}
df['target'] = df['target'].map(label_mapping)
# Separate the target, features, and pkey
X = df.drop(columns=['target'])
y = df['target']

# Handle categorical features using Label Encoding
categorical_cols = X.select_dtypes(include=['object']).columns
encoder = LabelEncoder()
for col in categorical_cols:
    X[col] = encoder.fit_transform(X[col].astype(str))

# Split the dataset into training and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y  # Stratify based on the target variable
)


In [32]:
# Copy the original dataset
# Instantiate the XGBoost classifier
xgb_model = xgb.XGBClassifier(objective='binary:logistic', eval_metric='logloss')

# Hyperparameter grid for RandomizedSearchCV
param_dist = {
    'learning_rate': np.linspace(0.01, 0.3, 30),  # Keep this as is, a larger learning rate often helps explore the parameter space better
    'max_depth': [3, 5, 6, 7, 10, 12],  # Increase the depth range slightly to allow for more complex models
    'n_estimators': [200, 500, 1000, 1500, 2000],  # Increase n_estimators to allow for more boosting rounds
    'subsample': [0.7, 0.8, 0.9, 1.0],  # No changes, subsample is important for preventing overfitting
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],  # No changes, colsample_bytree is also important for preventing overfitting
    'gamma': [0, 0.1, 0.5, 1, 2, 5],  # Increased the range for gamma, which can help regularize and reduce overfitting
    'min_child_weight': [1, 3, 5, 7, 10],  # Expanded the range to allow more fine-tuned choices
    'scale_pos_weight': [1, 2, 5],  # This is useful if you have class imbalance
    'max_delta_step': [0, 1, 5]  # This is useful to stabilize training when there's class imbalance or other issues
}

# Randomized search over the parameter grid
random_search = RandomizedSearchCV(
    estimator=xgb_model, param_distributions=param_dist,
    n_iter=100, cv=3, verbose=1, n_jobs=-1, scoring='roc_auc'
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best model from RandomizedSearchCV
best_random_model = random_search.best_estimator_

y_pred = best_random_model.predict(X_test)  # Predicted labels

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1 Score: {f1 * 100:.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Accuracy: 88.39%
Precision: 78.56%
Recall: 87.64%
F1 Score: 82.85%

Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.89      0.91      6806
           1       0.79      0.88      0.83      3203

    accuracy                           0.88     10009
   macro avg       0.86      0.88      0.87     10009
weighted avg       0.89      0.88      0.89     10009



In [40]:
joblib.dump(best_random_model, 'best_xgb_model.pkl')

['best_xgb_model.pkl']

In [63]:
# Define the hyperparameter grid for RandomizedSearchCV
param_dist = {
    'n_estimators': [100, 200, 500, 1000],  # Number of trees
    'max_depth': [1, 10, 20, 30, 40],  # Depth of the tree
    'min_samples_split': [2, 5, 10, 20],  # Minimum samples required to split a node
    'min_samples_leaf': [1, 2, 4, 10],  # Minimum samples required to be at a leaf node
    'bootstrap': [True, False],  # Whether bootstrap samples are used when building trees
}

# Instantiate the RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=rf_model,
    param_distributions=param_dist,
    n_iter=100,  # Number of random combinations to try
    cv=3,  # Number of cross-validation folds
    verbose=2,  # Display the progress
    random_state=42,
    n_jobs=-1,  # Use all available cores
    scoring='roc_auc'  # Evaluate using ROC AUC
)

# Fit the RandomizedSearchCV on the training data
random_search.fit(X_train, y_train)

# Get the best model from the random search
best_rf_model = random_search.best_estimator_

# Make predictions using the best model
y_pred_rf = best_rf_model.predict(X_test)

# Calculate and print metrics
accuracy = accuracy_score(y_test, y_pred_rf)
precision = precision_score(y_test, y_pred_rf)
recall = recall_score(y_test, y_pred_rf)
f1 = f1_score(y_test, y_pred_rf)

print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'Precision: {precision * 100:.2f}%')
print(f'Recall: {recall * 100:.2f}%')
print(f'F1 Score: {f1 * 100:.2f}%')

print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))

# Print the best hyperparameters
print("Best Hyperparameters from RandomizedSearchCV:")
print(random_search.best_params_)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Accuracy: 79.32%
Precision: 75.36%
Recall: 50.64%
F1 Score: 60.57%

Classification Report:
              precision    recall  f1-score   support

           0       0.80      0.92      0.86       687
           1       0.75      0.51      0.61       314

    accuracy                           0.79      1001
   macro avg       0.78      0.72      0.73      1001
weighted avg       0.79      0.79      0.78      1001

Best Hyperparameters from RandomizedSearchCV:
{'n_estimators': 500, 'min_samples_split': 5, 'min_samples_leaf': 2, 'max_depth': 40, 'bootstrap': False}


In [69]:
joblib.dump(best_rf_model, 'best_rf_model.pkl')

['best_rf_model.pkl']

In [42]:
def xg_predict(data):
    df = data.copy()

    # Handle categorical features using Label Encoding
    categorical_cols = df.select_dtypes(include=['object']).columns
    encoder = LabelEncoder()
    for col in categorical_cols:
        df[col] = encoder.fit_transform(df[col].astype(str))
    # Predict numeric target using the trained model
    df["target"] = best_random_model.predict(df)
    
    # Define label mapping
    label_mapping = {0: 'I', 1: 'N'}
    
    # Map numeric predictions to categorical labels
    df["target"] = df["target"].map(label_mapping)

    df = df[["pkey","target"]]
    
    return df


In [24]:
# save data
xg_predict(merged_test).to_csv('predictions.csv', index=False, sep=',', decimal='.')


ValueError: feature_names mismatch: ['pkey', 'Occupation_42', 'Is_student', 'activity_type', 'INSEE', 'AGE_2018', 'DEGREE', 'SEX', 'household_type', 'emp_type', 'Type_of_contract_x', 'PAY', 'job_dep_x', 'employer_type_x', 'employee_count_x', 'work_condition_x', 'ECO_SECT_x', 'working_hours_x', 'work_desc_x', 'Previous_occupation_42', 'previous_emp_type', 'retirement_age', 'ECO_SECT_y', 'previous_dep', 'job_dep_y', 'work_desc_y', 'Type_of_contract_y', 'work_condition_y', 'employer_type_y', 'working_hours_y', 'employee_count_y', 'retirement_pay', 'SPORTS', 'Nom de la commune', 'Town_type', 'dep', 'X', 'Y', 'LAT', 'LONG', 'Inhabitants', 'Nom du département', 'REG', 'working_but_nocon_income', 'fixed_contract'] ['pkey', 'Occupation_42', 'Is_student', 'activity_type', 'INSEE', 'AGE_2018', 'DEGREE', 'SEX', 'household_type', 'emp_type', 'Type_of_contract_x', 'PAY', 'job_dep_x', 'employer_type_x', 'employee_count_x', 'work_condition_x', 'ECO_SECT_x', 'JOB_CATEGORY_x', 'working_hours_x', 'work_desc_x', 'Previous_occupation_42', 'previous_emp_type', 'retirement_age', 'ECO_SECT_y', 'previous_dep', 'job_dep_y', 'work_desc_y', 'Type_of_contract_y', 'JOB_CATEGORY_y', 'work_condition_y', 'employer_type_y', 'working_hours_y', 'employee_count_y', 'retirement_pay', 'SPORTS']
expected fixed_contract, LONG, dep, Nom de la commune, Town_type, REG, Inhabitants, Nom du département, Y, X, working_but_nocon_income, LAT in input data
training data did not have the following fields: JOB_CATEGORY_y, JOB_CATEGORY_x