In [None]:
Train data: preprocessing, modelling & tuning

In [5]:
import pandas as pd

# Read the training CSV file into a DataFrame
df = pd.read_csv('train_LZdllcl.csv')

# Display the first 5 rows as a left-aligned markdown table
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Show the column names, non-null counts and data types.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit an extra "None" line.
df.info()

| employee_id   | department        | region    | education        | gender   | recruitment_channel   | no_of_trainings   | age   | previous_year_rating   | length_of_service   | KPIs_met >80%   | awards_won?   | avg_training_score   | is_promoted   |
|:--------------|:------------------|:----------|:-----------------|:---------|:----------------------|:------------------|:------|:-----------------------|:--------------------|:----------------|:--------------|:---------------------|:--------------|
| 65438         | Sales & Marketing | region_7  | Master's & above | f        | sourcing              | 1                 | 35    | 5                      | 8                   | 1               | 0             | 49                   | 0             |
| 65141         | Operations        | region_22 | Bachelor's       | m        | other                 | 1                 | 30    | 5                      | 4                   | 0               | 0             | 60                   | 0       

In [6]:
# Print the number of missing values in each column
print('Number of missing values in each column:')
print(df.isnull().sum().to_markdown(numalign="left", stralign="left"))

# Print the number of fully duplicated rows
print(f'Number of duplicate rows: {df.duplicated().sum()}')

# Drop `employee_id` so it is not part of the feature set built from `df` later.
# errors='ignore' keeps this cell re-runnable: once the column is gone, a second
# execution would otherwise raise KeyError because `df` is rebound in place.
df = df.drop(columns=['employee_id'], errors='ignore')


Number of missing values in each column:
|                      | 0    |
|:---------------------|:-----|
| employee_id          | 0    |
| department           | 0    |
| region               | 0    |
| education            | 2409 |
| gender               | 0    |
| recruitment_channel  | 0    |
| no_of_trainings      | 0    |
| age                  | 0    |
| previous_year_rating | 4124 |
| length_of_service    | 0    |
| KPIs_met >80%        | 0    |
| awards_won?          | 0    |
| avg_training_score   | 0    |
| is_promoted          | 0    |
Number of duplicate rows: 0


In [7]:
# Impute missing values in `education` with the mode.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on `df['education']`: the in-place call on a column
# selection is chained assignment, which emits a FutureWarning in pandas 2.x
# and leaves `df` unchanged when Copy-on-Write is enabled.
df['education'] = df['education'].fillna(df['education'].mode()[0])

# Impute missing values in `previous_year_rating` with the median
df['previous_year_rating'] = df['previous_year_rating'].fillna(
    df['previous_year_rating'].median()
)

# Print the number of missing values in each column after imputation
print('Number of missing values in each column after imputation:')
print(df.isnull().sum().to_markdown(numalign="left", stralign="left"))

# Print value counts for every string-typed (object) column
print('\nValue counts for categorical columns:')
for column in df.select_dtypes(include='object').columns:
    print(f"\nValue counts for '{column}':")
    print(df[column].value_counts().to_markdown(numalign="left", stralign="left"))

Number of missing values in each column after imputation:
|                      | 0   |
|:---------------------|:----|
| department           | 0   |
| region               | 0   |
| education            | 0   |
| gender               | 0   |
| recruitment_channel  | 0   |
| no_of_trainings      | 0   |
| age                  | 0   |
| previous_year_rating | 0   |
| length_of_service    | 0   |
| KPIs_met >80%        | 0   |
| awards_won?          | 0   |
| avg_training_score   | 0   |
| is_promoted          | 0   |

Value counts for categorical columns:

Value counts for 'department':
| department        | count   |
|:------------------|:--------|
| Sales & Marketing | 16840   |
| Operations        | 11348   |
| Technology        | 7138    |
| Procurement       | 7138    |
| Analytics         | 5352    |
| Finance           | 2536    |
| HR                | 2418    |
| Legal             | 1039    |
| R&D               | 999     |

Value counts for 'region':
| region    | count   |
|:

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV

# Target label and feature matrix (everything except the label)
y = df['is_promoted']
X = df.drop(columns='is_promoted')

# Group the feature columns by dtype: object columns are treated as
# categorical, all remaining columns as numerical
numerical_features = X.select_dtypes(exclude='object').columns
categorical_features = X.select_dtypes(include='object').columns

# One transformer per column group
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Route each column group through its transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Stratified 80/20 split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Preprocessing followed by a Logistic Regression classifier
model_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', random_state=42)),
])

# Fit on the training split and predict the held-out split
model_pipeline.fit(X_train, y_train)
y_pred = model_pipeline.predict(X_test)


def _print_performance(title, y_true, y_hat):
    """Print a title, the accuracy and the classification report for predictions."""
    print(title)
    print(f'Accuracy: {accuracy_score(y_true, y_hat):.4f}')
    print('Classification Report:')
    print(classification_report(y_true, y_hat))


_print_performance('Initial Logistic Regression Model Performance:', y_test, y_pred)

# Grid search over regularisation strength and penalty type
print('\nFine-tuning Logistic Regression with GridSearchCV...')
param_grid = {
    'classifier__C': [0.1, 1.0, 10.0],
    'classifier__penalty': ['l1', 'l2'],
}

grid_search = GridSearchCV(
    estimator=model_pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print(f'Best parameters found: {grid_search.best_params_}')
print(f'Best cross-validation accuracy: {grid_search.best_score_:.4f}')

# Score the refitted best estimator on the held-out split
best_model = grid_search.best_estimator_
y_pred_tuned = best_model.predict(X_test)

_print_performance(
    '\nFine-tuned Logistic Regression Model Performance (on Test Set):',
    y_test,
    y_pred_tuned,
)

Initial Logistic Regression Model Performance:
Accuracy: 0.9324
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     10028
           1       0.82      0.27      0.40       934

    accuracy                           0.93     10962
   macro avg       0.88      0.63      0.68     10962
weighted avg       0.93      0.93      0.92     10962


Fine-tuning Logistic Regression with GridSearchCV...
Best parameters found: {'classifier__C': 10.0, 'classifier__penalty': 'l1'}
Best cross-validation accuracy: 0.9320

Fine-tuned Logistic Regression Model Performance (on Test Set):
Accuracy: 0.9330
Classification Report:
              precision    recall  f1-score   support

           0       0.94      0.99      0.96     10028
           1       0.81      0.28      0.41       934

    accuracy                           0.93     10962
   macro avg       0.87      0.64      0.69     10962
weighted avg       0.93      0.93      0

In [None]:
Test data: preprocessing, modelling & prediction

In [10]:
import pandas as pd

# Read the test CSV file.
# NOTE: this rebinds `df`, which previously held the frame loaded from
# 'train_LZdllcl.csv'; cells that expect the training data in `df` must be
# re-run from the training load onwards after executing this cell.
df = pd.read_csv('test_2umaH9m.csv')

# Display the first 5 rows as a left-aligned markdown table
print(df.head().to_markdown(index=False, numalign="left", stralign="left"))

# Show the column names, non-null counts and data types.
# DataFrame.info() prints its own summary and returns None, so it is called
# directly; wrapping it in print() would emit an extra "None" line.
df.info()

| employee_id   | department        | region    | education   | gender   | recruitment_channel   | no_of_trainings   | age   | previous_year_rating   | length_of_service   | KPIs_met >80%   | awards_won?   | avg_training_score   |
|:--------------|:------------------|:----------|:------------|:---------|:----------------------|:------------------|:------|:-----------------------|:--------------------|:----------------|:--------------|:---------------------|
| 8724          | Technology        | region_26 | Bachelor's  | m        | sourcing              | 1                 | 24    | nan                    | 1                   | 1               | 0             | 77                   |
| 74430         | HR                | region_4  | Bachelor's  | f        | other                 | 1                 | 31    | 3                      | 5                   | 0               | 0             | 51                   |
| 72255         | Sales & Marketing | region_13 | Bachelor's  | m        | o

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# --- 1. Load the datasets ---
# Ensure these CSV files are in the same directory as your script
# or provide the full path to them.
try:
    train_df = pd.read_csv('train_LZdllcl.csv')
    test_df = pd.read_csv('test_2umaH9m.csv')
except FileNotFoundError as err:
    # Re-raise with the guidance attached instead of printing and calling
    # exit(): exit() hides the failure behind a normal-looking termination in a
    # script and requests a kernel shutdown in a notebook, whereas an exception
    # stops the cell with a traceback and leaves the session usable.
    raise FileNotFoundError(
        "Error: Make sure 'train_LZdllcl.csv' and 'test_2umaH9m.csv' are in the correct directory. "
        "If not, please provide the full path to the files, e.g., pd.read_csv('/path/to/your/train_LZdllcl.csv')"
    ) from err

# Display initial information about the datasets
print("Train DataFrame Info:")
train_df.info()
print("\nTest DataFrame Info:")
test_df.info()

# --- 2. Separate target variable and drop unnecessary columns ---
# Target column taken from the training frame
y = train_df['is_promoted']

# Keep the test identifiers so they can be written next to the predictions
test_employee_ids = test_df['employee_id']

# Feature frames: the training frame loses both the target and the identifier,
# the test frame loses only the identifier
X = train_df.drop(columns=['is_promoted', 'employee_id'])
test_df = test_df.drop(columns=['employee_id'])

# --- 3. Identify Categorical and Numerical Features ---
# Column groups are derived from dtypes: int64/float64 vs. object
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = X.select_dtypes(include='object').columns

# --- 4. Preprocessing Pipelines ---
# Numerical branch: fill gaps with the column mean, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Apply each branch to its column group; columns in neither group are passed
# through unchanged
preprocessor = ColumnTransformer(
    [
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols),
    ],
    remainder='passthrough',
)

# --- 5. Model Training (RandomForestClassifier) ---
# Classifier with a fixed seed, chained after the preprocessor
model = RandomForestClassifier(random_state=42)
full_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', model),
])

# Stratified 80/20 split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

# Hyperparameter grid for GridSearchCV; deliberately small so the search
# finishes quickly
param_grid = {
    'classifier__n_estimators': [100, 200],  # trees in the forest
    'classifier__max_features': ['sqrt'],    # features considered per split
    'classifier__max_depth': [6, 8],         # maximum tree depth
    'classifier__criterion': ['gini'],       # split quality measure
}

print("\nStarting GridSearchCV for hyperparameter tuning...")
grid_search = GridSearchCV(
    full_pipeline,
    param_grid,
    scoring='accuracy',
    cv=3,        # 3-fold cross-validation
    n_jobs=-1,   # use all available CPU cores
    verbose=2,
)
grid_search.fit(X_train, y_train)

print(f"\nBest parameters found: {grid_search.best_params_}")
print(f"Best cross-validation accuracy: {grid_search.best_score_:.4f}")

# Pipeline refitted with the best parameter combination
best_model = grid_search.best_estimator_

# --- 6. Evaluation ---
# Score the best pipeline on the held-out validation split
print("\nEvaluating model on the validation set...")
y_pred_val = best_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred_val)
print(f"Validation Accuracy: {val_accuracy:.4f}")
print("Classification Report on Validation Set:")
print(classification_report(y_val, y_pred_val))

# --- 7. Make Predictions on the Test Set ---
print("\nMaking predictions on the test set...")
test_predictions = best_model.predict(test_df)

# --- 8. Create Submission File (if needed) ---
# Pair each saved employee id with its predicted label and write to CSV
submission_df = pd.DataFrame(
    {
        'employee_id': test_employee_ids,
        'is_promoted': test_predictions,
    }
)
submission_df.to_csv('submission.csv', index=False)

print("\nProcess completed. 'submission.csv' has been created with test predictions.")

Train DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: