In [13]:
import pandas as pd
# Load the dataset
train = pd.read_csv('/content/train_LZdllcl.csv')
test = pd.read_csv('/content/test_2umaH9m.csv')
sample_submission = pd.read_csv('/content/sample_submission_M0L0uXE.csv')
# Basic info
print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Train columns:\n", train.columns)
train.head()


Train shape: (54808, 14)
Test shape: (23490, 13)
Train columns:
 Index(['employee_id', 'department', 'region', 'education', 'gender',
       'recruitment_channel', 'no_of_trainings', 'age', 'previous_year_rating',
       'length_of_service', 'KPIs_met >80%', 'awards_won?',
       'avg_training_score', 'is_promoted'],
      dtype='object')


Unnamed: 0,employee_id,department,region,education,gender,recruitment_channel,no_of_trainings,age,previous_year_rating,length_of_service,KPIs_met >80%,awards_won?,avg_training_score,is_promoted
0,65438,Sales & Marketing,region_7,Master's & above,f,sourcing,1,35,5.0,8,1,0,49,0
1,65141,Operations,region_22,Bachelor's,m,other,1,30,5.0,4,0,0,60,0
2,7513,Sales & Marketing,region_19,Bachelor's,m,sourcing,1,34,3.0,7,0,0,50,0
3,2542,Sales & Marketing,region_23,Bachelor's,m,other,2,39,1.0,10,0,0,50,0
4,48945,Technology,region_26,Bachelor's,m,other,1,45,3.0,2,0,0,73,0


In [14]:
train.info()
train.isna().sum()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54808 entries, 0 to 54807
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   employee_id           54808 non-null  int64  
 1   department            54808 non-null  object 
 2   region                54808 non-null  object 
 3   education             52399 non-null  object 
 4   gender                54808 non-null  object 
 5   recruitment_channel   54808 non-null  object 
 6   no_of_trainings       54808 non-null  int64  
 7   age                   54808 non-null  int64  
 8   previous_year_rating  50684 non-null  float64
 9   length_of_service     54808 non-null  int64  
 10  KPIs_met >80%         54808 non-null  int64  
 11  awards_won?           54808 non-null  int64  
 12  avg_training_score    54808 non-null  int64  
 13  is_promoted           54808 non-null  int64  
dtypes: float64(1), int64(8), object(5)
memory usage: 5.9+ MB


Unnamed: 0,0
employee_id,0
department,0
region,0
education,2409
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,4124
length_of_service,0


In [15]:
# Fill missing education with mode
train['education'].fillna(train['education'].mode()[0], inplace=True)
test['education'].fillna(test['education'].mode()[0], inplace=True)

# Fill missing previous_year_rating with median
train['previous_year_rating'].fillna(train['previous_year_rating'].median(), inplace=True)
test['previous_year_rating'].fillna(test['previous_year_rating'].median(), inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  train['education'].fillna(train['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  test['education'].fillna(test['education'].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the interm

In [16]:
train.isna().sum()

Unnamed: 0,0
employee_id,0
department,0
region,0
education,0
gender,0
recruitment_channel,0
no_of_trainings,0
age,0
previous_year_rating,0
length_of_service,0


In [17]:
from sklearn.preprocessing import LabelEncoder

# Drop employee_id
train.drop('employee_id', axis=1, inplace=True)
test_ids = test['employee_id']
test.drop('employee_id', axis=1, inplace=True)

# Label encode categorical columns
cat_cols = ['department', 'region', 'education', 'gender', 'recruitment_channel']

le = LabelEncoder()
for col in cat_cols:
    train[col] = le.fit_transform(train[col])
    test[col] = le.transform(test[col])


In [18]:
from sklearn.model_selection import train_test_split

# Split features and target
X = train.drop('is_promoted', axis=1)
y = train['is_promoted']

# Split into train and validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Train shape:", X_train.shape)
print("Validation shape:", X_valid.shape)


Train shape: (43846, 12)
Validation shape: (10962, 12)


In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score

# Initialize and train model
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train, y_train)

# Predict on validation set
y_pred_lr = lr.predict(X_valid)

# Evaluate
print("Accuracy:", accuracy_score(y_valid, y_pred_lr))
print("F1 Score:", f1_score(y_valid, y_pred_lr))
print("Classification Report:\n", classification_report(y_valid, y_pred_lr))


Accuracy: 0.9177157453019522
F1 Score: 0.13766730401529637
Classification Report:
               precision    recall  f1-score   support

           0       0.92      1.00      0.96     10028
           1       0.64      0.08      0.14       934

    accuracy                           0.92     10962
   macro avg       0.78      0.54      0.55     10962
weighted avg       0.90      0.92      0.89     10962



In [20]:
from sklearn.ensemble import RandomForestClassifier

# Initialize and train Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predict
y_pred_rf = rf.predict(X_valid)

# Evaluate
print("Accuracy:", accuracy_score(y_valid, y_pred_rf))
print("F1 Score:", f1_score(y_valid, y_pred_rf))
print("Classification Report:\n", classification_report(y_valid, y_pred_rf))



Accuracy: 0.9340448823207443
F1 Score: 0.42482100238663484
Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.97     10028
           1       0.83      0.29      0.42       934

    accuracy                           0.93     10962
   macro avg       0.88      0.64      0.69     10962
weighted avg       0.93      0.93      0.92     10962



In [21]:
from xgboost import XGBClassifier

# Initialize and train XGBoost
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb.fit(X_train, y_train)

# Predict
y_pred_xgb = xgb.predict(X_valid)

# Evaluate
print("Accuracy:", accuracy_score(y_valid, y_pred_xgb))
print("F1 Score:", f1_score(y_valid, y_pred_xgb))
print("Classification Report:\n", classification_report(y_valid, y_pred_xgb))


Parameters: { "use_label_encoder" } are not used.



Accuracy: 0.9417989417989417
F1 Score: 0.5107361963190185
Classification Report:
               precision    recall  f1-score   support

           0       0.94      1.00      0.97     10028
           1       0.90      0.36      0.51       934

    accuracy                           0.94     10962
   macro avg       0.92      0.68      0.74     10962
weighted avg       0.94      0.94      0.93     10962



In [22]:
# Predict on the final test data
final_preds = xgb.predict(test)

# Prepare submission
submission = pd.DataFrame({
    'employee_id': test_ids,
    'is_promoted': final_preds
})

# Save to CSV
submission.to_csv('submission.csv', index=False)


In [23]:
from google.colab import files
files.download('submission.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>