In [1]:
!ls

[30m[43m__pycache__[m[m                            modelRecruitment.ipynb
data_performance_management_update.csv modelRecruitment.py
data_recruitment_selection.csv         [30m[43mpackages[m[m
modelPerformance.ipynb                 [34mpackages_performance[m[m
modelPerformance.py


In [1]:
import os
import pickle
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
import warnings

warnings.filterwarnings('ignore')

# Mengubah data menjadi DataFrame
df = pd.read_csv('data_performance_management_update.csv')
df.sample(10)

Unnamed: 0,EmployeeID,Name,ReviewPeriod,Rating,Comments
168,438,Amber Owens,Q2 2023,2.7,Good performance
287,663,Jason Ortiz,Q3 2023,1.3,Good performance
122,347,Natalie Sheppard,Q3 2023,1.5,Very good performance
465,1016,Danielle Rose,Q4 2023,3.6,Needs improvement
170,442,Connor Moreno,Q2 2023,4.7,Good performance
43,177,William Walker,Q1 2023,4.9,Very good performance
7,111,Anthony Riley,Q3 2023,3.0,Good performance
440,965,Yolanda Simpson,Q1 2023,3.7,Needs improvement
189,478,Larry Hawkins,Q2 2023,3.5,Very good performance
81,252,Dr. Daniel Gregory,Q4 2023,3.9,Excellent performance


In [2]:
df['Comments'].value_counts()

Comments
Good performance         150
Excellent performance    144
Needs improvement        134
Very good performance    128
Name: count, dtype: int64

In [3]:
# Since your dataset doesn't include a binary classification target (OfferStatus in the previous example), you'll need to define one.
#  For example, if you're predicting whether an employee's performance is "Good" or "Needs Improvement," you could create a binary target based on the Comments or Rating column.
# Create a binary target column where:
# Rating >= 3: 1 (Good/Excellent Performance)
# Rating < 3: 0 (Needs Improvement/Low Performance)

In [4]:
df['PerformanceStatus'] = df['Rating'].apply(lambda x: 1 if x >= 2 else 0)


# Modify 'PerformanceStatus' based on Comments (if comment is 'Needs improvement', set to 0)
df['PerformanceStatus'] = df.apply(lambda row: 0 if row['Comments'] == 'Needs improvement' else row['PerformanceStatus'], axis=1)


y = 'PerformanceStatus'

# Menghapus kolom yang tidak diperlukan (sisa )
# Drop irrelevant columns
df = df.drop(columns=['EmployeeID', 'Name'])

# Menyimpan kolom untuk encoding dan scaling
categorical_cols = ['ReviewPeriod', 'Comments']
numerical_cols = ['Rating']

# Preprocessing directory
pathPackages = os.path.join(os.getcwd(), "packages_performance")
os.makedirs(pathPackages, exist_ok=True)

In [5]:
# Check the results
df[['Rating', 'Comments', 'PerformanceStatus']].head()


Unnamed: 0,Rating,Comments,PerformanceStatus
0,2.6,Good performance,1
1,1.5,Good performance,0
2,4.2,Good performance,1
3,4.7,Needs improvement,0
4,4.7,Excellent performance,1


In [6]:
def prepOneHotEncoder(df, col, pathPackages):
    # Load the pre-trained OneHotEncoder from the saved file
    filename = os.path.join(pathPackages, 'prep' + col + '.pkl')
    with open(filename, 'rb') as file:
        oneHotEncoder = pickle.load(file)
    
    # Transform the data using the pre-trained encoder
    dfOneHotEncoder = pd.DataFrame(oneHotEncoder.transform(df[[col]]),
                                   columns=[col + "_" + str(i + 1) for i in range(len(oneHotEncoder.categories_[0]))])
    
    # Drop the original column and add the one-hot encoded columns
    df = pd.concat([df.drop(col, axis=1), dfOneHotEncoder], axis=1)
    return df

def prepStandardScaler(df, col, pathPackages):
    # Load the pre-trained StandardScaler from the saved file
    filename = os.path.join(pathPackages, 'prep' + col + '.pkl')
    with open(filename, 'rb') as file:
        scaler = pickle.load(file)
    
    # Apply the scaler to the column
    df[col] = scaler.transform(df[[col]])
    # print(f"Preprocessing data {col} has been saved...")
    return df


In [7]:
# Apply preprocessing
for col in categorical_cols:
    df = prepOneHotEncoder(df, col, pathPackages)

for col in numerical_cols:
    df = prepStandardScaler(df, col, pathPackages)

# Separate features and target
X = df.drop(columns=[y]).values
y = df[y].values

# Melatih model
model = LogisticRegression()
model.fit(X, y)

# Menyimpan model dengan path yang benar
model_path = os.path.join(pathPackages, 'modelPerformance.pkl')
with open(model_path, 'wb') as file:
    pickle.dump(model, file)

print(f"Model telah dilatih dan disimpan di: {model_path}")


# print("Preprocessed DataFrame:")
# print(df)

Preprocessing data Rating has been saved...
Model telah dilatih dan disimpan di: /Users/jeannetaoliviasantoso/final_project/machine_learning/model/packages_performance/modelPerformance.pkl


In [8]:
import os
import pickle
import pandas as pd

# Your sample dataset
data = {
    'EmployeeID': [1065, 328, 111, 1001, 675, 902, 774, 473, 540, 190],
    'Name': ['Elizabeth Hale', 'Ryan Romero', 'Anthony Riley', 'Michelle Perez', 
             'Stephen Davis', 'Heidi Rocha', 'Tina Bowen', 'Eric Carr', 
             'Elizabeth Bailey', 'Kristine Newman'],
    'ReviewPeriod': ['Q1 2023', 'Q4 2023', 'Q3 2023', 'Q1 2023', 'Q3 2023', 
                     'Q2 2023', 'Q2 2023', 'Q1 2023', 'Q4 2023', 'Q2 2023'],
    'Rating': [2.6, 4.9, 3.0, 1.1, 1.3, 1.7, 3.8, 3.0, 1.5, 2.7],
    'Comments': ['Good performance', 'Good performance', 'Good performance', 
                 'Very good performance', 'Very good performance', 
                 'Very good performance', 'Excellent performance', 
                 'Good performance', 'Needs improvement', 
                 'Excellent performance']
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Relevant columns
categorical_cols = ['ReviewPeriod', 'Comments']
numerical_cols = ['Rating']

# Serialize the column names for modeling
path_packages = os.path.join(os.getcwd(), "packages_performance")
os.makedirs(path_packages, exist_ok=True)  # Create the folder if it doesn't exist
columns_to_save = categorical_cols + numerical_cols
pickle.dump(columns_to_save, open(os.path.join(path_packages, 'columnModelling.pkl'), 'wb'))

print(f"Columns for modeling saved: {columns_to_save}")


Columns for modeling saved: ['ReviewPeriod', 'Comments', 'Rating']


### TESTER

In [9]:
def runModel(data,path):
    pathPackages = os.path.join(path, "packages_performance")
    col = pickle.load(open(os.path.join(pathPackages, 'columnModelling.pkl'), 'rb'))
    df = pd.DataFrame(data, index=[0])
    df = df[col]

    # Preprocessing categorical columns
    categorical_cols = ['ReviewPeriod', 'Comments']
    for col in categorical_cols:
        df = prepOneHotEncoder(df, col, pathPackages)

    # Preprocessing numerical columns
    numerical_cols = ['Rating']
    for col in numerical_cols:
        df = prepStandardScaler(df, col, pathPackages)

    X = df.values
    model = pickle.load(open(os.path.join(pathPackages, 'modelPerformance.pkl'), 'rb'))
    y = model.predict(X)[0]
    
    return "Good Performance" if y == 1 else "Low Performance"


In [12]:
# Example data for prediction (ensure this data matches the format used for training)
new_data = {
    'ReviewPeriod': 'Q1 2023',
    'Rating': 5,
    'Comments': 'Low Performance',
}

path = os.getcwd()  # Current working directory
prediction = runModel(new_data, path)  # Run the model with the new data
if prediction:
    print(f"Prediction: {prediction}")  # Print the prediction result


Preprocessing data Rating has been saved...
Prediction: Good Performance


In [13]:
# def runModel(data, path):
#     # Load the columns used during model training (including one-hot encoded columns)
#     pathPackages = os.path.join(path, "packages_performance")
#     col = pickle.load(open(os.path.join(pathPackages, 'columnModelling.pkl'), 'rb'))
    
#     # Convert the incoming data to a DataFrame
#     df = pd.DataFrame(data, index=[0])

#     # Preprocess categorical columns using one-hot encoding
#     categorical_cols = ['ReviewPeriod', 'Comments']
#     for cat_col in categorical_cols:
#         df = prepOneHotEncoder(df, cat_col, pathPackages)
    
#     # Preprocess numerical columns using scaling
#     numerical_cols = ['Rating']
#     for num_col in numerical_cols:
#         df = prepStandardScaler(df, num_col, pathPackages)
    
#     # Print columns after preprocessing to debug
#     print("Columns after preprocessing:", df.columns)
    
#     # Ensure the column order matches the training set
#     missing_cols = [col_name for col_name in col if col_name not in df.columns]
#     extra_cols = [col_name for col_name in df.columns if col_name not in col]
    
#     # Check if there are missing or extra columns
#     if missing_cols or extra_cols:
#         print(f"The following columns are missing: {missing_cols}")
#         print(f"The following columns are extra: {extra_cols}")
        
#         # Align columns: add missing ones with zeros or drop extra ones
#         for missing_col in missing_cols:
#             df[missing_col] = 0  # Add missing columns as zeros
#         df = df[col]  # Reorder the columns to match the training set
        
#         # Print the final column names
#         print("Final columns after aligning:", df.columns)

#     # Convert df to the expected format (values)
#     X = df.values

#     # Load the trained model
#     model = pickle.load(open(os.path.join(pathPackages, 'modelPerformance.pkl'), 'rb'))

#     # Predict the performance
#     y = model.predict(X)[0]
    
#     # Return the prediction result
#     return "Good Performance" if y == 1 else "Low Performance"

# if __name__ == "__main__":
#     # Example data for prediction (ensure this data matches the format used for training)
#     new_data = {
#         'ReviewPeriod': 'Q1 2023',
#         'Rating': 3,
#         'Comments': 'Low Performance',
#     }

#     path = os.getcwd()  # Current working directory
#     prediction = runModel(new_data, path)  # Run the model with the new data
#     if prediction:
#         print(f"Prediction: {prediction}")  # Print the prediction result


In [14]:
df = pd.read_csv('data_performance_management_update.csv')

# Rename columns to match the model's input format if necessary
# df.rename(columns={'ReviewPeriod_1': 'ReviewPeriod', 'Comments_1': 'Comments'}, inplace=True)

path = os.getcwd()

# Use apply to run the model for each row
df['Prediction'] = df.apply(lambda row: runModel(row.to_dict(), path), axis=1)

# Optionally, display the results
# print(df[['ReviewPeriod', 'Comments', 'Rating', 'Prediction']])

Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has been saved...
Preprocessing data Rating has be

In [15]:
df.Prediction.value_counts()


Prediction
Good Performance    341
Low Performance     215
Name: count, dtype: int64