# Test Data Labelling

### Import libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

### Load your Excel data into a DataFrame

In [2]:
# Path of the Excel workbook holding the raw test candidates.
file_path = './test_data.xlsx'
df_test = pd.read_excel(file_path)

# Display the initial structure of the DataFrame.
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
print("Initial DataFrame:")
df_test.info()

# Preview the first rows only; df_test is already a DataFrame, so no re-wrap.
df_test.head(10)


Initial DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 21 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 24 non-null     int64  
 1   Nom                                24 non-null     object 
 2   Prénom                             24 non-null     object 
 3   Fonction                           24 non-null     object 
 4   Gender                             24 non-null     object 
 5   Domaine                            24 non-null     object 
 6   Niveau                             24 non-null     object 
 7   ColonneNiveau                      24 non-null     int64  
 8   Niveau d'experience en conception  24 non-null     object 
 9   ColonneExperience                  24 non-null     int64  
 10  Localisation                       24 non-null     object 
 11  Salaire Actuel                     0 non-

Unnamed: 0,ID,Nom,Prénom,Fonction,Gender,Domaine,Niveau,ColonneNiveau,Niveau d'experience en conception,ColonneExperience,...,Salaire Actuel,Prétention,Préavis,Commentaire,TJM,target,Source,Url,Colonne1,ID2
0,1,SADAK,Marouane,ingénieur en mécanique : spécialité aéronauti...,M,Ingénieur Mécanique,Bac+6,6,5 ans,5,...,,,,,,,Linkedin,https://ma.linkedin.com/in/marouane-sadak-3059...,marouane-sadak-3059941b4,
1,2,ZAKARIA,Mohammed,ingénieur en mécanique,M,Ingénieur Mécanique,Bac+6,6,5 ans,5,...,,,,,,,Linkedin,https://www.linkedin.com/in/zakaria-mohammed-5...,,
2,3,ZALIM,Abderrahim,Ingénieur d’état en mécanique | Procédés indus...,M,Ingénieur Mécanique,Bac+5,5,3 ans,3,...,,,,,,,Linkedin,https://www.linkedin.com/in/abderrahim-zalim-2...,,
3,4,ASLI,Othmane,Ingénieur Industriel,M,Ingénieur Industriel,Bac+5,5,2 ans,2,...,,,,,,,Linkedin,https://www.linkedin.com/in/othmane-asli-a8a3a...,,
4,5,LOUGNIDI,Ammar,"Ingénieur d'état génie industriel productique,...",M,Ingénieur Industriel,Bac+8,8,5 ans,5,...,,,,,,,Linkedin,https://www.linkedin.com/in/ammar-lougnidi-a77...,,
5,6,SEKKAT,Omar,Ingénieur Industriel,M,Ingénieur Industriel,Bac+5,5,12 ans,12,...,,,,,,,Linkedin,https://www.linkedin.com/in/omar-sekkat-0177664a/,,
6,7,LAKHAL,Youssef,Ingénieur process et méthodes chez Siemens Gamesa,M,Ingénieur Process,Bac+5,5,6 ans,6,...,,,,,,,Linkedin,https://www.linkedin.com/in/youssef-lakhal-aa3...,,
7,8,BOUNOUAR,Yassine,Responsable Process et Méthodes chez FUJIKURA ...,M,Ingénieur Process,Bac+2,2,20 ans,20,...,,,,,,,Linkedin,https://www.linkedin.com/in/yassine-bounouar-5...,,
8,9,LIMANI,Said,Ingénieur Process,M,Ingénieur Process,Bac+5,5,8 ans,8,...,,,,,,,Linkedin,https://www.linkedin.com/in/saidlimani/,,
9,10,BAKKALI,Ayoub,Ingénieur qualité projet,M,Ingénieur Qualité,Bac+5,5,10 ans,10,...,,,,,,,Linkedin,https://ma.linkedin.com/in/ayoub-bakkali-3a180...,,


### Apply the scaling

In [3]:
# Numeric columns that are rescaled to the [0, 1] range.
columns_to_normalize = ['ColonneExperience', 'ColonneNiveau']

# NOTE(review): the scaler is fit on the test data itself. If the model was
# trained on features scaled with a scaler fit on the training set, the two
# scalings will not agree -- TODO confirm and reuse the training scaler.
scaler = MinMaxScaler()

# Reuse df_test's own index so the concat below aligns row-for-row even when
# the frame does not carry a default RangeIndex.
df_test_normalized = pd.DataFrame(
    scaler.fit_transform(df_test[columns_to_normalize]),
    columns=[f'Normalized_{col}' for col in columns_to_normalize],
    index=df_test.index,
)

# Drop Normalized_* columns left over from a previous run of this cell so that
# re-running it does not append duplicate columns.
df_test = pd.concat(
    [
        df_test.drop(columns=df_test_normalized.columns, errors='ignore'),
        df_test_normalized,
    ],
    axis=1,
)

# Display the DataFrame with normalized columns.
# info() prints on its own and returns None, so it is not wrapped in print().
print("\nDataFrame with Normalized Columns:")
df_test.info()
print(df_test.head())


DataFrame with Normalized Columns:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 23 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 24 non-null     int64  
 1   Nom                                24 non-null     object 
 2   Prénom                             24 non-null     object 
 3   Fonction                           24 non-null     object 
 4   Gender                             24 non-null     object 
 5   Domaine                            24 non-null     object 
 6   Niveau                             24 non-null     object 
 7   ColonneNiveau                      24 non-null     int64  
 8   Niveau d'experience en conception  24 non-null     object 
 9   ColonneExperience                  24 non-null     int64  
 10  Localisation                       24 non-null     object 
 11  Salaire Actuel          

### labelling the data

In [4]:
# Define weights for each normalized column (customize based on your requirements)
weights = {'ColonneExperience': 0.6, 'ColonneNiveau': 0.4}

# Weighted score: each Normalized_<col> column multiplied by its weight, summed
# row-wise across the columns listed in `weights`.
df_test['Weighted_Score'] = sum(df_test[f'Normalized_{col}'] * weights[col] for col in weights)

# Fixed cut-off used to turn the score into a binary label (score >= threshold -> 1).
# NOTE(review): hard-coded value; the mean of Weighted_Score was considered as
# an alternative -- confirm 0.214 matches the threshold used for the training labels.
threshold = 0.214
df_test['Output'] = (df_test['Weighted_Score'] >= threshold).astype(int)

# Display the DataFrame with the added Weighted_Score and Output columns
print("\nDataFrame with Weighted Score:")
print(df_test.head())

# Save the results to a CSV file (un-rounded values are written)
output_file_path = './labelled_test_data_60_40.csv'  # Replace with your desired output file path
df_test.to_csv(output_file_path, index=False)
print(f"Results saved to: {output_file_path}")


DataFrame with Weighted Score:
   ID       Nom      Prénom   
0   1     SADAK    Marouane  \
1   2   ZAKARIA    Mohammed   
2   3     ZALIM  Abderrahim   
3   4      ASLI     Othmane   
4   5  LOUGNIDI       Ammar   

                                            Fonction Gender   
0   ingénieur en mécanique : spécialité aéronauti...      M  \
1                             ingénieur en mécanique      M   
2  Ingénieur d’état en mécanique | Procédés indus...      M   
3                               Ingénieur Industriel      M   
4  Ingénieur d'état génie industriel productique,...      M   

                Domaine Niveau  ColonneNiveau   
0   Ingénieur Mécanique  Bac+6              6  \
1   Ingénieur Mécanique  Bac+6              6   
2   Ingénieur Mécanique  Bac+5              5   
3  Ingénieur Industriel  Bac+5              5   
4  Ingénieur Industriel  Bac+8              8   

  Niveau d'experience en conception  ColonneExperience  ... TJM  target   
0                             5 

# Test Data cleaning

### Importing the libraries

In [5]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

### loading the data

In [6]:
# Read the labelled test set from the CSV file.
data = pd.read_csv('labelled_test_data_60_40.csv')
# DataFrame.info() prints its report itself and returns None; wrapping it in
# print() would add a stray "None" line to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 25 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   ID                                 24 non-null     int64  
 1   Nom                                24 non-null     object 
 2   Prénom                             24 non-null     object 
 3   Fonction                           24 non-null     object 
 4   Gender                             24 non-null     object 
 5   Domaine                            24 non-null     object 
 6   Niveau                             24 non-null     object 
 7   ColonneNiveau                      24 non-null     int64  
 8   Niveau d'experience en conception  24 non-null     object 
 9   ColonneExperience                  24 non-null     int64  
 10  Localisation                       24 non-null     object 
 11  Salaire Actuel                     0 non-null      float64
 

### Dropping unnecessary columns

In [7]:

# Columns removed before modelling; only Gender, Domaine, the two Normalized_*
# columns, Weighted_Score and Output remain afterwards.
columns_to_drop = [
    'ID', 'Nom', 'Prénom', 'Fonction', 'Niveau', 'ColonneNiveau',
    "Niveau d'experience en conception", 'ColonneExperience',
    'Localisation', 'Salaire Actuel', 'Prétention', 'Préavis',
    'Commentaire', 'TJM', 'target', 'Source', 'Url', 'Colonne1',
    'ID2',
]

# drop() returns a new frame, so `data` itself is left untouched.
df = data.drop(columns=columns_to_drop)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Gender                        24 non-null     object 
 1   Domaine                       24 non-null     object 
 2   Normalized_ColonneExperience  24 non-null     float64
 3   Normalized_ColonneNiveau      24 non-null     float64
 4   Weighted_Score                24 non-null     float64
 5   Output                        24 non-null     int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 1.3+ KB


### Displaying how many persons per domain

In [8]:
# Number of rows for each distinct value of the 'Domaine' column.
domain_counts = df['Domaine'].value_counts()
print(domain_counts)

# Total number of rows in the column.
c = len(df['Domaine'])
print(f'\nlength domain : {c}')

Domaine
Ingénieur Mécanique        3
Ingénieur Industriel       3
Ingénieur Process          3
Ingénieur Qualité          3
Concepteur/ Dessinateur    3
Chargé de Développement    3
Logistique                 3
Technicien spécialisé      3
Name: count, dtype: int64

length domain : 24


### Rename columns

In [9]:
# Give the feature columns their short names in one call.
df.rename(
    columns={
        'Domaine': 'Domain',
        'Normalized_ColonneExperience': 'Experience',
        'Normalized_ColonneNiveau': 'Niveau',
    },
    inplace=True,
)

# Re-spell one domain label in lower case.
# NOTE(review): presumably aligns this label with the spelling used in the
# training data -- confirm, since it also affects the label-encoding order.
df['Domain'] = df['Domain'].replace({'Ingénieur Industriel': 'ingénieur industriel'})

### Display missing values

In [10]:
# Count the null entries in every column and print the per-column totals.
missing_values = df.isna().sum()
print("\nMissing Values:", missing_values, sep="\n")



Missing Values:
Gender            0
Domain            0
Experience        0
Niveau            0
Weighted_Score    0
Output            0
dtype: int64


### Encode categorical columns

In [11]:
# NOTE(review): both encoders below are fit on the test data, so the integer
# codes depend on which categories appear in this file. Confirm they match the
# encoding that was used when the model was trained.

# Replace the Domain labels with integer codes.
label_encoder = LabelEncoder()
df['Domain'] = label_encoder.fit_transform(df['Domain'])
print("\nAfter encoding Domain X:")
print(df['Domain'])

# Replace the Gender labels with integer codes.
# The original note states male = 1 and female = 0 -- verify against the
# raw Gender values.
label_encoder = LabelEncoder()
df['Gender'] = label_encoder.fit_transform(df['Gender'])
print("\nAfter encoding Gender X:")
print(df['Gender'])

# Preview the encoded frame. The former `df.round(2)` call was dropped because
# its result was discarded; rounding is applied when X_data_test is built.
print("\n:")
print(df.head())
# info() prints on its own and returns None, so it is not wrapped in print().
df.info()


After encoding Domain X:
0     2
1     2
2     2
3     7
4     7
5     7
6     3
7     3
8     3
9     4
10    4
11    4
12    1
13    1
14    1
15    0
16    0
17    0
18    5
19    5
20    5
21    6
22    6
23    6
Name: Domain, dtype: int32

After encoding Gender X:
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    0
11    0
12    1
13    1
14    1
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    1
Name: Gender, dtype: int32

:
   Gender  Domain  Experience    Niveau  Weighted_Score  Output
0       1       2    0.210526  0.666667        0.392982       1
1       1       2    0.210526  0.666667        0.392982       1
2       1       2    0.105263  0.500000        0.263158       1
3       1       7    0.052632  0.500000        0.231579       1
4       1       7    0.210526  1.000000        0.526316       1
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24 entries, 0 to 23
Data columns (total 6 columns):
 #   Column          

In [12]:
# Feature columns, in the order in which they are handed to the model.
training_columns = ['Gender', 'Domain', 'Experience', 'Niveau']

# Features: everything except the label and the score it was derived from,
# rounded to two decimals and arranged in `training_columns` order.
X_data_test = df.drop(columns=['Output', 'Weighted_Score']).round(2)[training_columns]

# Target: the binary label column.
y_data_test = df['Output']

print("\nX:")
print(X_data_test.head(5))
print("\ny:")
print(y_data_test.head(50))
print(f'\nX shape : {X_data_test.shape}')
print(f'y shape : {y_data_test.shape}')


X:
   Gender  Domain  Experience  Niveau
0       1       2        0.21    0.67
1       1       2        0.21    0.67
2       1       2        0.11    0.50
3       1       7        0.05    0.50
4       1       7        0.21    1.00

y:
0     1
1     1
2     1
3     1
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
12    0
13    1
14    0
15    1
16    1
17    1
18    1
19    1
20    1
21    1
22    1
23    0
Name: Output, dtype: int64

X shape : (24, 4)
y shape : (24,)


# Test Data modelling 

Using the SVM model because, after the comparison, it seems to be the best-performing one on our test set.

In [17]:
## Importing the libraries

from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, auc,precision_score,recall_score
from sklearn.model_selection import cross_val_score
import joblib

In [14]:
# SECURITY NOTE: joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files that come from a trusted source.
model = joblib.load('model.pkl')

# Predict on the test set
y_test_pred = model.predict(X_data_test)

# ROC / AUC are ranking metrics: they should be computed from continuous
# scores, not from hard class labels (which collapse the curve to one point).
# Use the estimator's scores when it exposes them, else fall back to labels.
if hasattr(model, 'decision_function'):
    y_test_score = model.decision_function(X_data_test)
elif hasattr(model, 'predict_proba'):
    # Column 1 is the score of the second entry of model.classes_.
    y_test_score = model.predict_proba(X_data_test)[:, 1]
else:
    y_test_score = y_test_pred

# Evaluate the model's performance
accuracy_score_test = accuracy_score(y_data_test, y_test_pred, normalize=True)

fpr, tpr, thresholds = metrics.roc_curve(y_data_test, y_test_score, pos_label=1)
auc_test = metrics.auc(fpr, tpr)

roc_auc_test = roc_auc_score(y_data_test, y_test_score)

confusion_matrix_test = confusion_matrix(y_data_test, y_test_pred)

# average=None returns one value per class; the means are printed below.
f1_score_test = f1_score(y_data_test, y_test_pred, average=None)

gini_coefficient2 = 2 * roc_auc_test - 1
precision_test = precision_score(y_data_test, y_test_pred, average=None)
recall_test = recall_score(y_data_test, y_test_pred, average=None)

# Display the results
print('Model: ')

print("\n *** Prediction Performance:")
print(f'Accuracy: {accuracy_score_test:.4f}')
print(f'Auc: {auc_test:.4f}')
print(f'Roc: {roc_auc_test:.4f}')
print(f'f1_score: {f1_score_test}')
print(f'f1_score mean: {f1_score_test.mean()}')
print(f'precision_score: {precision_test}')
print(f'precision_score mean: {precision_test.mean()}')
print(f'recall_score: {recall_test}')
print(f'recall_score mean: {recall_test.mean()}')
print(f'Gini Coefficient: {gini_coefficient2:.4f}')
print(f'\nconfusion_matrix: {confusion_matrix_test}')

Model: 

 *** Prediction Performance:
Accuracy: 1.0000
Auc: 1.0000
Roc: 1.0000
f1_score: [1. 1.]
f1_score mean: 1.0
precision_score: [1. 1.]
precision_score mean: 1.0
recall_score: [1. 1.]
recall_score mean: 1.0
Gini Coefficient: 1.0000

confusion_matrix: [[ 3  0]
 [ 0 21]]


In [15]:
# Convert the true labels (a pandas Series) and the predictions to plain lists.
y_test_values = y_data_test.tolist()
y_pred_values = y_test_pred.tolist()

# Pair each true label with its prediction, one row per sample.
results_df = pd.DataFrame(
    list(zip(y_test_values, y_pred_values)),
    columns=['y_test', 'y_pred'],
)

print(f'Accuracy: {accuracy_score_test:.4f}')
# Show the side-by-side table
print(results_df)



Accuracy: 1.0000
    y_test  y_pred
0        1       1
1        1       1
2        1       1
3        1       1
4        1       1
5        1       1
6        1       1
7        1       1
8        1       1
9        1       1
10       1       1
11       1       1
12       0       0
13       1       1
14       0       0
15       1       1
16       1       1
17       1       1
18       1       1
19       1       1
20       1       1
21       1       1
22       1       1
23       0       0


In [20]:
# 5-fold cross-validated accuracy on the test set.
# NOTE(review): cross_val_score fits the estimator on each training fold, so
# these scores appear to describe a model re-fit on the test data rather than
# the loaded model as-is -- confirm this is the intended check.
cv_scores_test = cross_val_score(
    model,
    X_data_test,
    y_data_test,
    scoring='accuracy',
    cv=5,
)

# Per-fold scores followed by their average
print("Cross-Validation Scores test:", cv_scores_test)
print("Mean Accuracy test:", cv_scores_test.mean())

Cross-Validation Scores test: [1.  0.8 0.8 0.8 1. ]
Mean Accuracy test: 0.8800000000000001




In [None]:
import joblib
# joblib.dump(model, 'model.pkl')

['model.pkl']