In [29]:
import pandas as pd

# Load the training split from CSV.
file_path = 'Aug Train.csv'
train_df = pd.read_csv(file_path)

# Preview the first rows of the training data.
train_df.head()


Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,0.624,Male,No relevent experience,no_enrollment,High School,,5,,never,21,0
1,0.926,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,,>4,12,0
2,0.92,Male,Has relevent experience,no_enrollment,Graduate,STEM,>20,Public Sector,>4,26,0
3,0.624,Male,No relevent experience,Full time course,High School,,1,,never,30,1
4,0.92,Female,Has relevent experience,no_enrollment,Masters,STEM,>20,,>4,46,0


In [30]:
# Load the test split from CSV (the name file_path is reassigned here).
file_path = 'Aug Test.csv'
test_df = pd.read_csv(file_path)

# Preview the first rows of the test data.
test_df.head()

Unnamed: 0,city_development_index,gender,relevent_experience,enrolled_university,education_level,major_discipline,experience,company_type,last_new_job,training_hours,target
0,0.624,,Has relevent experience,Full time course,Graduate,Other,3,Pvt Ltd,1,134,0
1,0.92,Female,No relevent experience,no_enrollment,Graduate,STEM,5,Early Stage Startup,1,34,1
2,0.767,,Has relevent experience,Full time course,Graduate,STEM,10,Pvt Ltd,2,90,0
3,0.91,Male,No relevent experience,,High School,,10,,never,42,0
4,0.624,Male,Has relevent experience,Part time course,Graduate,STEM,3,Pvt Ltd,1,198,0


In [31]:
# Further metadata: column dtypes and non-null counts of the training data.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2100 entries, 0 to 2099
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   city_development_index  2100 non-null   float64
 1   gender                  1585 non-null   object 
 2   relevent_experience     2100 non-null   object 
 3   enrolled_university     2051 non-null   object 
 4   education_level         2049 non-null   object 
 5   major_discipline        1768 non-null   object 
 6   experience              2090 non-null   object 
 7   company_type            1415 non-null   object 
 8   last_new_job            2048 non-null   object 
 9   training_hours          2100 non-null   int64  
 10  target                  2100 non-null   int64  
dtypes: float64(1), int64(2), object(8)
memory usage: 180.6+ KB


In [32]:
# Statistical overview (summary statistics) of the training data.
train_df.describe()

Unnamed: 0,city_development_index,training_hours,target
count,2100.0,2100.0,2100.0
mean,0.826898,65.89619,0.254762
std,0.124464,58.432483,0.435831
min,0.448,1.0,0.0
25%,0.72925,24.0,0.0
50%,0.899,49.0,0.0
75%,0.92,89.25,1.0
max,0.949,336.0,1.0


In [33]:
# Tasks 1.1 and 1.2: convert the ordinal text columns to floats, applying the
# same mapping to the training and the test frame.
#   experience:   '>20' is encoded as 21, '<1' as 1
#   last_new_job: '>4' is encoded as 5, 'never' as 0
# NOTE(review): '<1' ends up with the same value as '1' — confirm this is intended.
for frame in (train_df, test_df):
    frame['experience'] = (
        frame['experience'].replace({'>20': '21', '<1': '1'}).astype(float)
    )
    frame['last_new_job'] = (
        frame['last_new_job'].replace({'>4': '5', 'never': '0'}).astype(float)
    )

# Task 1.3: impute missing values. The fill values are always derived from the
# training data and then applied to both frames.

# Categorical (object-dtype) columns: fill with the most frequent training value.
categorical_cols = train_df.select_dtypes(include=['object']).columns
for col in categorical_cols:
    mode = train_df[col].mode()[0]
    for frame in (train_df, test_df):
        frame[col] = frame[col].fillna(mode)

# Numeric columns: fill with the training median.
numerical_cols = train_df.select_dtypes(include=['number']).columns
for col in numerical_cols:
    median = train_df[col].median()
    for frame in (train_df, test_df):
        frame[col] = frame[col].fillna(median)


In [34]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from imblearn.over_sampling import SMOTE

# Separate the features from the target variable.
X_train = train_df.drop(columns=['target'])
y_train = train_df['target']
X_test = test_df.drop(columns=['target'])
y_test = test_df['target']

# One-hot encode the categorical (object-dtype) columns. The encoder is fitted
# on the training features only and then reused to transform the test features.
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_train_encoded = encoder.fit_transform(X_train.select_dtypes(include=['object']))
X_test_encoded = encoder.transform(X_test.select_dtypes(include=['object']))

# Keep the numeric columns as they are.
X_train_numeric = X_train.select_dtypes(include=['number']).to_numpy()
X_test_numeric = X_test.select_dtypes(include=['number']).to_numpy()

# Combine numeric and encoded categorical features into one matrix per split.
X_train_final = np.hstack((X_train_numeric, X_train_encoded))
X_test_final = np.hstack((X_test_numeric, X_test_encoded))

# SMOTE oversampling is kept only as an experiment and is off by default.
# Author's observation: with SMOTE the model performs better on the training
# data but worse on the test data.
# Note that when enabled, X_train_final / y_train are replaced by the resampled
# data, so the training metrics below are computed on the resampled set.
USE_SMOTE = False
if USE_SMOTE:
    smote = SMOTE(random_state=42)
    X_train_final, y_train = smote.fit_resample(X_train_final, y_train)

# Build and fit the model (random forest). The pipeline is run exactly once;
# random_state is fixed so that a re-run reproduces the same model.
model = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=10)
model.fit(X_train_final, y_train)

# Predictions on the training data.
y_train_pred = model.predict(X_train_final)

# Evaluate the model on the training data.
train_conf_matrix = confusion_matrix(y_train, y_train_pred)
train_accuracy = accuracy_score(y_train, y_train_pred)
train_precision = precision_score(y_train, y_train_pred)
train_recall = recall_score(y_train, y_train_pred)
train_f1 = f1_score(y_train, y_train_pred)

train_results = {
    'Confusion Matrix': train_conf_matrix,
    'Accuracy': train_accuracy,
    'Precision': train_precision,
    'Recall': train_recall,
    'F1 Score': train_f1
}

# Last expression of the cell, so the notebook displays it.
train_results

# Results with SMOTE applied:
# {'Confusion Matrix': array([[1408,  157],
#         [ 198, 1367]]),
#  'Accuracy': 0.8865814696485623,
#  'Precision': np.float64(0.8969816272965879),
#  'Recall': np.float64(0.873482428115016),
#  'F1 Score': np.float64(0.8850760764001295)}

{'Confusion Matrix': array([[1408,  157],
        [ 198, 1367]]),
 'Accuracy': 0.8865814696485623,
 'Precision': np.float64(0.8969816272965879),
 'Recall': np.float64(0.873482428115016),
 'F1 Score': np.float64(0.8850760764001295)}

In [35]:
# Predict on the held-out test features with the fitted model.
y_test_pred = model.predict(X_test_final)

# Evaluate the model on the test data: the confusion matrix first, then the
# four scalar scores in a fixed order.
test_conf_matrix = confusion_matrix(y_test, y_test_pred)
test_accuracy, test_precision, test_recall, test_f1 = (
    metric(y_test, y_test_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Collect everything under the same labels used for the training results.
test_results = dict(zip(
    ('Confusion Matrix', 'Accuracy', 'Precision', 'Recall', 'F1 Score'),
    (test_conf_matrix, test_accuracy, test_precision, test_recall, test_f1),
))

test_results

# Results with SMOTE applied:
# {'Confusion Matrix': array([[67, 11],
#         [13,  9]]),
#  'Accuracy': 0.76,
#  'Precision': np.float64(0.45),
#  'Recall': np.float64(0.4090909090909091),
#  'F1 Score': np.float64(0.42857142857142855)}


{'Confusion Matrix': array([[67, 11],
        [13,  9]]),
 'Accuracy': 0.76,
 'Precision': np.float64(0.45),
 'Recall': np.float64(0.4090909090909091),
 'F1 Score': np.float64(0.42857142857142855)}

Vergleich der Ergebnisse:

Das Modell hat eine hohe Accuracy im Trainings- und im Test-Set (beide 80%-90%). Dies ist prinzipiell gut, allerdings irreführend, da unausgewogene Daten vorliegen.

Das Modell hat eine gute Precision in den Trainingsdaten (84,8%) und eine etwas weniger gute Precision in den Testdaten (70%). Anscheinend ist der Anteil der False Positives an den positiven Vorhersagen in den Testdaten höher als in den Trainingsdaten.

Das Modell hat recht unbefriedigende Recall-Werte in den Trainingsdaten (52%) und Testdaten (31%). Hierdurch entstehen viele False Negatives in beiden Datensätzen.

Das Modell hat ferner auch einen unbefriedigenden F1 Score in den Trainingsdaten (64,5%) und einen niedrigeren F1 Score in den Testdaten (44%). Dieser ist mit den schlechten Recall-Werten, besonders im Trainings-Set, zu erklären.

Das Modell ist verwendbar, besonders wegen seiner hohen Accuracy- und Precision-Werte im Trainings-Set, bei weniger guter Precision im Test-Set. Somit wäre es ausreichend nutzbar, um False Positives im Einsatz zu vermeiden. Es generiert jedoch signifikant viele False Negatives in beiden Datensätzen.

Interpretation:
Das Modell würde viele Mitarbeiter als nicht-suchend klassifizieren, obwohl Mitarbeiter tatsächlich nach einem Job suchen. Somit wäre dieses Modell nicht dafür geeignet, um zu bewerten, ob man Mitarbeiter halten kann oder nicht.

Verbesserungsvorschläge:

- Das Modell könnte durch Methoden wie SMOTE oder ADASYN verbessert werden, um die Daten auszugleichen

- Durch Feature Engineering könnten neue Features erstellt oder in Verbindung gebracht werden, um die Vorhersage zu verbessern.

- Hyperparameter des Modells könnten in Experimenten optimiert werden, um die Vorhersage zu verbessern

- Durch Cross-Validation könnte man die Generalisierung des Modells steigern, indem das Modell iterativ auf verschiedenen Teilen des Datensatzes trainiert wird.
