In [None]:
# Employee Performance and Retention Analysis
# Submission Date : 2025-07-30
# submitted by : Kanishk Sorout

# Phase 1 - Data Collection and EDA
!pip install matplotlib
!pip install pandas
!pip install seaborn
!pip install tensorflow


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report, mean_squared_error, r2_score
from scipy import stats
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Step 1 - Data Collection and Preprocessing
# Read the employee CSV, then discard duplicated rows and every row that
# contains at least one missing value.
df = (
    pd.read_csv(r"C:\Users\Kanishk\Desktop\assignment\data science\employee_data.csv")
    .drop_duplicates()
    .dropna()
)

# Step 2 - EDA
# Descriptive statistics as produced by DataFrame.describe().
summary_stats = df.describe()
print(summary_stats)

# Pairplot
# Pairwise plots restricted to the int64 / float64 columns of the frame.
numeric_frame = df.select_dtypes(include=['int64', 'float64'])
sns.pairplot(numeric_frame)
plt.show()

# Heatmap
# Correlations are computed on the numeric columns only. The frame still holds
# non-numeric columns at this point (Department and Attrition appear to be
# text: Attrition is compared with 'Yes' later in this file), and
# DataFrame.corr() raises on such columns in pandas >= 2.0, where
# numeric_only defaults to False. Selecting the numeric columns explicitly
# works on every pandas version.
plt.figure(figsize=(10, 6))
numeric_corr = df.select_dtypes(include='number').corr()
sns.heatmap(numeric_corr, annot=True, cmap='coolwarm')
plt.show()

# Boxplots for outlier detection
# Draws and shows one separate boxplot figure per column listed in num_cols.
num_cols = ['Age', 'Salary', 'Years_at_Company', 'Performance_Score']
for column_name in num_cols:
    column_values = df[column_name]
    sns.boxplot(x=column_values)
    plt.title('Boxplot for ' + column_name)
    plt.show()

# Step 3 - Probability and Statistical Analysis
# P(Attrition | Department)
# Relative frequency of each Attrition value inside every department, pivoted
# so that departments are rows and Attrition values are columns; combinations
# that never occur are filled with 0.
attrition_share = df.groupby('Department')['Attrition'].value_counts(normalize=True)
attrition_prob = attrition_share.unstack().fillna(0)
print(attrition_prob)

# Bayes' Theorem: P(Attrition|Performance)
# Performance_Score is cut into three bins labelled Low / Medium / High and
# stored as a new Performance_Level column. Each row of the resulting table is
# the row-normalised distribution of Attrition within one performance level,
# i.e. an empirical conditional frequency table.
performance_bins = pd.cut(
    df['Performance_Score'],
    bins=3,
    labels=['Low', 'Medium', 'High'],
)
df['Performance_Level'] = performance_bins
bayes_table = pd.crosstab(
    index=df['Performance_Level'],
    columns=df['Attrition'],
    normalize='index',
)
print(bayes_table)

# Hypothesis Test: Is mean performance score same across departments?
# One-way ANOVA (scipy.stats.f_oneway) with one Performance_Score sample per
# distinct Department value.
departments = df['Department'].unique()
performance_by_dept = [
    df.loc[df['Department'] == dept, 'Performance_Score']
    for dept in departments
]
anova_result = stats.f_oneway(*performance_by_dept)
f_stat, p_value = anova_result
print("F-statistic:", f_stat, "P-value:", p_value)

# Phase 2 - Predictive Modeling
# Step 4 - Feature Engineering
# Every column gets its own fitted transformer. Re-fitting one shared
# StandardScaler / LabelEncoder on a second column overwrites what it learned
# from the first, so the Salary scaling parameters and the Attrition label
# mapping could no longer be inspected or inverted afterwards.
salary_scaler = StandardScaler()
performance_scaler = StandardScaler()
df['Salary_scaled'] = salary_scaler.fit_transform(df[['Salary']])
df['Performance_Score_scaled'] = performance_scaler.fit_transform(df[['Performance_Score']])

attrition_encoder = LabelEncoder()
department_encoder = LabelEncoder()
df['Attrition_label'] = attrition_encoder.fit_transform(df['Attrition'])
df['Department_label'] = department_encoder.fit_transform(df['Department'])

# Backward-compatible names: `scaler` and `le` keep the state they ended up
# with before this change (last fitted on Performance_Score and Department).
scaler = performance_scaler
le = department_encoder

# NOTE(review): the scalers are fitted on the whole frame before the
# train/test splits further down, so held-out rows influence the scaling
# statistics. Consider fitting them on the training rows only.

# Step 5 - Classification Model
# Random forest predicting the encoded Attrition label from age, tenure,
# scaled salary / performance and the encoded department.
features_cls = ['Age', 'Salary_scaled', 'Years_at_Company', 'Performance_Score_scaled', 'Department_label']
target_cls = 'Attrition_label'

X_cls = df[features_cls]
y_cls = df[target_cls]
X_train_cls, X_test_cls, y_train_cls, y_test_cls = train_test_split(
    X_cls, y_cls, test_size=0.2, random_state=42
)

# Seed the forest as well: the split is already seeded, but an unseeded
# forest still yields different metrics on every run.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_cls, y_train_cls)
y_pred_cls = clf.predict(X_test_cls)

print("Classification Report:")
print(classification_report(y_test_cls, y_pred_cls))

# confusion_matrix(y_true, y_pred) puts the true labels on the rows and the
# predicted labels on the columns; label the axes so the plot is readable.
sns.heatmap(confusion_matrix(y_test_cls, y_pred_cls), annot=True, fmt='d')
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

# Step 6 - Regression Model
# Ordinary least-squares fit of Performance_Score on age, scaled salary,
# tenure and the encoded department, scored on a 20% hold-out split.
target_reg = 'Performance_Score'
features_reg = ['Age', 'Salary_scaled', 'Years_at_Company', 'Department_label']

X_reg, y_reg = df[features_reg], df[target_reg]
X_train_reg, X_test_reg, y_train_reg, y_test_reg = train_test_split(
    X_reg,
    y_reg,
    test_size=0.2,
    random_state=42,
)

reg = LinearRegression().fit(X_train_reg, y_train_reg)
y_pred_reg = reg.predict(X_test_reg)

# Hold-out metrics.
r2_value = r2_score(y_test_reg, y_pred_reg)
mse_value = mean_squared_error(y_test_reg, y_pred_reg)
print("R-squared:", r2_value)
print("MSE:", mse_value)

# Predicted against actual scores for the hold-out rows.
plt.scatter(y_test_reg, y_pred_reg)
plt.title("Actual vs Predicted Performance Score")
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.show()

# Phase 3 - Deep Learning
# Step 7 - Neural Network for Performance Prediction
# Small fully connected network (16 -> 8 -> 1) regressing Performance_Score
# on the same features as the linear model.
X_dl_reg = df[features_reg].values
y_dl_reg = df['Performance_Score'].values

# Hold out 20% of the rows. Previously the network was fitted on every row
# and never evaluated, so this step produced no measurable result. The split
# parameters match the other splits in this file.
X_dl_reg_train, X_dl_reg_test, y_dl_reg_train, y_dl_reg_test = train_test_split(
    X_dl_reg, y_dl_reg, test_size=0.2, random_state=42
)

model_reg = Sequential()
model_reg.add(Dense(16, activation='relu', input_shape=(X_dl_reg.shape[1],)))
model_reg.add(Dense(8, activation='relu'))
model_reg.add(Dense(1))
model_reg.compile(optimizer='adam', loss='mse')
model_reg.fit(X_dl_reg_train, y_dl_reg_train, epochs=100, batch_size=16, verbose=0)

# The model is compiled with loss='mse' and no extra metrics, so evaluate()
# returns the hold-out MSE as a single scalar.
dl_reg_mse = model_reg.evaluate(X_dl_reg_test, y_dl_reg_test, verbose=0)
print("DL Regression Test MSE:", dl_reg_mse)

# Step 8 - Neural Network for Attrition Classification
# Small fully connected network (16 -> 8 -> 1 sigmoid) predicting the encoded
# Attrition label from the same features as the random forest.
# NOTE(review): sigmoid + binary_crossentropy presumes Attrition_label takes
# exactly two values (0/1) - confirm against the data.
X_dl_cls = df[features_cls].values
y_dl_cls = df['Attrition_label'].values

# Hold out 20% of the rows with the same split parameters as the random
# forest step, so both classifiers are scored on unseen data.
X_dl_cls_train, X_dl_cls_test, y_dl_cls_train, y_dl_cls_test = train_test_split(
    X_dl_cls, y_dl_cls, test_size=0.2, random_state=42
)

model_cls = Sequential()
model_cls.add(Dense(16, activation='relu', input_shape=(X_dl_cls.shape[1],)))
model_cls.add(Dense(8, activation='relu'))
model_cls.add(Dense(1, activation='sigmoid'))
model_cls.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model_cls.fit(X_dl_cls_train, y_dl_cls_train, epochs=100, batch_size=16, verbose=0)

# Evaluate deep learning classification model
# Scored on the held-out rows only; evaluating on the training rows (as
# before) reports an optimistic accuracy.
loss, accuracy = model_cls.evaluate(X_dl_cls_test, y_dl_cls_test, verbose=0)
print("DL Classification Accuracy:", accuracy)

# Phase 4 - Reporting and Insights
# Step 9 - Summary
# Rows whose Attrition value is 'Yes' are treated as employees who left.
left_company = df['Attrition'] == 'Yes'

print("Key Insights:")
print("- Departments with high attrition:")
print(df.loc[left_company, 'Department'].value_counts())

# Raw counts mostly reflect department size, so also report the share of
# leavers within each department.
print("- Attrition rate by department:")
print(left_company.groupby(df['Department']).mean().sort_values(ascending=False))

# Derive the performance statement from the data instead of printing it
# unconditionally: compare the mean score of leavers with that of stayers.
mean_perf_left = df.loc[left_company, 'Performance_Score'].mean()
mean_perf_stayed = df.loc[~left_company, 'Performance_Score'].mean()
print("- Mean performance score (left / stayed):", mean_perf_left, "/", mean_perf_stayed)
if mean_perf_left < mean_perf_stayed:
    print("- High performance tends to correlate with lower attrition.")
elif mean_perf_left > mean_perf_stayed:
    print("- High performance tends to correlate with higher attrition.")
else:
    # Also reached when one of the groups is empty (its mean is NaN).
    print("- No difference in mean performance between leavers and stayers could be established.")

# Step 10 - Visualizations
# Attrition by Department
# Row counts per department, split by Attrition value.
sns.countplot(data=df, x='Department', hue='Attrition')
plt.xticks(rotation=45)
plt.title("Attrition by Department")
plt.show()

# Salary vs Performance
# One point per employee, coloured by Attrition value.
sns.scatterplot(data=df, x='Salary', y='Performance_Score', hue='Attrition')
plt.title("Salary vs Performance Score")
plt.show()

# Line plot: Average performance by years
# Mean Performance_Score for every distinct Years_at_Company value.
performance_by_tenure = df.groupby('Years_at_Company')['Performance_Score']
avg_perf = performance_by_tenure.mean()
plt.plot(avg_perf)
plt.xlabel("Years at Company")
plt.ylabel("Avg Performance Score")
plt.title("Average Performance by Years at Company")
plt.show()


Collecting tensorflow
  Downloading tensorflow-2.19.0-cp310-cp310-win_amd64.whl.metadata (4.1 kB)
Downloading tensorflow-2.19.0-cp310-cp310-win_amd64.whl (375.7 MB)
   ---------------------------------------- 0.0/375.7 MB ? eta -:--:--
   ---------------------------------------- 1.0/375.7 MB 6.3 MB/s eta 0:01:00
   ---------------------------------------- 1.8/375.7 MB 4.8 MB/s eta 0:01:18
   ---------------------------------------- 2.6/375.7 MB 4.4 MB/s eta 0:01:24
   ---------------------------------------- 3.7/375.7 MB 4.3 MB/s eta 0:01:28
   ---------------------------------------- 4.5/375.7 MB 4.3 MB/s eta 0:01:28
    --------------------------------------- 5.2/375.7 MB 4.2 MB/s eta 0:01:29
    --------------------------------------- 5.8/375.7 MB 4.1 MB/s eta 0:01:30
    --------------------------------------- 6.8/375.7 MB 4.1 MB/s eta 0:01:30
    --------------------------------------- 7.9/375.7 MB 4.1 MB/s eta 0:01:30
    --------------------------------------- 8.7/375.7 MB 4.1 M

ERROR: Could not install packages due to an OSError: [Errno 2] No such file or directory: 'C:\\Users\\Kanishk\\AppData\\Local\\Packages\\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\\LocalCache\\local-packages\\Python310\\site-packages\\tensorflow\\include\\external\\com_github_grpc_grpc\\src\\core\\ext\\filters\\client_channel\\lb_policy\\grpclb\\client_load_reporting_filter.h'



ModuleNotFoundError: No module named 'tensorflow.python'