In [None]:


import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from google.cloud import bigquery
from google.colab import auth

# Load the HR dataset. The original note credits Kaggle as the source; the
# copy actually read here is a raw CSV file hosted on GitHub.
url = "https://raw.githubusercontent.com/tranghth-lux/data-science-complete-tutorial/master/Data/HR_comma_sep.csv.txt"

# Read the CSV and normalise its column names in one step: 'sales' becomes
# 'department', the misspelt 'average_montly_hours' becomes
# 'avg_monthly_hours', 'Work_accident' is lower-cased, and
# 'time_spend_company' becomes 'tenure_years'.
df = pd.read_csv(url).rename(
    columns={
        'sales': 'department',
        'average_montly_hours': 'avg_monthly_hours',
        'Work_accident': 'work_accident',
        'time_spend_company': 'tenure_years',
    },
)

# The public dataset has no employee identifier, so create a temporary one:
# 'EMP' followed by the 1-based row number, zero-padded to five digits.
df['employee_id'] = [f"EMP{row_number:05d}" for row_number in range(1, len(df) + 1)]

# Simulate "history" vs. "future": a reproducible random 95% sample of the
# rows (random_state=42) serves as the training history, and the rows that
# were not drawn form the pilot set that is scored later.
train_df = df.sample(frac=0.95, random_state=42)
pilot_df = df.drop(index=train_df.index)

# The training target is the 'left' column; the training features are every
# remaining column except the employee_id identifier.
y_train = train_df['left']
X_train = train_df.drop(columns=['left', 'employee_id'])

# Feature groups handed to the preprocessing step.
numerical_features = [
    'satisfaction_level',
    'last_evaluation',
    'number_project',
    'avg_monthly_hours',
    'tenure_years',
    'work_accident',
    'promotion_last_5years',
]
categorical_features = ['department', 'salary']

# Preprocessing: numeric columns are passed through unchanged, categorical
# columns are one-hot encoded. handle_unknown='ignore' keeps the encoder from
# raising on categories it did not see while fitting.
preprocessor = ColumnTransformer([
    ('num', 'passthrough', numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

# Full model: preprocessing followed by a 100-tree random forest with a
# fixed seed so repeated runs produce the same model.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit the whole pipeline on the training split.
print("Training Risk Model...")
model.fit(X_train, y_train)
print("Model Training Complete.")

# Score the pilot set. Column 1 of predict_proba is used as the churn risk
# score (presumably the probability of left == 1; verify against
# model.classes_ if the label encoding ever changes).
pilot_features = pilot_df.drop(columns=['left', 'employee_id'])
probs = model.predict_proba(pilot_features)[:, 1]

# Attach the score and a three-level label to a copy of the pilot rows.
# Conditions are tried in order and the first match wins:
# > 0.7 is High Risk, > 0.3 is Medium Risk, anything else is Low Risk.
dashboard_data = pilot_df.copy()
dashboard_data['churn_risk_score'] = probs
dashboard_data['risk_label'] = np.select(
    [probs > 0.7, probs > 0.3],
    ['High Risk', 'Medium Risk'],
    default='Low Risk',
)

# Keep only the columns that are published for the dashboard.
final_table = dashboard_data[
    ['employee_id', 'department', 'salary', 'churn_risk_score', 'risk_label']
]

# Calculating drivers: pair every model input feature with the importance
# the fitted random forest assigned to it.
rf = model.named_steps['classifier']

# Names of the one-hot columns produced by the fitted categorical encoder.
cat_names = (
    model.named_steps['preprocessor']
    .named_transformers_['cat']
    .get_feature_names_out(categorical_features)
)

# Numeric names first, then the one-hot names. This relies on the
# preprocessor emitting its 'num' columns before its 'cat' columns, so the
# order must stay in sync with the transformer list.
all_feature_names = [*numerical_features, *cat_names]

# One row per feature, most important first.
importance_df = pd.DataFrame(
    {'feature': all_feature_names, 'importance': rf.feature_importances_}
).sort_values('importance', ascending=False)

# CONFIGURATION: BigQuery project and dataset that the result tables are
# written under.
project_id = 'kaggle-project-484500'
dataset_id = 'hr_analytics'

# Authenticate through google.colab's auth helper before any BigQuery call.
print("Connecting to BigQuery...")
auth.authenticate_user()

def upload_bq(df, table_name):
    """Load a DataFrame into a BigQuery table and report it on stdout.

    The destination is ``<project_id>.<dataset_id>.<table_name>``, built from
    the module-level ``project_id`` and ``dataset_id`` settings. The load job
    uses the WRITE_TRUNCATE disposition, i.e. it overwrites the table's
    existing data rather than appending to it.

    Args:
        df: DataFrame whose rows are uploaded.
        table_name: Name of the destination table inside the dataset.

    Any exception raised by the BigQuery client or by the load job is
    propagated to the caller.
    """
    bq_client = bigquery.Client(project=project_id)
    destination = f"{project_id}.{dataset_id}.{table_name}"
    load_job = bq_client.load_table_from_dataframe(
        df,
        destination,
        job_config=bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE"),
    )
    # Wait for the load job to finish before announcing the upload.
    load_job.result()
    print(f"Uploaded {table_name} to BigQuery.")

# Publish both result tables. This is the top level of the notebook, so any
# failure is reported on stdout instead of being raised.
# NOTE: the uploads run in sequence; if the second one fails, the first
# table has already been replaced.
try:
    upload_bq(final_table, 'employee_risk_predictions')
    upload_bq(importance_df, 'model_drivers')
    print("SUCCESS: Data is live in BigQuery. Ready for Looker Studio.")
except Exception as upload_error:
    print(f"Error uploading to BigQuery: {upload_error}")

Training Risk Model...
Model Training Complete.
Connecting to BigQuery...
Uploaded employee_risk_predictions to BigQuery.
Uploaded model_drivers to BigQuery.
SUCCESS: Data is live in BigQuery. Ready for Looker Studio.
