In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
# Fetch the "Absenteeism at work" dataset; the return value is kept in a
# variable (presumably the local directory of the downloaded files -- confirm
# against the kagglehub documentation).
nitindig_absenteeism_at_work_data_set_path = kagglehub.dataset_download('nitindig/absenteeism-at-work-data-set')

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Collect the full path of every file below the Kaggle input directory,
# then print them one per line (nothing is printed if the directory is absent).
input_paths = [
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

### Importing Libraries for Data Visualization

We will be using the following libraries:
- `matplotlib`: A library for creating static, animated, and interactive visualizations.
- `seaborn`: A statistical data visualization library built on top of `matplotlib`.
- `pandas`: A powerful data manipulation and analysis library.

We also set the notebook to display inline plots and apply a `whitegrid` style to `seaborn` for better visual aesthetics.

```python
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
%matplotlib inline
sns.set_style('whitegrid')


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Render matplotlib figures directly in the notebook output.
%matplotlib inline
# Apply seaborn's 'whitegrid' style to the plots that follow.
sns.set_style('whitegrid')

### Loading the Dataset

We are loading the "Absenteeism at Work" dataset using the `pandas` library. The dataset is in CSV format and is being loaded with the following parameters:
- `encoding='utf-8'`: Specifies the character encoding used in the file.
- `sep=';'`: Defines the separator used in the CSV file (in this case, a semicolon).

```python
data = pd.read_csv('/kaggle/input/absenteeism-at-work-data-set/Absenteeism_at_work.csv', encoding='utf-8', sep=';')


In [None]:
from pathlib import Path

# Prefer the location returned by kagglehub.dataset_download() in the first
# cell, so the notebook also runs outside Kaggle; fall back to the Kaggle
# input mount when that cell was skipped or deleted.
dataset_dir = Path(
    globals().get(
        'nitindig_absenteeism_at_work_data_set_path',
        '/kaggle/input/absenteeism-at-work-data-set',
    )
)

# The CSV is semicolon-delimited, hence the explicit separator.
data = pd.read_csv(dataset_dir / 'Absenteeism_at_work.csv', encoding='utf-8', sep=';')

### Extracting Numeric Columns

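We use the `data._get_numeric_data()` method to select the numeric columns of the dataset. It returns a DataFrame that contains only the columns with numeric data types. Note that the leading underscore marks it as a private pandas method; the public equivalent is `data.select_dtypes(include='number')`.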

The `.columns` attribute is then used to display the names of these numeric columns.

```python
data._get_numeric_data().columns


In [None]:
# Names of the numeric columns. select_dtypes() is the public pandas API for
# what the private helper `_get_numeric_data()` does on all-numeric data.
data.select_dtypes(include='number').columns

### Sorting the Data by 'ID' Column

We use the `sort_values()` function to sort the data by the `'ID'` column in ascending order. This helps us arrange the data in a way that is easier to analyze or visualize.

```python
sorted = data.sort_values(by='ID', ascending=True)


In [None]:
# Copy of the data ordered by employee ID (ascending is the default order).
# NOTE(review): the name `sorted` shadows Python's built-in sorted() for the
# rest of the session; it is kept because a later commented-out line uses it.
sorted = data.sort_values('ID')

### Assigning the DataFrame to a New Variable

In this step, we assign the original `data` DataFrame (or optionally the `sorted` DataFrame if sorting was done) to the variable `df`. This allows us to continue working with the DataFrame under the new variable name.

```python
df = data


In [None]:
# Alternative: continue with the ID-sorted frame instead.
# df = sorted
# `df` is bound to the same object as `data` here (no copy is made).
df = data

### Generating a Profiling Report

In this step, we use the `ydata_profiling` library to generate a profiling report of the `df` DataFrame. The `ProfileReport` function provides an extensive overview of the dataset, including summaries of each feature, correlations, missing values, and distributions.

```python
from ydata_profiling import ProfileReport

# Create the profiling report
profile = ProfileReport(df, title="Profiling Report")

# Display the report
profile


In [None]:
from ydata_profiling import ProfileReport
# Build a profiling report for `df` with the given title.
profile = ProfileReport(df, title="Profiling Report")
# Leaving the report as the last expression makes the notebook display it.
profile

### Dropping Columns and Analyzing Correlation

In this step, we remove the `ID` and `Seasons` columns from the dataset, as they may not be relevant for the analysis. Afterward, we compute the correlation between the remaining features and the target variable, `Absenteeism time in hours`.

```python
# Drop unnecessary columns
df = df.drop(columns=['ID', 'Seasons'])

# Calculate and print the correlation with the target variable
print(df.corr()['Absenteeism time in hours'])


In [None]:
# Remove the identifier and season columns before computing correlations.
# errors='ignore' keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop would raise KeyError.
df = df.drop(columns=['ID', 'Seasons'], errors='ignore')

# Correlation of every remaining column with the target column.
print(df.corr()['Absenteeism time in hours'])


### Defining Features and Target Variables

In this step, we define the target variable `y` and the feature matrix `X`. The target variable is the `Absenteeism time in hours` column, while the feature matrix `X` consists of all other columns in the dataset.

```python
# Define the target variable 'y' and feature matrix 'X'
y = df['Absenteeism time in hours']
X = df.drop('Absenteeism time in hours', axis=1)

# Display target variable 'y'
y


In [None]:
# Split the frame into the feature matrix (everything except the target
# column) and the target series.
target_col = 'Absenteeism time in hours'
X = df.drop(columns=[target_col])
y = df[target_col]

# Show the target values.
y

### Importing the StandardScaler

The `StandardScaler` from `sklearn.preprocessing` is used to standardize features by removing the mean and scaling to unit variance. It ensures that each feature in the dataset has a mean of 0 and a standard deviation of 1. This matters for algorithms that are sensitive to feature scale, such as distance-based methods (e.g. k-nearest neighbors) and regularized or gradient-based linear models; plain least-squares linear regression and tree-based models give the same predictions with or without it.

```python
# Importing StandardScaler
from sklearn.preprocessing import StandardScaler


In [None]:
from sklearn.preprocessing import StandardScaler

### Initializing the StandardScaler

We initialize the `StandardScaler` to standardize the features in the dataset. The `fit()` method will compute the mean and standard deviation for each feature, while `transform()` will standardize the data accordingly.

```python
# Initializing StandardScaler
scaler = StandardScaler()


In [None]:
# Create the scaler with default settings; it is not fitted until a later cell
# calls fit_transform() on it.
scaler=StandardScaler()

### Scaling the Numeric Features

After initializing the `StandardScaler`, we apply it to the numeric features of the dataset. We select the numeric features using `df._get_numeric_data().columns`, and then use `scaler.fit_transform()` to standardize them.

```python
# Scaling the numeric features
num_feat = [feat for feat in df._get_numeric_data().columns]
df[num_feat] = scaler.fit_transform(df[num_feat])


In [None]:

# Standardise the feature matrix that is actually handed to train_test_split.
# Scaling `df` here would not reach the models: X and y were extracted from it
# in an earlier cell, and `df` still contains the target column.
num_feat = list(X.select_dtypes(include='number').columns)
X = X.copy()  # work on our own copy so no other reference to the frame changes
X[num_feat] = scaler.fit_transform(X[num_feat])
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set statistics influence the transform. Fitting on X_train only
# (for example inside a Pipeline) would avoid that leakage.

### Splitting the Data into Training and Testing Sets

We use the `train_test_split` function from `sklearn.model_selection` to split the data into training and testing sets. 20% of the data is used for testing, and the rest is used for training. A random state of `42` ensures that the split is reproducible.

```python
from sklearn.model_selection import train_test_split

# Splitting the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


### Feature Selection with Recursive Feature Elimination (RFE)

In this step, we use Recursive Feature Elimination (RFE) to select the top features that contribute the most to the prediction. RFE works by recursively removing features and building a model on the remaining attributes. The process continues until the specified number of features is selected.

We use a `LinearRegression` model as the estimator for RFE. Here, we select the top 10 features based on their contribution to predicting the target variable, `Absenteeism time in hours`.

```python
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Initializing the model
model = LinearRegression()

# Applying RFE for feature selection
selector = RFE(model, n_features_to_select=10)
selector.fit(X, y)

# Getting the selected features
selected_features = X.columns[selector.support_]

# Selecting the features for the final model
X_selected = X[selected_features]

# Fitting the model with the selected features
model.fit(X_train, y_train)

# Printing the intercept of the model
print(model.intercept_)


In [None]:
# NOTE: this RFE experiment is disabled -- every line below is commented out,
# so none of these names (including `model`) are defined by this cell.
# NOTE(review): if re-enabled, `X_selected` is computed but never used (the fit
# still runs on the full `X_train`), and RFE is fitted on all of `X` rather
# than on the training rows only.
# from sklearn.feature_selection import RFE
# from sklearn.linear_model import LinearRegression

# model = LinearRegression()
# selector = RFE(model, n_features_to_select=10)
# selector.fit(X, y)
# selected_features = X.columns[selector.support_]

# X_selected=X[selected_features]
# model.fit(X_train, y_train)
# print(model.intercept_)

### Polynomial Regression

Polynomial regression is a type of regression model that models the relationship between the independent variables (features) and the dependent variable (target) as an nth degree polynomial. This approach can help capture the non-linear relationships between the features and target variable.

In this step, we use the `PolynomialFeatures` class from `sklearn` to generate polynomial features of degree 2. These new features will be used to fit a linear regression model, allowing the model to better capture non-linear patterns in the data.

```python
from sklearn.preprocessing import PolynomialFeatures

# Creating polynomial features of degree 2
poly = PolynomialFeatures(degree=2)

# Transforming the features into polynomial features
X_poly = poly.fit_transform(X_train)

# Fitting the linear regression model with polynomial features
model.fit(X_poly, y_train)

# Making predictions on the test set using the transformed features
y_pred = model.predict(poly.transform(X_test))


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

# Expand the training features into their degree-2 polynomial terms.
poly = PolynomialFeatures(degree=2)
X_poly = poly.fit_transform(X_train)

# `model` is created here because its only other definition sits in the
# commented-out RFE cell; without this line a fresh kernel raises NameError.
model = LinearRegression()
model.fit(X_poly, y_train)

# Reuse the expansion fitted on the training data for the test rows.
y_pred = model.predict(poly.transform(X_test))


### Decision Tree Regression

A Decision Tree regressor is a non-linear model that splits the data into different segments based on feature values. It can be used to capture complex relationships between the features and the target variable.

In this step, we use the `DecisionTreeRegressor` class from `sklearn` to create a decision tree model. By setting the `max_depth` parameter, we control the depth of the tree, which helps prevent overfitting.

```python
from sklearn.tree import DecisionTreeRegressor

# Initializing the Decision Tree Regressor with max_depth=5 to prevent overfitting
tree = DecisionTreeRegressor(max_depth=5)

# Training the model on the training data
tree.fit(X_train, y_train)

# Making predictions on the test set
y_pred = tree.predict(X_test)


In [None]:
from sklearn.tree import DecisionTreeRegressor

# Depth is capped at 5 to limit overfitting. random_state is fixed, as for the
# split and the random forest, so repeated runs build the same tree.
tree = DecisionTreeRegressor(max_depth=5, random_state=42)
tree.fit(X_train, y_train)

# Predictions for the held-out rows (this overwrites any earlier `y_pred`).
y_pred = tree.predict(X_test)


### Hyperparameter Tuning with Random Forest Regressor

In machine learning, hyperparameter tuning is the process of selecting the best hyperparameters for a given model. In this case, we are tuning the hyperparameters of a **Random Forest Regressor** using **GridSearchCV**.

1. **RandomForestRegressor**: This is an ensemble learning method that creates multiple decision trees and combines their predictions for a more accurate model.

2. **GridSearchCV**: This technique performs an exhaustive search over a specified parameter grid to find the best combination of hyperparameters. It evaluates each combination using cross-validation and chooses the one that minimizes the error (based on the scoring metric).

### Code Explanation:

```python
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Initialize the Random Forest Regressor model
rf = RandomForestRegressor(random_state=42)

# Define a hyperparameter grid to search over
param_grid = {
    'n_estimators': [100, 200, 300],  # Number of trees in the forest
    'max_depth': [10, 20, None],  # Maximum depth of the trees
    'min_samples_split': [2, 5, 10],  # Minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],  # Minimum samples required to be at a leaf node
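    'max_features': ['sqrt', 'log2', None]  # Number of features considered at each split
}

# Perform grid search with 5-fold cross-validation
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Best parameters and model
print("Best Parameters:", grid_search.best_params_)
best_rf = grid_search.best_estimator_
```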


In [None]:
# NOTE: this grid search is disabled -- every line below is commented out.
# The grid has 3 values for each of 5 parameters (243 combinations); with
# cv=5 that is 1215 model fits, so expect a long run if it is re-enabled.
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.model_selection import GridSearchCV

# # Initialize the model
# rf = RandomForestRegressor(random_state=42)

# # Define the hyperparameter grid
# param_grid = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [10, 20, None],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4],
#     'max_features': ['sqrt', 'log2', None]
# }

# # Perform grid search with cross-validation
# grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train)

# # Best parameters and model
# print("Best Parameters:", grid_search.best_params_)
# best_rf = grid_search.best_estimator_


### Random Forest Regressor Model

In this section, we are using the **Random Forest Regressor** to make predictions on the dataset. The Random Forest is an ensemble learning method that uses multiple decision trees to improve the model's predictive accuracy.

### Code Explanation:

```python
from sklearn.ensemble import RandomForestRegressor

# Initialize the Random Forest Regressor with specific hyperparameters
forest = RandomForestRegressor(
    n_estimators=300,          # Number of trees in the forest (300 trees)
    max_depth=10,              # Maximum depth of each tree (limits tree growth to prevent overfitting)
    max_features='sqrt',       # The number of features to consider for each split (square root of the total features)
    min_samples_split=10,      # The minimum number of samples required to split an internal node
    min_samples_leaf=1,        # The minimum number of samples required to be at a leaf node
    random_state=42            # Ensures reproducibility by setting the seed for random number generation
)

# Fit the model to the training data
forest.fit(X_train, y_train)

# Use the trained model to predict on the test set
y_pred = forest.predict(X_test)


In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameters for the forest, gathered in one place so they are easy to adjust.
forest_params = dict(
    n_estimators=300,
    max_depth=10,
    max_features='sqrt',
    min_samples_split=10,
    min_samples_leaf=1,
    random_state=42,
)

# Train on the training split, then predict the held-out rows.
forest = RandomForestRegressor(**forest_params)
forest.fit(X_train, y_train)
y_pred = forest.predict(X_test)


In [None]:
# Display the most recent predictions (here: those of the random forest).
y_pred

### Model Evaluation

We evaluated the **Random Forest Regressor** model's performance using two key metrics:

1. **Mean Squared Error (MSE)**: Measures the average squared difference between the predicted values and actual values. A lower MSE indicates a better fit.

2. **R-squared (R²)**: Represents the proportion of variance in the target variable that is predictable from the features. A higher R² value indicates a better model fit.

The following code was used for evaluation:

```python
from sklearn.metrics import mean_squared_error, r2_score

# Calculate Mean Squared Error
mse = mean_squared_error(y_test, y_pred)
# Calculate R-squared
r2 = r2_score(y_test, y_pred)

mse  # Output MSE
# r2  # Output R-squared


In [None]:
# The metric functions are imported here because no other code cell imports
# them; without this line the cell raises NameError on a fresh kernel.
from sklearn.metrics import mean_squared_error, r2_score

# Compare the held-out targets with the latest predictions.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Only the MSE is displayed; swap the two lines below to show R-squared instead.
mse
# r2

In [None]:
# Plain-text dump of the predicted values.
print(y_pred)

In [None]:
# Plain-text dump of the held-out target values, for comparison with the predictions.
print(y_test)

# Save the Random Forest Model

We will save the trained Random Forest model to a file using the `joblib` library. This allows us to reload and use the model later without retraining it.

```python
import joblib

# Save the Random Forest model to a file
joblib.dump(forest, 'random_forest_model.pkl')


In [None]:
import joblib
# Write the fitted forest to 'random_forest_model.pkl' in the current working directory.
joblib.dump(forest, 'random_forest_model.pkl')

# Generate `requirements.txt`

To create a `requirements.txt` file containing all the installed packages in the current environment, use the following command:

```python
!pip freeze > requirements.txt


In [None]:
!pip freeze > requirements.txt