<a href="https://colab.research.google.com/github/itsmeprabha75/Wind-Turbine-EDA_Project/blob/main/Wind_Turbine_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Wind Turbine Failure Prediction - Exploratory Data Analysis
==========================================================
Author: B.AravindTeja
Date: 21.9.2025
Objective: Analyze wind turbine operational data to identify patterns
          and factors contributing to turbine failures

Dataset: Wind turbine operational data with features such as wind speed, power, temperature, etc.


# Install & import all required libraries

# Data manipulation and analysis using DataFrames

In [None]:
import pandas as pd

# Static plotting library for basic visualizations

In [None]:
import matplotlib.pyplot as plt

 # Statistical data visualization with enhanced aesthetics

In [None]:
import seaborn as sns

# Interactive visualizations for exploratory data analysis

In [None]:
import plotly.express as px

 # Numerical computing, handling arrays, and mathematical operations

In [None]:
import numpy as np

# High-level API for interactive web-based plotting

In [None]:
import bokeh.plotting as bp

# Custom models and widgets for enhancing Bokeh plots

In [None]:
import bokeh.models as bm

# Load Dataset

In [None]:
df = pd.read_csv('/content/Wind_Turbine_2025.csv')

# Display information about dataset

In [None]:
df.info()

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
df.describe()

In [None]:
df.dtypes

In [None]:
# Convert the 'date' column to datetime64[ns] so the .dt accessor and
# time-based plots used in later cells work on it.
df['date'] = df['date'].astype('datetime64[ns]')

In [None]:
df.dtypes

In [None]:
df['Year'] = df['date'].dt.year

In [None]:
numerical_columns = df.select_dtypes(include = [np.number]).columns.tolist()

In [None]:
numerical_columns

In [None]:
# Exclude the derived 'Year' column from the numeric feature list.
# Rebuilding the list (instead of list.remove) keeps this cell re-runnable:
# a second .remove('Year') raises ValueError once 'Year' is already gone.
numerical_columns = [col for col in numerical_columns if col != 'Year']

In [None]:
numerical_columns

In [None]:
categorical_columns = df.select_dtypes(exclude=np.number).columns.tolist()

In [None]:
categorical_columns

In [None]:
# 'date' is non-numeric, so select_dtypes(exclude=np.number) picked it up; it is
# a timestamp, not a category. Filtering (instead of list.remove) keeps this
# cell re-runnable: a second .remove('date') would raise ValueError.
categorical_columns = [col for col in categorical_columns if col != 'date']

In [None]:
categorical_columns

# Missing Values Analysis

In [None]:
missing_values = df.isnull().sum()

In [None]:
missing_values

In [None]:
# Missing-value map: one row per record, one column per feature;
# cells where df.isnull() is True are drawn in the contrasting colour.
missing_fig, missing_ax = plt.subplots(figsize = (12, 6))
sns.heatmap(df.isnull(), cbar = False, cmap = 'viridis', xticklabels = True, ax = missing_ax)
# Tilt the column names so long feature names do not overlap
plt.setp(missing_ax.get_xticklabels(), rotation = 45, ha = 'right')
missing_ax.set_title("Missing Values Heatmap")
plt.show()

# Business Moments - Univariate

In [None]:
# Print location, spread and shape statistics for every numeric column.
for col in numerical_columns:
    column_modes = df[col].mode()
    # mode() returns an empty Series for an all-NaN column; indexing [0] would fail there
    first_mode = column_modes.iloc[0] if not column_modes.empty else np.nan
    # "\n" (not "\\n") so a blank line separates the columns instead of printing a literal backslash-n
    print(f"\nStatistics for {col}:")
    print(f"Mean : {df[col].mean()}")
    print(f"Median : {df[col].median()}")
    print(f"Mode : {first_mode}")
    print(f"Variance : {df[col].var()}")
    print(f"Standard Deviation : {df[col].std()}")
    print(f"Range : {df[col].max() - df[col].min()}")
    print(f"Skewness : {df[col].skew()}")
    print(f"Kurtosis : {df[col].kurt()}")

# Univariate Analysis & Outlier Detection

# Histograms in One Window (Two Panes)
# Calculate the number of rows needed for subplots

In [None]:
n_rows = (len(numerical_columns) + 1) // 2

In [None]:
n_rows

In [None]:
# Draw one histogram (with KDE overlay) per numeric column on a two-column grid.
fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()  # 1-D array so panels can be addressed by column position
for i, col in enumerate(numerical_columns):
    sns.histplot(data = df, x = col, kde = True, ax = axes[i])
    axes[i].set_xlabel(col, fontsize = 10)
    axes[i].tick_params(axis = 'x', rotation = 45)
# The grid always has an even number of panels; hide any panel left without
# data (happens when the number of numeric columns is odd).
for spare_ax in axes[len(numerical_columns):]:
    spare_ax.set_visible(False)
# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()


# Boxplots in Another Window

In [None]:
# Draw one boxplot per numeric column on a two-column grid.
n_rows = (len(numerical_columns) + 1) // 2  # ceil(len / 2) rows for two columns
fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()  # 1-D array so panels can be addressed by column position
for i, col in enumerate(numerical_columns):
    sns.boxplot(x = df[col], ax = axes[i])
    axes[i].set_xlabel(col, fontsize = 10)
    axes[i].tick_params(axis = 'x', rotation = 45)  # rotate tick labels for readability
# The grid always has an even number of panels; hide any panel left without
# data (happens when the number of numeric columns is odd).
for spare_ax in axes[len(numerical_columns):]:
    spare_ax.set_visible(False)
# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

In [None]:
# Report outliers per numeric column using the 1.5 * IQR rule:
# a value is an outlier when it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
# (No figure is created here, so the former plt.show() call was a no-op and is dropped.)
for col in numerical_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_fence = Q1 - 1.5 * IQR
    upper_fence = Q3 + 1.5 * IQR
    outliers = df[(df[col] < lower_fence) | (df[col] > upper_fence)]
    print(f"Outliers detected in {col}:\n", outliers[col])

In [None]:
# One bar chart of level frequencies per categorical column, each in its own figure.
for col in categorical_columns:
    count_fig, count_ax = plt.subplots(figsize = (6, 4))
    sns.countplot(data = df, x = col, ax = count_ax)
    count_ax.set_title(f"Count of {col}")
    plt.show()

# Plotly Pie Chart

In [None]:
# Donut chart of the share of each Failure_status value.
# hole = 0.3 cuts out the centre of the pie (30% of the radius), turning it into a donut.
fig = px.pie(df, names = 'Failure_status', title = 'Pie Chart of Failure Status', hole = 0.3)
# Render with Plotly's "colab" renderer
fig.show(renderer = "colab")

# Class imbalance bar plot using Bokeh

In [None]:
# Bar chart of how many rows fall into each Failure_status class.
# Labels and heights are both taken from ONE value_counts() result so they are
# guaranteed to line up. Previously labels came from unique() (order of
# appearance, NaN included) while heights came from value_counts() (sorted by
# frequency, NaN dropped), which could attach a count to the wrong class.
status_counts = df['Failure_status'].value_counts()
status_labels = status_counts.index.astype(str).tolist()  # categorical x_range needs string factors

bp.output_notebook()  # send Bokeh output to the notebook cell instead of a standalone HTML file
p = bp.figure(x_range = status_labels, title = "Class Imbalance in Failure Status",
              toolbar_location = "below", tools = "zoom_in, zoom_out")  # tools: which toolbar buttons to include
p.vbar(x = status_labels, top = status_counts.values, width = 0.5)  # vbar draws vertical bars
bp.show(p)

In [None]:
p

# Bivariate Analysis (Both numeric, One numeric and one categorical)

In [None]:
# Static scatter of power output against wind speed, drawn on an explicit Axes.
scatter_fig, scatter_ax = plt.subplots(figsize = (8, 6))
scatter_ax.scatter(df['Wind_speed'], df['Power'], alpha = 0.5)  # semi-transparent points reveal dense regions
scatter_ax.set_xlabel('Wind Speed')
scatter_ax.set_ylabel('Power')
scatter_ax.set_title('Scatter Plot of Wind Speed vs Power')
scatter_ax.grid(True)  # grid lines make values easier to read off

In [None]:
# Interactive Plotly scatter of Wind_speed vs Power, with points coloured by Failure_status.
# NOTE(review): size_max only limits marker size when a `size` column is also passed;
# none is given here, so it presumably has no visible effect - confirm or drop it.
scatter = px.scatter(df,
                     x = 'Wind_speed',
                     y = 'Power',
                     color = 'Failure_status',
                     size_max = 60,
                     title = 'Scatter Plot of Wind Speed vs Power')
# Render with Plotly's "colab" renderer
scatter.show(renderer = "colab")

In [None]:
# Compare the Generator_speed distribution across Failure_status classes.
speed_ax = sns.boxplot(data = df, x = "Failure_status", y = "Generator_speed")
speed_ax.set_title("Failure Status vs Generator Speed")
plt.show()

In [None]:
# Compare the shape of the Power distribution across Failure_status classes.
power_ax = sns.violinplot(data = df, x = "Failure_status", y = "Power")
power_ax.set_title("Failure Status vs Power Distribution")
plt.show()


# Multivariate Analysis

In [None]:
# Pairplot: scatter of every pair of numeric columns, KDE curves on the diagonal.
sns.pairplot(df[numerical_columns], diag_kind = "kde")
# y > 1 lifts the figure title above the grid; at the default height it is
# drawn on top of the first row of panels.
plt.suptitle("Pairplot - Multivariate Analysis", y = 1.02)
plt.show()


In [None]:
# Pairwise correlation of the numeric columns, annotated with the coefficient in each cell.
corr_matrix = df[numerical_columns].corr()
corr_ax = sns.heatmap(corr_matrix, annot = True, cmap = 'coolwarm')
corr_ax.set_title("Correlation Heatmap")
plt.show()

# Time Series Analysis

In [None]:
fig = px.line(df, x = 'date', y = 'Power', title = 'Power Generation Over Time', color = 'Failure_status')
fig.show(renderer = "colab")

In [None]:
fig = px.line(df, x = 'date', y = 'Wind_speed', title = 'Wind Speed Over Time')
fig.show(renderer = "colab")

In [None]:
# Power over time, one line per Failure_status class.
trend_ax = sns.lineplot(data = df, x = 'date', y = 'Power', hue = 'Failure_status')
trend_ax.set_title("Power Generation Trends by Failure Status")
plt.setp(trend_ax.get_xticklabels(), rotation = 45)  # tilt date labels so they do not collide
plt.show()

# Line Plots for All Numerical Columns in One Window

In [None]:
# Time-series line chart of every numeric column, split by Failure_status,
# laid out on a two-column grid.
n_rows = (len(numerical_columns) + 1) // 2  # ceil(len / 2) rows for two columns

fig, axes = plt.subplots(n_rows, 2, figsize = (12, n_rows * 4))
axes = axes.flatten()  # 1-D array so panels can be addressed by column position

for i, col in enumerate(numerical_columns):
    sns.lineplot(data = df, x = 'date', y = col, hue = 'Failure_status', ax = axes[i])
    axes[i].set_xlabel("Date", fontsize = 10)
    axes[i].set_ylabel(col, fontsize = 10)
    axes[i].tick_params(axis = 'x', rotation = 45)
    axes[i].set_title(f"{col} Trends by Failure Status")

# The grid always has an even number of panels; hide any panel left without
# data (happens when the number of numeric columns is odd).
for spare_ax in axes[len(numerical_columns):]:
    spare_ax.set_visible(False)

# Adjust layout to prevent overlap
plt.tight_layout()
plt.show()

# Yearly Data Overlay for All Numerical Columns

In [None]:
# Overlay the years on a shared day-of-year axis so the same calendar period
# can be compared across years; one figure per numeric column.
for col in numerical_columns:
    year_fig, year_ax = plt.subplots(figsize = (10, 6))
    sns.lineplot(data = df, x = df['date'].dt.day_of_year, y = col,
                 hue = df['Year'], palette = 'tab10', ax = year_ax)
    year_ax.set_xlabel("Day of Year")
    year_ax.set_ylabel(col)
    year_ax.set_title(f"Yearly Trend of {col}")
    year_ax.legend(title = "Year")
    year_ax.grid(True)
    plt.show()

In [None]:
# Lag plot: each Power value against the previous one (default lag of 1);
# visible structure indicates the series is not random.
lag_ax = pd.plotting.lag_plot(df['Power'])
lag_ax.set_title("Lag Plot for Power")
plt.show()

In [None]:
# Mean of every numeric column within each Failure_status class.
grouped_df = df.groupby('Failure_status')[numerical_columns].agg('mean')
print(grouped_df)

# AutoEDA


In [None]:
# Launch a D-Tale session for interactive exploration of the raw dataset.
# If the install fails, the original author suggests pinning werkzeug (pip install werkzeug==2.0.3).
# NOTE(review): the install is unpinned and runs on every execution of this cell.
!pip install dtale
import dtale
import pandas as pd

# Re-read the CSV into `df`. This REPLACES the DataFrame prepared earlier in the
# notebook, so the datetime conversion of 'date' and the derived 'Year' column
# are gone from `df` after this cell.
df = pd.read_csv(r'/content/Wind_Turbine_2025.csv')

# Start the D-Tale server for this DataFrame on localhost:8000.
# NOTE(review): on Google Colab a localhost URL is presumably not reachable from
# the browser without D-Tale's Colab mode - confirm against the D-Tale docs.
d = dtale.show(df, host = 'localhost', port = 8000)

# Print the D-Tale instance returned by show()
print(d)

# Pandas Profiling

In [None]:
# Build an automated EDA report with ydata-profiling and save it as HTML.
# To upgrade an existing install instead: pip install --upgrade ydata-profiling
# NOTE(review): the install is unpinned and runs on every execution of this cell.
!pip install ydata_profiling
import pandas as pd
from ydata_profiling import ProfileReport
import os

# Re-read the CSV into `df`; this replaces whatever `df` held before, so the
# cells below this one operate on the raw file contents again.
df = pd.read_csv(r'/content/Wind_Turbine_2025.csv')

# Generate the report and write it to an HTML file in the current working directory
profile = ProfileReport(df, title = "Wind Turbine EDA Report", explorative = True)
output_file = "Wind_Turbine_Profile_Report.html"
profile.to_file(output_file)

# os.path.abspath turns the relative file name into an absolute path for the message
print(f"✅ Report saved at: {os.path.abspath(output_file)}")



In [None]:
duplicate = df.duplicated()

In [None]:
duplicate

In [None]:
# Number of duplicated rows (True counts as 1). Series.sum() is vectorised,
# unlike the builtin sum(), which iterates the Series element by element.
duplicate.sum()

# Handling missing values

In [None]:
# Replace missing values in every numeric column with that column's median.
for col in numerical_columns:
    median_val = df[col].median()
    # Assign the result back instead of df[col].fillna(..., inplace=True):
    # the in-place call on a column selection is chained assignment, which
    # pandas 2.x warns about and which leaves `df` unchanged under Copy-on-Write.
    df[col] = df[col].fillna(median_val)

# Save the imputed DataFrame once, after all columns have been filled
df.to_csv('wind_turbine_clearedNV.csv', index=False)
print("Cleaned dataset saved as wind_turbine_clearedNV.csv")


In [None]:
# Re-count missing values per column to verify the imputation, then persist the frame.
missing_values_after_imputation = df.isna().sum()
print("Missing values after imputation:", missing_values_after_imputation, sep = "\n")
df.to_csv('wind_turbine_cleaned.csv', index = False)
print("Cleaned dataset saved as wind_turbine_cleaned.csv")

In [None]:
# Configure outlier capping for the numeric columns with feature_engine's Winsorizer.
# NOTE(review): the install is unpinned and runs on every execution of this cell.
!pip install feature_engine
from feature_engine.outliers import Winsorizer
# capping_method='iqr' with fold=1.5 and tail='both': cap both tails using IQR-based limits
# (presumably Q1 - 1.5*IQR and Q3 + 1.5*IQR - confirm against the feature_engine docs).
# The transformer is only constructed here; it is fitted in the next cell.
winsor_iqr = Winsorizer(capping_method = 'iqr',
                        tail = 'both',
                        fold = 1.5,
                        variables = numerical_columns)

In [None]:
df_s = winsor_iqr.fit_transform(df[numerical_columns])

In [None]:
# Boxplots of the winsorized columns side by side, to check the capping result.
winsor_fig, winsor_ax = plt.subplots(figsize=(14, 6))  # wide figure so tick labels have room
sns.boxplot(data=df_s[numerical_columns], ax=winsor_ax)
winsor_ax.set_title("Boxplots After Winsorization", fontsize=14)
plt.setp(winsor_ax.get_xticklabels(), rotation=45, ha='right')  # rotate and right-align column names
plt.tight_layout()  # avoid clipping the rotated labels
plt.show()

In [None]:
# Calculating the variance of each numeric variable in the DataFrame
df[numerical_columns].var()

In [None]:
# Checking if the variance of each numeric variable is equal to 0 and returning a boolean Series
df[numerical_columns].var() == 0

# Discretization

In [None]:
# Bin wind speed into four ordered categories: [0, 5], (5, 10], (10, 15], (15, 25].
bins = [0, 5, 10, 15, 25]
labels = ['Very Low', 'Low', 'Medium', 'High']
# include_lowest=True closes the first interval on the left, so a reading of
# exactly 0 is labelled 'Very Low' instead of falling outside (0, 5] and
# becoming NaN. This matches how Wind_direction is binned later in the notebook.
# NOTE(review): readings above 25 still fall outside every bin and become NaN -
# widen the last edge if the data contains such values.
df['WindSpeed_Category'] = pd.cut(df['Wind_speed'], bins=bins, labels=labels, include_lowest=True)

In [None]:
df['WindSpeed_Category']

In [None]:
# Temperature bin edges and labels; temp_bins / temp_labels are reused by the
# Bearing, Nacelle, Wheel-hub and Gear-box-inlet binning cells below.
# Intervals are right-closed: (-50, 0], (0, 15], (15, 30], (30, 50].
# Values outside that range fall in no bin and become NaN.
temp_bins = [-50, 0, 15, 30, 50]  # Adjust based on your data range
temp_labels = ['Very Cold', 'Cold', 'Moderate', 'Hot']
df['Ambient_temp_category'] = pd.cut(df['Ambient_temperature'], bins=temp_bins, labels=temp_labels)

In [None]:
print(df['Ambient_temp_category'].value_counts())

In [None]:
# Wind_direction -> four 90-degree sectors over the range 0 to 360.
direction_bins = [0, 90, 180, 270, 360]
direction_labels = ['North-East', 'East-South', 'South-West', 'West-North']
# include_lowest=True keeps a direction of exactly 0 in the first sector instead of turning it into NaN
df['Wind_direction_sector'] = pd.cut(df['Wind_direction'], bins=direction_bins, labels=direction_labels, include_lowest=True)



In [None]:
df['Wind_direction_sector']

In [None]:
# Bin Bearing_temperature with the shared temp_bins / temp_labels defined in the ambient-temperature cell.
# NOTE(review): those bins stop at 50, so higher readings become NaN - confirm the range fits this sensor.
df['Bearing_temp_category'] = pd.cut(df['Bearing_temperature'], bins=temp_bins, labels=temp_labels)

In [None]:
df['Bearing_temp_category']

In [None]:
 # Bin Nacelle_temperature with the shared temp_bins / temp_labels defined in the ambient-temperature cell.
 # NOTE(review): those bins stop at 50, so higher readings become NaN - confirm the range fits this sensor.
df['Nacelle_temp_category'] = pd.cut(df['Nacelle_temperature'], bins=temp_bins, labels=temp_labels)

In [None]:
df['Nacelle_temp_category']

In [None]:
# Bin Wheel_hub_temperature with the shared temp_bins / temp_labels defined in the ambient-temperature cell.
# NOTE(review): those bins stop at 50, so higher readings become NaN - confirm the range fits this sensor.
df['Hub_temp_category'] = pd.cut(df['Wheel_hub_temperature'], bins=temp_bins, labels=temp_labels)

In [None]:
df['Hub_temp_category']

In [None]:
# Bin Gear_box_inlet_temperature with the shared temp_bins / temp_labels defined in the ambient-temperature cell.
# NOTE(review): those bins stop at 50, so higher readings become NaN - confirm the range fits this sensor.
df['Inlet_temp_category'] = pd.cut(df['Gear_box_inlet_temperature'], bins=temp_bins, labels=temp_labels)

In [None]:
df['Inlet_temp_category']

In [None]:
df.to_csv('wind_turbine_discretized.csv', index=False)
print("Discretized columns saved to wind_turbine_discretized.csv")

# Dummy Variables

In [None]:
df.columns

In [None]:
df.shape

In [None]:
# One-hot encode the categorical columns of df.
# - 'date' is dropped first: at this point it holds raw date strings, and
#   encoding it would create one indicator column per distinct timestamp.
# - dtype='int64' makes only the indicator columns 0/1 integers. Casting the
#   whole result with .astype('int64') also truncated every float sensor
#   reading (e.g. 5.7 -> 5).
df_new = pd.get_dummies(df.drop(columns = ['date'], errors = 'ignore'), dtype = 'int64')

In [None]:
df_new

In [None]:
import pandas as pd
# OneHotEncoder turns each level of a categorical column into its own 0/1 indicator column
from sklearn.preprocessing import OneHotEncoder

# Re-read the raw CSV into a DataFrame (replaces the `df` prepared above)
df = pd.read_csv(r'/content/Wind_Turbine_2025.csv')

# Encode only the non-numeric columns, excluding the 'date' timestamp.
# The former positional slice df.iloc[:, 2:] (left over from another dataset's
# "Position column onwards" template) also fed the continuous sensor readings to
# the encoder, producing one indicator column per distinct numeric value.
onehot_columns = [col for col in df.select_dtypes(exclude = 'number').columns if col != 'date']
if not onehot_columns:
    raise ValueError("No categorical columns found to one-hot encode")

# sparse_output = False makes the encoder return a dense array instead of a sparse matrix
enc = OneHotEncoder(sparse_output = False)

# Fit, transform and wrap the result in a DataFrame with readable column names,
# keeping the row index aligned with df
enc_df = pd.DataFrame(enc.fit_transform(df[onehot_columns]),
                      columns = enc.get_feature_names_out(input_features = onehot_columns),
                      index = df.index)

In [None]:
enc_df

In [None]:
# Normal probability (Q-Q) plot of 'Power' to judge how close it is to a normal distribution.
import pandas as pd
# scipy.stats provides probplot and the normal distribution object
from scipy import stats

# Plotting libraries
import seaborn as sns
import matplotlib.pyplot as plt
# pylab is the plotting target handed to probplot
import pylab

# Re-read the raw CSV into a DataFrame (replaces the `df` prepared above)
df = pd.read_csv(r'/content/Wind_Turbine_2025.csv')

# Drop missing values first: the freshly loaded column is not imputed, and NaNs
# propagate into probplot's sorted sample and least-squares fit, leaving the
# reference line undefined.
prob = stats.probplot(df['Power'].dropna(), dist = stats.norm, plot = pylab)
pylab.show()

In [None]:
# Yeo-Johnson power transformer from feature_engine
from feature_engine.transformation import YeoJohnsonTransformer

# Configure the transformer for the 'Power' variable only.
# NOTE(review): `tf` is only constructed here - no fit/transform call follows in
# the visible notebook, so 'Power' is never actually transformed.
tf = YeoJohnsonTransformer(variables = 'Power')

# Standardization and Normalization

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Start again from the raw CSV and keep only its numeric columns
df = pd.read_csv(r'/content/Wind_Turbine_2025.csv')
df_numerical = df.select_dtypes(include='number')
numerical_columns = df_numerical.columns.tolist()

# Standardize: rescale every column to zero mean and unit variance
scaler = StandardScaler()
standardized_values = scaler.fit_transform(df_numerical)
df_standardized = pd.DataFrame(data=standardized_values, columns=numerical_columns)

# Persist the standardized table
df_standardized.to_csv('wind_turbine_standardized.csv', index=False)
print("Standardized data saved to wind_turbine_standardized.csv")

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Normalize: rescale every numeric column to the [0, 1] range (MinMaxScaler default)
minmax = MinMaxScaler()
normalized_values = minmax.fit_transform(df_numerical)
df_normalized = pd.DataFrame(data=normalized_values, columns=numerical_columns)

# Persist the normalized table
df_normalized.to_csv('wind_turbine_normalized.csv', index=False)
print("Normalized data saved to wind_turbine_normalized.csv")