# Exploratory Data Analysis (EDA)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from scipy import stats
from scipy.stats import normaltest, shapiro

# Notebook-wide configuration: warning filter, plot styling and pandas display options.
# NOTE(review): this silences every warning category for the rest of the session,
# including deprecation notices from pandas/seaborn — consider narrowing the filter.
warnings.filterwarnings('ignore')
# Matplotlib style sheet and seaborn colour palette applied to all subsequent figures.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")
# Render figures inline in the notebook output.
%matplotlib inline

# Remove the pandas limits on displayed columns and output width so wide frames are shown in full.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

Matplotlib is building the font cache; this may take a moment.


### 1. Data Loading

In [3]:
# Column layout shared by the train and test files: two identifier columns,
# three operating settings, then sensor_1 .. sensor_21 (26 columns in total).
columns = ['unit_id', 'time_cycles', 'op_setting_1', 'op_setting_2', 'op_setting_3'] + \
          [f'sensor_{i}' for i in range(1, 22)]

# The files are whitespace-delimited with no header row. The separator is a
# regex, so it is written as a raw string: in a plain literal '\s' is an
# invalid escape sequence (a SyntaxWarning on recent Python versions).
train_df = pd.read_csv('../data/raw/train_FD001.txt', sep=r'\s+', header=None, names=columns)
test_df = pd.read_csv('../data/raw/test_FD001.txt', sep=r'\s+', header=None, names=columns)
# Single-column file of RUL labels.
rul_df = pd.read_csv('../data/raw/RUL_FD001.txt', sep=r'\s+', header=None, names=['RUL'])

print("Data loaded successfully!")
print(f"Training set shape: {train_df.shape}")
print(f"Test set shape: {test_df.shape}")
print(f"RUL labels shape: {rul_df.shape}")

Data loaded successfully!
Training set shape: (20631, 26)
Test set shape: (13096, 26)
RUL labels shape: (100, 1)


### 2. Initial Data Inspection

In [4]:
# Show the first and last five rows of the training frame, each under its own caption.
for caption, preview in (
    ("First 5 Rows Of Data", train_df.head()),
    ("Last 5 Rows Of Data", train_df.tail()),
):
    print(caption)
    display(preview)

First 5 Rows Of Data


Unnamed: 0,unit_id,time_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,21.61,554.36,2388.06,9046.19,1.3,47.47,521.66,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,21.61,553.75,2388.04,9044.07,1.3,47.49,522.28,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,21.61,554.26,2388.08,9052.94,1.3,47.27,522.42,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,21.61,554.45,2388.11,9049.48,1.3,47.13,522.86,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,21.61,554.0,2388.06,9055.15,1.3,47.28,522.19,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044


Last 5 Rows Of Data


Unnamed: 0,unit_id,time_cycles,op_setting_1,op_setting_2,op_setting_3,sensor_1,sensor_2,sensor_3,sensor_4,sensor_5,sensor_6,sensor_7,sensor_8,sensor_9,sensor_10,sensor_11,sensor_12,sensor_13,sensor_14,sensor_15,sensor_16,sensor_17,sensor_18,sensor_19,sensor_20,sensor_21
20626,100,196,-0.0004,-0.0003,100.0,518.67,643.49,1597.98,1428.63,14.62,21.61,551.43,2388.19,9065.52,1.3,48.07,519.49,2388.26,8137.6,8.4956,0.03,397,2388,100.0,38.49,22.9735
20627,100,197,-0.0016,-0.0005,100.0,518.67,643.54,1604.5,1433.58,14.62,21.61,550.86,2388.23,9065.11,1.3,48.04,519.68,2388.22,8136.5,8.5139,0.03,395,2388,100.0,38.3,23.1594
20628,100,198,0.0004,0.0,100.0,518.67,643.42,1602.46,1428.18,14.62,21.61,550.94,2388.24,9065.9,1.3,48.09,520.01,2388.24,8141.05,8.5646,0.03,398,2388,100.0,38.44,22.9333
20629,100,199,-0.0011,0.0003,100.0,518.67,643.23,1605.26,1426.53,14.62,21.61,550.68,2388.25,9073.72,1.3,48.39,519.67,2388.23,8139.29,8.5389,0.03,395,2388,100.0,38.29,23.064
20630,100,200,-0.0032,-0.0005,100.0,518.67,643.85,1600.38,1432.14,14.62,21.61,550.79,2388.26,9061.48,1.3,48.2,519.3,2388.26,8137.33,8.5036,0.03,396,2388,100.0,38.37,23.0522


In [5]:
# Print the frame summary (index range, per-column non-null counts and dtypes).
print("Data Information")
train_df.info()

Data Information
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20631 entries, 0 to 20630
Data columns (total 26 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   unit_id       20631 non-null  int64  
 1   time_cycles   20631 non-null  int64  
 2   op_setting_1  20631 non-null  float64
 3   op_setting_2  20631 non-null  float64
 4   op_setting_3  20631 non-null  float64
 5   sensor_1      20631 non-null  float64
 6   sensor_2      20631 non-null  float64
 7   sensor_3      20631 non-null  float64
 8   sensor_4      20631 non-null  float64
 9   sensor_5      20631 non-null  float64
 10  sensor_6      20631 non-null  float64
 11  sensor_7      20631 non-null  float64
 12  sensor_8      20631 non-null  float64
 13  sensor_9      20631 non-null  float64
 14  sensor_10     20631 non-null  float64
 15  sensor_11     20631 non-null  float64
 16  sensor_12     20631 non-null  float64
 17  sensor_13     20631 non-null  float64
 18  sensor_14

In [6]:
print("Missing Values Check")
# Per-column count of null entries in the training frame.
missing = train_df.isna().sum()
# Counts are non-negative, so a non-zero total means at least one column has nulls;
# in that case list only the affected columns.
if missing.sum() != 0:
    print(missing[missing > 0])
else:
    print("No missing values found!")

Missing Values Check
No missing values found!


In [7]:
print("Duplicate Check")
# Number of rows that are exact copies of an earlier row.
duplicates = train_df.duplicated().sum()
print(f"Number of duplicate rows: {duplicates}")
print("Warning: Duplicates found!" if duplicates > 0 else "No duplicates found!")

Duplicate Check
Number of duplicate rows: 0
No duplicates found!


### 3. Statistical Summary

In [8]:
print("Statistical Summary")
# Transpose so that each column of train_df becomes one row of the summary table.
display(train_df.describe().transpose())

Statistical Summary


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_id,20631.0,51.506568,29.22763,1.0,26.0,52.0,77.0,100.0
time_cycles,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
op_setting_1,20631.0,-9e-06,0.002187313,-0.0087,-0.0015,0.0,0.0015,0.0087
op_setting_2,20631.0,2e-06,0.0002930621,-0.0006,-0.0002,0.0,0.0003,0.0006
op_setting_3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
sensor_1,20631.0,518.67,0.0,518.67,518.67,518.67,518.67,518.67
sensor_2,20631.0,642.680934,0.5000533,641.21,642.325,642.64,643.0,644.53
sensor_3,20631.0,1590.523119,6.13115,1571.04,1586.26,1590.1,1594.38,1616.91
sensor_4,20631.0,1408.933782,9.000605,1382.25,1402.36,1408.04,1414.555,1441.49
sensor_5,20631.0,14.62,1.7764e-15,14.62,14.62,14.62,14.62,14.62


In [9]:
print("Data Distribution Metrics")

# Per-column shape statistics. pandas' kurtosis() uses Fisher's definition,
# i.e. *excess* kurtosis, for which a normal distribution scores 0 (not 3).
distribution_stats = pd.DataFrame({
    'skewness': train_df.skew(),
    'kurtosis': train_df.kurtosis()
})

print("\nSkewness interpretation:")
print("  < -1 or > 1: Highly skewed")
print("  -1 to -0.5 or 0.5 to 1: Moderately skewed")
print("  -0.5 to 0.5: Approximately symmetric")
# The legend thresholds match the excess-kurtosis values shown in the table.
print("\nKurtosis interpretation (excess kurtosis, Fisher's definition):")
print("  > 0: Heavy tails (more outliers)")
print("  = 0: Normal distribution")
print("  < 0: Light tails (fewer outliers)")

display(distribution_stats)

Data Distribution Metrics

Skewness interpretation:
  < -1 or > 1: Highly skewed
  -1 to -0.5 or 0.5 to 1: Moderately skewed
  -0.5 to 0.5: Approximately symmetric

Kurtosis interpretation:
  > 3: Heavy tails (more outliers)
  = 3: Normal distribution
  < 3: Light tails (fewer outliers)


Unnamed: 0,skewness,kurtosis
unit_id,-0.067815,-1.219824
time_cycles,0.499904,-0.218539
op_setting_1,-0.024766,-0.009132
op_setting_2,0.009085,-1.130447
op_setting_3,0.0,0.0
sensor_1,0.0,0.0
sensor_2,0.316526,-0.112043
sensor_3,0.308946,0.007762
sensor_4,0.443194,-0.163681
sensor_5,0.0,0.0
