In [151]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [138]:
train_data = pd.read_csv('dataset/train.csv')
train_data.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
0,WM_33725,2019-08-04 14:33:20,94.820023,-99.0,41.723019,-0.903423,82.410573,42.523015,2563.124522,76.66556,...,,239.836388,2730.310605,42.084666,BA,Medium,2.217542,0.314065,24.281689,6.766521
1,WM_698,2018-11-05 10:13:20,241.832734,27.764785,-99.0,-99.0,44.104919,46.25887,2372.384119,78.129803,...,,337.944723,1780.2072,107.888643,A2,Medium,4.210346,0.448494,27.262139,5.966275
2,WM_39146,2019-09-14 14:03:20,95.484724,,41.855473,12.652763,42.322098,42.878552,1657.169646,67.654469,...,45.033197,227.850294,1666.0499,-42.931459,ABC,Medium,2.719475,0.302321,27.366127,2.874342
3,WM_6757,2018-12-25 15:33:20,238.819424,-99.0,45.443914,15.115323,44.759643,47.282101,2888.134079,95.389974,...,44.827154,492.08152,1964.502895,42.744596,ABC,,4.857385,0.36714,24.287767,14.851089
4,WM_21521,2019-05-04 03:13:20,10.72289,,41.981183,1.715696,-17.616459,43.469852,781.695419,37.423065,...,-99.0,259.274601,1177.516152,13.387289,AAA,Medium,,0.453374,27.97165,3.519074


In [139]:
drop_columns = ['tracking_id','datetime']
train_data.drop(drop_columns,inplace=True,axis=1)

In [140]:
train_data.isna().sum() / len(train_data) * 100

wind_speed(m/s)                    0.968085
atmospheric_temperature(°C)       12.234043
shaft_temperature(°C)              0.007092
blades_angle(°)                    0.765957
gearbox_temperature(°C)            0.003546
engine_temperature(°C)             0.042553
motor_torque(N-m)                  0.085106
generator_temperature(°C)          0.042553
atmospheric_pressure(Pascal)       9.599291
area_temperature(°C)               0.000000
windmill_body_temperature(°C)      8.379433
wind_direction(°)                 18.095745
resistance(ohm)                    0.003546
rotor_torque(N-m)                  2.028369
turbine_status                     6.237589
cloud_level                        0.978723
blade_length(m)                   18.060284
blade_breadth(m)                   0.000000
windmill_height(m)                 1.925532
windmill_generated_power(kW/h)     0.734043
dtype: float64

In [141]:
train_data.dtypes

wind_speed(m/s)                   float64
atmospheric_temperature(°C)       float64
shaft_temperature(°C)             float64
blades_angle(°)                   float64
gearbox_temperature(°C)           float64
engine_temperature(°C)            float64
motor_torque(N-m)                 float64
generator_temperature(°C)         float64
atmospheric_pressure(Pascal)      float64
area_temperature(°C)              float64
windmill_body_temperature(°C)     float64
wind_direction(°)                 float64
resistance(ohm)                   float64
rotor_torque(N-m)                 float64
turbine_status                     object
cloud_level                        object
blade_length(m)                   float64
blade_breadth(m)                  float64
windmill_height(m)                float64
windmill_generated_power(kW/h)    float64
dtype: object

In [142]:
train_data.loc[:,train_data.dtypes == 'object'].nunique()

turbine_status    14
cloud_level        3
dtype: int64

In [143]:
print(train_data['turbine_status'].value_counts(),'\n'*3,
train_data['cloud_level'].value_counts())

BB     1946
AAA    1939
BCB    1933
B2     1931
A      1930
D      1922
B      1882
AB     1868
ABC    1867
A2     1855
BA     1854
AC     1850
BD     1843
BBB    1821
Name: turbine_status, dtype: int64 


 Low              13921
Medium           13704
Extremely Low      299
Name: cloud_level, dtype: int64


In [144]:
train_data = train_data[train_data['turbine_status'].notna()]
train_data = train_data[train_data['cloud_level'].notna()]
len(train_data)

26176

In [145]:
continous_cols = train_data.dtypes[train_data.dtypes != 'object'].index
(continous_cols)

Index(['wind_speed(m/s)', 'atmospheric_temperature(°C)',
       'shaft_temperature(°C)', 'blades_angle(°)', 'gearbox_temperature(°C)',
       'engine_temperature(°C)', 'motor_torque(N-m)',
       'generator_temperature(°C)', 'atmospheric_pressure(Pascal)',
       'area_temperature(°C)', 'windmill_body_temperature(°C)',
       'wind_direction(°)', 'resistance(ohm)', 'rotor_torque(N-m)',
       'blade_length(m)', 'blade_breadth(m)', 'windmill_height(m)',
       'windmill_generated_power(kW/h)'],
      dtype='object')

In [146]:
for col in continous_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mean())

In [147]:
train_data.isna().sum() / len(train_data) * 100

wind_speed(m/s)                   0.0
atmospheric_temperature(°C)       0.0
shaft_temperature(°C)             0.0
blades_angle(°)                   0.0
gearbox_temperature(°C)           0.0
engine_temperature(°C)            0.0
motor_torque(N-m)                 0.0
generator_temperature(°C)         0.0
atmospheric_pressure(Pascal)      0.0
area_temperature(°C)              0.0
windmill_body_temperature(°C)     0.0
wind_direction(°)                 0.0
resistance(ohm)                   0.0
rotor_torque(N-m)                 0.0
turbine_status                    0.0
cloud_level                       0.0
blade_length(m)                   0.0
blade_breadth(m)                  0.0
windmill_height(m)                0.0
windmill_generated_power(kW/h)    0.0
dtype: float64

In [148]:
train_data.describe()

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
count,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0
mean,68.927439,0.458399,40.06702,-9.743748,41.032699,42.631397,1712.853025,65.090239,52577.84,32.752775,20.894347,306.997781,1575.903442,25.925809,2.213593,0.397071,25.90065,6.133326
std,75.66395,41.475223,27.272877,47.83761,43.722545,6.093881,826.837778,19.809137,177994.7,7.716299,52.032731,121.432648,485.008451,32.075801,10.373646,0.06118,7.705826,2.693214
min,-402.608736,-99.0,-99.0,-146.259543,-244.974098,3.167151,500.0,33.893779,-1188624.0,-30.0,-999.0,0.0,-1005.222988,-136.732217,-99.0,0.200111,-30.295253,0.962305
25%,21.304026,0.458399,41.638432,-1.209093,40.559884,41.913381,871.569504,41.216874,16891.31,27.322601,20.894347,246.746144,1268.604476,13.848275,2.213593,0.347309,24.468078,4.07675
50%,93.261685,14.201454,43.691351,-0.528272,43.229537,43.526359,2032.661501,70.753995,18741.84,32.620193,42.506184,290.140374,1678.402416,31.753112,3.050854,0.398486,25.90065,5.799092
75%,95.24802,22.636619,45.679701,5.366895,45.874994,45.18733,2464.074854,79.000425,116372.6,38.261766,44.33816,332.834028,1830.504684,41.387052,4.156647,0.449093,27.451608,7.932078
max,601.45567,80.217444,169.820455,165.932123,999.0,50.0,3000.0,100.0,1272552.0,55.0,323.0,569.966479,4693.481933,236.883264,18.2098,0.499975,78.351335,20.175358


In [149]:
label_enc = LabelEncoder()
train_data['cloud_level'] = label_enc.fit_transform(train_data['cloud_level'])
train_data['cloud_level'].value_counts()

1    13050
2    12842
0      284
Name: cloud_level, dtype: int64

In [150]:
train_data['cloud_level'] = train_data['cloud_level'].apply(lambda x:  x + 1)
train_data['cloud_level']
train_data['cloud_level'].value_counts()

2    13050
3    12842
1      284
Name: cloud_level, dtype: int64

In [153]:
oh_enc = OneHotEncoder()
train_data['turbine_status'] = oh_enc.fit_transform(train_data['turbine_status'])
train_data['turbine_status'].value_counts()

ValueError: Expected 2D array, got 1D array instead:
array=[ 8  1  4 ... 13 11 10].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.