In [227]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [228]:
train_data = pd.read_csv('dataset/train.csv')
test_data = pd.read_csv('dataset/test.csv')
test_data.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m)
0,WM_19817,2017-04-19 08:53:20,94.324266,17.641186,89.714193,51.146788,40.46056,39.594734,1073.202715,66.830037,...,24.004812,43.756693,445.976992,1664.222023,21.912243,BA,Medium,3.185837,0.403965,25.572431
1,WM_18723,1930-03-19 07:43:20,10.08887,13.978119,43.272846,46.516394,40.027788,41.17686,517.43643,37.284163,...,29.431813,42.728174,499.595287,1165.111992,-35.050093,A,Medium,3.016603,0.444755,24.371823
2,WM_34552,2010-08-19 11:33:20,347.15209,31.423035,41.07664,26.931602,43.109122,43.439556,1480.716492,70.010762,...,29.924235,43.256122,245.432231,1667.720491,27.195302,B2,Medium,2.611941,0.387368,27.654677
3,WM_28570,2026-06-19 03:53:20,24.471997,-99.0,14.375078,66.513953,13.741253,15.577472,887.979475,41.445258,...,23.886434,13.501595,,1329.74474,15.245757,BBB,Low,2.866805,0.450478,24.189426
4,WM_36934,2027-08-19 16:43:20,96.997026,33.281836,41.405192,1.843112,121.572907,43.934587,2053.916354,68.007787,...,35.906889,-99.0,442.425744,691.408996,34.257024,A,Low,3.549672,0.368355,4.88544


In [229]:
drop_columns = ['tracking_id','datetime']
extra = test_data[drop_columns]
train_data.drop(drop_columns,inplace=True,axis=1)
test_data.drop(drop_columns,inplace=True,axis=1)

In [230]:
train_data.isna().sum() / len(train_data) * 100

wind_speed(m/s)                    0.968085
atmospheric_temperature(°C)       12.234043
shaft_temperature(°C)              0.007092
blades_angle(°)                    0.765957
gearbox_temperature(°C)            0.003546
engine_temperature(°C)             0.042553
motor_torque(N-m)                  0.085106
generator_temperature(°C)          0.042553
atmospheric_pressure(Pascal)       9.599291
area_temperature(°C)               0.000000
windmill_body_temperature(°C)      8.379433
wind_direction(°)                 18.095745
resistance(ohm)                    0.003546
rotor_torque(N-m)                  2.028369
turbine_status                     6.237589
cloud_level                        0.978723
blade_length(m)                   18.060284
blade_breadth(m)                   0.000000
windmill_height(m)                 1.925532
windmill_generated_power(kW/h)     0.734043
dtype: float64

In [231]:
train_data.dtypes

wind_speed(m/s)                   float64
atmospheric_temperature(°C)       float64
shaft_temperature(°C)             float64
blades_angle(°)                   float64
gearbox_temperature(°C)           float64
engine_temperature(°C)            float64
motor_torque(N-m)                 float64
generator_temperature(°C)         float64
atmospheric_pressure(Pascal)      float64
area_temperature(°C)              float64
windmill_body_temperature(°C)     float64
wind_direction(°)                 float64
resistance(ohm)                   float64
rotor_torque(N-m)                 float64
turbine_status                     object
cloud_level                        object
blade_length(m)                   float64
blade_breadth(m)                  float64
windmill_height(m)                float64
windmill_generated_power(kW/h)    float64
dtype: object

In [232]:
train_data.loc[:,train_data.dtypes == 'object'].nunique()

turbine_status    14
cloud_level        3
dtype: int64

In [233]:
print(train_data['turbine_status'].value_counts(),'\n'*3,
train_data['cloud_level'].value_counts())

BB     1946
AAA    1939
BCB    1933
B2     1931
A      1930
D      1922
B      1882
AB     1868
ABC    1867
A2     1855
BA     1854
AC     1850
BD     1843
BBB    1821
Name: turbine_status, dtype: int64 


 Low              13921
Medium           13704
Extremely Low      299
Name: cloud_level, dtype: int64


In [234]:
train_data = train_data[train_data['turbine_status'].notna()]
train_data = train_data[train_data['cloud_level'].notna()]
len(train_data)

26176

In [235]:
continous_cols = train_data.dtypes[train_data.dtypes != 'object'].index
continous_cols_test = test_data.dtypes[test_data.dtypes != 'object'].index
(continous_cols)

Index(['wind_speed(m/s)', 'atmospheric_temperature(°C)',
       'shaft_temperature(°C)', 'blades_angle(°)', 'gearbox_temperature(°C)',
       'engine_temperature(°C)', 'motor_torque(N-m)',
       'generator_temperature(°C)', 'atmospheric_pressure(Pascal)',
       'area_temperature(°C)', 'windmill_body_temperature(°C)',
       'wind_direction(°)', 'resistance(ohm)', 'rotor_torque(N-m)',
       'blade_length(m)', 'blade_breadth(m)', 'windmill_height(m)',
       'windmill_generated_power(kW/h)'],
      dtype='object')

In [236]:
for col in continous_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
for col in continous_cols_test:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [237]:
test_data['cloud_level'].value_counts()
test_data['cloud_level'].fillna('Low',inplace = True)

In [238]:
test_data['turbine_status'].value_counts()
test_data['turbine_status'].fillna('BB',inplace = True)

In [239]:
test_data.isna().sum() / len(test_data) * 100

wind_speed(m/s)                  0.0
atmospheric_temperature(°C)      0.0
shaft_temperature(°C)            0.0
blades_angle(°)                  0.0
gearbox_temperature(°C)          0.0
engine_temperature(°C)           0.0
motor_torque(N-m)                0.0
generator_temperature(°C)        0.0
atmospheric_pressure(Pascal)     0.0
area_temperature(°C)             0.0
windmill_body_temperature(°C)    0.0
wind_direction(°)                0.0
resistance(ohm)                  0.0
rotor_torque(N-m)                0.0
turbine_status                   0.0
cloud_level                      0.0
blade_length(m)                  0.0
blade_breadth(m)                 0.0
windmill_height(m)               0.0
dtype: float64

In [240]:
train_data.describe()

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
count,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0,26176.0
mean,68.927439,0.458399,40.06702,-9.743748,41.032699,42.631397,1712.853025,65.090239,52577.84,32.752775,20.894347,306.997781,1575.903442,25.925809,2.213593,0.397071,25.90065,6.133326
std,75.66395,41.475223,27.272877,47.83761,43.722545,6.093881,826.837778,19.809137,177994.7,7.716299,52.032731,121.432648,485.008451,32.075801,10.373646,0.06118,7.705826,2.693214
min,-402.608736,-99.0,-99.0,-146.259543,-244.974098,3.167151,500.0,33.893779,-1188624.0,-30.0,-999.0,0.0,-1005.222988,-136.732217,-99.0,0.200111,-30.295253,0.962305
25%,21.304026,0.458399,41.638432,-1.209093,40.559884,41.913381,871.569504,41.216874,16891.31,27.322601,20.894347,246.746144,1268.604476,13.848275,2.213593,0.347309,24.468078,4.07675
50%,93.261685,14.201454,43.691351,-0.528272,43.229537,43.526359,2032.661501,70.753995,18741.84,32.620193,42.506184,290.140374,1678.402416,31.753112,3.050854,0.398486,25.90065,5.799092
75%,95.24802,22.636619,45.679701,5.366895,45.874994,45.18733,2464.074854,79.000425,116372.6,38.261766,44.33816,332.834028,1830.504684,41.387052,4.156647,0.449093,27.451608,7.932078
max,601.45567,80.217444,169.820455,165.932123,999.0,50.0,3000.0,100.0,1272552.0,55.0,323.0,569.966479,4693.481933,236.883264,18.2098,0.499975,78.351335,20.175358


In [241]:
label_enc = LabelEncoder()
train_data['cloud_level'] = label_enc.fit_transform(train_data['cloud_level'])
train_data['cloud_level'].value_counts()
test_data['cloud_level'] = label_enc.fit_transform(test_data['cloud_level'])
test_data['cloud_level'].value_counts()

1    6057
2    5893
0     136
Name: cloud_level, dtype: int64

In [242]:
train_data['cloud_level'] = train_data['cloud_level'].apply(lambda x:  x + 1)
test_data['cloud_level'] = test_data['cloud_level'].apply(lambda x:  x + 1)
train_data['cloud_level']
train_data['cloud_level'].value_counts()

2    13050
3    12842
1      284
Name: cloud_level, dtype: int64

In [243]:
enc = pd.get_dummies(train_data['turbine_status'])
# print(enc)
train_data.drop('turbine_status',axis = 1,inplace = True)

In [244]:
enc = pd.get_dummies(test_data['turbine_status'])
# print(enc)
test_data.drop('turbine_status',axis = 1,inplace = True)

In [245]:
train_data = train_data.join(enc)
test_data = test_data.join(enc)

In [246]:
train_data.columns

Index(['wind_speed(m/s)', 'atmospheric_temperature(°C)',
       'shaft_temperature(°C)', 'blades_angle(°)', 'gearbox_temperature(°C)',
       'engine_temperature(°C)', 'motor_torque(N-m)',
       'generator_temperature(°C)', 'atmospheric_pressure(Pascal)',
       'area_temperature(°C)', 'windmill_body_temperature(°C)',
       'wind_direction(°)', 'resistance(ohm)', 'rotor_torque(N-m)',
       'cloud_level', 'blade_length(m)', 'blade_breadth(m)',
       'windmill_height(m)', 'windmill_generated_power(kW/h)', 'A', 'A2',
       'AAA', 'AB', 'ABC', 'AC', 'B', 'B2', 'BA', 'BB', 'BBB', 'BCB', 'BD',
       'D'],
      dtype='object')

In [247]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [248]:
x = train_data.drop(['windmill_generated_power(kW/h)'],axis = 1)
y = train_data['windmill_generated_power(kW/h)']
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.8,random_state = 42)

In [249]:
model = XGBRegressor()
model.fit(x_train,y_train)
predictions = model.predict(x_test)
r2_score(predictions,y_test)



0.9276638125464726

In [250]:
test_data

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),...,ABC,AC,B,B2,BA,BB,BBB,BCB,BD,D
0,94.324266,17.641186,89.714193,51.146788,40.460560,39.594734,1073.202715,66.830037,16681.044120,24.004812,...,0,0,0,0,1,0,0,0,0,0
1,10.088870,13.978119,43.272846,46.516394,40.027788,41.176860,517.436430,37.284163,54283.324683,29.431813,...,0,0,0,0,0,0,0,0,0,0
2,347.152090,31.423035,41.076640,26.931602,43.109122,43.439556,1480.716492,70.010762,214812.836200,29.924235,...,0,0,0,1,0,0,0,0,0,0
3,24.471997,-99.000000,14.375078,66.513953,13.741253,15.577472,887.979475,41.445258,54283.324683,23.886434,...,0,0,0,0,0,0,1,0,0,0
4,96.997026,33.281836,41.405192,1.843112,121.572907,43.934587,2053.916354,68.007787,16833.546520,35.906889,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12081,-15.039400,18.517204,43.545409,-1.094038,44.550547,44.486619,2424.370244,75.850209,17239.345510,28.424722,...,0,0,0,0,0,1,0,0,0,0
12082,97.594116,-99.000000,44.747916,1.750553,43.308176,42.848779,2085.627252,73.091866,17131.448670,50.615296,...,0,0,0,0,0,0,0,0,0,0
12083,16.186805,8.109976,43.252012,-0.691090,214.904165,41.177782,788.687177,39.514258,118707.643700,20.983939,...,0,0,0,0,1,0,0,0,0,0
12084,93.254033,-99.000000,43.841789,-1.171974,41.976913,42.039796,2079.855634,71.482324,17155.269310,29.114305,...,0,0,0,0,0,0,0,0,0,0


In [251]:
submission = model.predict(test_data)
submission.shape

(12086,)

In [252]:
extra = extra.join(pd.DataFrame(submission))
target = 'windmill_generated_power(kW/h)'
extra[target] = extra[0]
extra.drop([0],inplace = True,axis = 1)
extra

Unnamed: 0,tracking_id,datetime,windmill_generated_power(kW/h)
0,WM_19817,2017-04-19 08:53:20,2.605420
1,WM_18723,1930-03-19 07:43:20,2.872419
2,WM_34552,2010-08-19 11:33:20,3.039419
3,WM_28570,2026-06-19 03:53:20,7.504499
4,WM_36934,2027-08-19 16:43:20,3.874941
...,...,...,...
12081,WM_13376,2012-02-19 11:33:20,6.796721
12082,WM_1630,2012-11-18 17:33:20,3.578464
12083,WM_24703,2027-05-19 11:53:20,5.202962
12084,WM_22893,2013-05-19 21:53:20,4.920625


In [253]:
extra.to_csv('submission-1.csv',index = False)