In [368]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

from sklearn.metrics import r2_score
from sklearn.preprocessing import LabelEncoder,OneHotEncoder

In [369]:
train_data = pd.read_csv('dataset/train.csv')
test_data = pd.read_csv('dataset/test.csv')
test_data.head()

Unnamed: 0,tracking_id,datetime,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),...,area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),turbine_status,cloud_level,blade_length(m),blade_breadth(m),windmill_height(m)
0,WM_19817,2019-04-17 08:53:20,94.324266,17.641186,89.714193,51.146788,40.46056,39.594734,1073.202715,66.830037,...,24.004812,43.756693,445.976992,1664.222023,21.912243,BA,Medium,3.185837,0.403965,25.572431
1,WM_18723,2019-03-30 07:43:20,10.08887,13.978119,43.272846,46.516394,40.027788,41.17686,517.43643,37.284163,...,29.431813,42.728174,499.595287,1165.111992,-35.050093,A,Medium,3.016603,0.444755,24.371823
2,WM_34552,2019-08-10 11:33:20,347.15209,31.423035,41.07664,26.931602,43.109122,43.439556,1480.716492,70.010762,...,29.924235,43.256122,245.432231,1667.720491,27.195302,B2,Medium,2.611941,0.387368,27.654677
3,WM_28570,2019-06-26 03:53:20,24.471997,-99.0,14.375078,66.513953,13.741253,15.577472,887.979475,41.445258,...,23.886434,13.501595,,1329.74474,15.245757,BBB,Low,2.866805,0.450478,24.189426
4,WM_36934,2019-08-27 16:43:20,96.997026,33.281836,41.405192,1.843112,121.572907,43.934587,2053.916354,68.007787,...,35.906889,-99.0,442.425744,691.408996,34.257024,A,Low,3.549672,0.368355,4.88544


In [370]:
drop_columns = ['tracking_id','datetime']
extra = test_data[drop_columns]
train_data.drop(drop_columns,inplace=True,axis=1)
test_data.drop(drop_columns,inplace=True,axis=1)

In [371]:
train_data.isna().sum() / len(train_data) * 100

wind_speed(m/s)                    0.968085
atmospheric_temperature(°C)       12.234043
shaft_temperature(°C)              0.007092
blades_angle(°)                    0.765957
gearbox_temperature(°C)            0.003546
engine_temperature(°C)             0.042553
motor_torque(N-m)                  0.085106
generator_temperature(°C)          0.042553
atmospheric_pressure(Pascal)       9.599291
area_temperature(°C)               0.000000
windmill_body_temperature(°C)      8.379433
wind_direction(°)                 18.095745
resistance(ohm)                    0.003546
rotor_torque(N-m)                  2.028369
turbine_status                     6.237589
cloud_level                        0.978723
blade_length(m)                   18.060284
blade_breadth(m)                   0.000000
windmill_height(m)                 1.925532
windmill_generated_power(kW/h)     0.734043
dtype: float64

In [372]:
train_data.dtypes

wind_speed(m/s)                   float64
atmospheric_temperature(°C)       float64
shaft_temperature(°C)             float64
blades_angle(°)                   float64
gearbox_temperature(°C)           float64
engine_temperature(°C)            float64
motor_torque(N-m)                 float64
generator_temperature(°C)         float64
atmospheric_pressure(Pascal)      float64
area_temperature(°C)              float64
windmill_body_temperature(°C)     float64
wind_direction(°)                 float64
resistance(ohm)                   float64
rotor_torque(N-m)                 float64
turbine_status                     object
cloud_level                        object
blade_length(m)                   float64
blade_breadth(m)                  float64
windmill_height(m)                float64
windmill_generated_power(kW/h)    float64
dtype: object

In [373]:
train_data.loc[:,train_data.dtypes == 'object'].nunique()

turbine_status    14
cloud_level        3
dtype: int64

In [374]:
print(train_data['turbine_status'].value_counts(),'\n'*3,
train_data['cloud_level'].value_counts())

BB     1946
AAA    1939
BCB    1933
B2     1931
A      1930
D      1922
B      1882
AB     1868
ABC    1867
A2     1855
BA     1854
AC     1850
BD     1843
BBB    1821
Name: turbine_status, dtype: int64 


 Low              13921
Medium           13704
Extremely Low      299
Name: cloud_level, dtype: int64


In [375]:
continous_cols = train_data.dtypes[train_data.dtypes != 'object'].index
continous_cols_test = test_data.dtypes[test_data.dtypes != 'object'].index
(continous_cols)

Index(['wind_speed(m/s)', 'atmospheric_temperature(°C)',
       'shaft_temperature(°C)', 'blades_angle(°)', 'gearbox_temperature(°C)',
       'engine_temperature(°C)', 'motor_torque(N-m)',
       'generator_temperature(°C)', 'atmospheric_pressure(Pascal)',
       'area_temperature(°C)', 'windmill_body_temperature(°C)',
       'wind_direction(°)', 'resistance(ohm)', 'rotor_torque(N-m)',
       'blade_length(m)', 'blade_breadth(m)', 'windmill_height(m)',
       'windmill_generated_power(kW/h)'],
      dtype='object')

In [376]:
for col in continous_cols:
    train_data[col] = train_data[col].fillna(train_data[col].mean())
for col in continous_cols_test:
    test_data[col] = test_data[col].fillna(test_data[col].mean())

In [377]:
train_data['cloud_level'].value_counts()
train_data['cloud_level'].fillna('Low',inplace = True)

In [378]:
train_data['turbine_status'].value_counts()
train_data['turbine_status'].fillna('BB',inplace = True)


In [379]:
test_data['cloud_level'].value_counts()
test_data['cloud_level'].fillna('Low',inplace = True)

In [380]:
test_data['turbine_status'].value_counts()
test_data['turbine_status'].fillna('BB',inplace = True)


In [381]:
test_data.isna().sum() / len(test_data) * 100

wind_speed(m/s)                  0.0
atmospheric_temperature(°C)      0.0
shaft_temperature(°C)            0.0
blades_angle(°)                  0.0
gearbox_temperature(°C)          0.0
engine_temperature(°C)           0.0
motor_torque(N-m)                0.0
generator_temperature(°C)        0.0
atmospheric_pressure(Pascal)     0.0
area_temperature(°C)             0.0
windmill_body_temperature(°C)    0.0
wind_direction(°)                0.0
resistance(ohm)                  0.0
rotor_torque(N-m)                0.0
turbine_status                   0.0
cloud_level                      0.0
blade_length(m)                  0.0
blade_breadth(m)                 0.0
windmill_height(m)               0.0
dtype: float64

In [382]:
train_data.describe()

Unnamed: 0,wind_speed(m/s),atmospheric_temperature(°C),shaft_temperature(°C),blades_angle(°),gearbox_temperature(°C),engine_temperature(°C),motor_torque(N-m),generator_temperature(°C),atmospheric_pressure(Pascal),area_temperature(°C),windmill_body_temperature(°C),wind_direction(°),resistance(ohm),rotor_torque(N-m),blade_length(m),blade_breadth(m),windmill_height(m),windmill_generated_power(kW/h)
count,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0,28200.0
mean,69.037071,0.383727,40.085387,-9.654038,41.027755,42.614239,1710.819803,65.027857,53185.06,32.735091,20.799761,306.888883,1575.560011,25.849894,2.254034,0.397249,25.887052,6.130529
std,75.905527,41.481572,27.203465,47.734286,43.662831,6.123243,826.853447,19.812283,178276.8,7.703391,52.029149,121.321415,483.317824,32.093408,10.206698,0.061158,7.698401,2.687601
min,-496.211029,-99.0,-99.0,-146.259543,-244.974098,3.167151,500.0,33.893779,-1188624.0,-30.0,-999.0,0.0,-1005.222988,-136.732217,-99.0,0.200111,-30.295253,0.962305
25%,21.250907,0.383727,41.632797,-1.206613,40.557993,41.911899,870.621152,41.200625,16892.69,27.311644,20.799761,246.793261,1268.141402,13.829618,2.254034,0.347445,24.46938,4.076167
50%,93.256368,14.150602,43.685853,-0.527911,43.221521,43.524018,2031.299986,70.722083,18753.03,32.605195,42.495191,290.113549,1678.231923,31.549128,3.049007,0.398591,25.899163,5.791403
75%,95.24802,22.575311,45.673234,5.370015,45.879212,45.173397,2462.164052,78.939478,116425.2,38.232387,44.325305,332.325695,1829.044723,41.337104,4.156647,0.449354,27.45108,7.931295
max,601.45567,80.217444,169.820455,165.932123,999.0,50.0,3000.0,100.0,1272552.0,55.0,323.0,569.966479,4693.481933,236.883264,18.2098,0.499975,78.351335,20.175358


In [383]:
label_enc = LabelEncoder()
train_data['cloud_level'] = label_enc.fit_transform(train_data['cloud_level'])
train_data['cloud_level'].value_counts()
test_data['cloud_level'] = label_enc.fit_transform(test_data['cloud_level'])
test_data['cloud_level'].value_counts()

1    6057
2    5893
0     136
Name: cloud_level, dtype: int64

In [384]:
train_data['cloud_level'] = train_data['cloud_level'].apply(lambda x:  x + 1)
test_data['cloud_level'] = test_data['cloud_level'].apply(lambda x:  x + 1)
train_data['cloud_level']
train_data['cloud_level'].value_counts()

2    14197
3    13704
1      299
Name: cloud_level, dtype: int64

In [385]:
enc = pd.get_dummies(train_data['turbine_status'])
train_data.drop('turbine_status',axis = 1,inplace = True)

In [386]:
enc = pd.get_dummies(test_data['turbine_status'])
test_data.drop('turbine_status',axis = 1,inplace = True)

In [387]:
train_data = train_data.join(enc)
test_data = test_data.join(enc)

In [388]:
train_data.columns

Index(['wind_speed(m/s)', 'atmospheric_temperature(°C)',
       'shaft_temperature(°C)', 'blades_angle(°)', 'gearbox_temperature(°C)',
       'engine_temperature(°C)', 'motor_torque(N-m)',
       'generator_temperature(°C)', 'atmospheric_pressure(Pascal)',
       'area_temperature(°C)', 'windmill_body_temperature(°C)',
       'wind_direction(°)', 'resistance(ohm)', 'rotor_torque(N-m)',
       'cloud_level', 'blade_length(m)', 'blade_breadth(m)',
       'windmill_height(m)', 'windmill_generated_power(kW/h)', 'A', 'A2',
       'AAA', 'AB', 'ABC', 'AC', 'B', 'B2', 'BA', 'BB', 'BBB', 'BCB', 'BD',
       'D'],
      dtype='object')

In [389]:
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [390]:
x = train_data.drop(['windmill_generated_power(kW/h)'],axis = 1)
y = train_data['windmill_generated_power(kW/h)']
x_train,x_test,y_train,y_test = train_test_split(x,y,train_size = 0.8,random_state = 42)

In [391]:
model = XGBRegressor(n_estimators=500,max_depth=5,booster='gbtree',n_jobs=-1,learning_rate=0.1,reg_lambda=0.01,reg_alpha=0.3)
model.fit(x_train,y_train)
predictions = model.predict(x_test)
r2_score(predictions,y_test)



0.9527556451532524

In [392]:
submission = model.predict(test_data)
submission.shape

(12086,)

In [393]:
extra = extra.join(pd.DataFrame(submission))
target = 'windmill_generated_power(kW/h)'
extra[target] = extra[0]
extra.drop([0],inplace = True,axis = 1)
extra

Unnamed: 0,tracking_id,datetime,windmill_generated_power(kW/h)
0,WM_19817,2019-04-17 08:53:20,2.547599
1,WM_18723,2019-03-30 07:43:20,2.360835
2,WM_34552,2019-08-10 11:33:20,3.216631
3,WM_28570,2019-06-26 03:53:20,7.135807
4,WM_36934,2019-08-27 16:43:20,3.556352
...,...,...,...
12081,WM_13376,2019-02-12 11:33:20,6.774065
12082,WM_1630,2018-11-12 17:33:20,3.269932
12083,WM_24703,2019-05-27 11:53:20,5.397360
12084,WM_22893,2019-05-13 21:53:20,5.464162


In [394]:
extra.to_csv('submission.csv',header = True,index = False)