In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
def import_data(relative_path, base_dir=None):
    """Read a CSV file into a pandas DataFrame.

    Parameters
    ----------
    relative_path : str
        Path to the CSV file. A relative path is resolved against
        ``base_dir``; an absolute path is used unchanged.
    base_dir : str, optional
        Directory that ``relative_path`` is relative to. Defaults to the
        current working directory, which is what the previous
        implementation effectively used.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the CSV file.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when the resolved path does not exist.
    """
    # The earlier version computed os.path.dirname('clean-data.ipynb'), which
    # is always the empty string, so the "working directory" never contributed
    # anything. Resolve against an explicit base directory instead.
    if base_dir is None:
        base_dir = os.getcwd()
    path_to_file = os.path.join(base_dir, relative_path)

    return pd.read_csv(path_to_file)

In [3]:
# Display settings: show every column of a DataFrame and do not wrap wide
# frames across several blocks when they are rendered as text.
pd.options.display.max_columns = None
pd.options.display.expand_frame_repr = False

# Location of the raw input CSV and the directory that receives the exports.
DATAPATH = 'data-BerkeleyCA/pv_data.csv'
EXPORT_DIR = 'data-CLEAN'

In [4]:
# Load the raw CSV referenced by DATAPATH into a DataFrame.
rawdata = import_data(DATAPATH)

In [5]:
# Number of axes of the loaded frame.
rawdata.ndim

2

In [6]:
# (rows, columns) of the loaded frame.
rawdata.shape

(2920, 16)

In [7]:
# Per-column summary: name, non-null count and dtype.
rawdata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2920 entries, 0 to 2919
Data columns (total 16 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   Day of Year                           2920 non-null   int64  
 1   Year                                  2920 non-null   int64  
 2   Month                                 2920 non-null   int64  
 3   Day                                   2920 non-null   int64  
 4   First Hour of Period                  2920 non-null   int64  
 5   Is Daylight                           2920 non-null   bool   
 6   Distance to Solar Noon                2920 non-null   float64
 7   Average Temperature (Day)             2920 non-null   int64  
 8   Average Wind Direction (Day)          2920 non-null   int64  
 9   Average Wind Speed (Day)              2920 non-null   float64
 10  Sky Cover                             2920 non-null   int64  
 11  Visibility       

In [8]:
# True when at least one cell anywhere in the frame is missing.
rawdata.isna().any().any()

True

In [9]:
# Total number of missing cells across all columns.
rawdata.isna().sum().sum()

1

In [10]:
# Per-column flag: does the column contain any missing value?
rawdata.isna().any()

Day of Year                             False
Year                                    False
Month                                   False
Day                                     False
First Hour of Period                    False
Is Daylight                             False
Distance to Solar Noon                  False
Average Temperature (Day)               False
Average Wind Direction (Day)            False
Average Wind Speed (Day)                False
Sky Cover                               False
Visibility                              False
Relative Humidity                       False
Average Wind Speed (Period)              True
Average Barometric Pressure (Period)    False
Power Generated                         False
dtype: bool

In [11]:
# Index labels of the rows that contain at least one missing value.
rawdata.index[rawdata.isna().any(axis=1)]

Int64Index([714], dtype='int64')

In [12]:
# Row immediately before row 714, inspected for context.
rawdata.loc[713]

Day of Year                                  334
Year                                        2008
Month                                         11
Day                                           29
First Hour of Period                           4
Is Daylight                                False
Distance to Solar Noon                  0.713311
Average Temperature (Day)                     57
Average Wind Direction (Day)                  12
Average Wind Speed (Day)                     2.8
Sky Cover                                      4
Visibility                                   4.0
Relative Humidity                             96
Average Wind Speed (Period)                  5.0
Average Barometric Pressure (Period)       30.17
Power Generated                                0
Name: 713, dtype: object

In [13]:
# Row 714 itself, to see which field is missing.
rawdata.loc[714]

Day of Year                                  334
Year                                        2008
Month                                         11
Day                                           29
First Hour of Period                           7
Is Daylight                                 True
Distance to Solar Noon                  0.406143
Average Temperature (Day)                     57
Average Wind Direction (Day)                  12
Average Wind Speed (Day)                     2.8
Sky Cover                                      2
Visibility                                   4.0
Relative Humidity                             96
Average Wind Speed (Period)                  NaN
Average Barometric Pressure (Period)       30.16
Power Generated                              159
Name: 714, dtype: object

In [14]:
# Row immediately after row 714, inspected for context.
rawdata.loc[715]

Day of Year                                  334
Year                                        2008
Month                                         11
Day                                           29
First Hour of Period                          10
Is Daylight                                 True
Distance to Solar Noon                  0.098976
Average Temperature (Day)                     57
Average Wind Direction (Day)                  12
Average Wind Speed (Day)                     2.8
Sky Cover                                      3
Visibility                                   7.0
Relative Humidity                             80
Average Wind Speed (Period)                  8.0
Average Barometric Pressure (Period)       30.19
Power Generated                             7229
Name: 715, dtype: object

In [15]:
# Fill gaps in the per-period wind speed by linear interpolation between the
# neighbouring rows instead of hard-coding row labels. For a single missing
# value this is the mean of the rows directly before and after it, i.e. the
# same value the previous hand-written (row 713 + row 715) / 2 produced for
# row 714. `limit_area='inside'` restricts filling to gaps that have a valid
# value on both sides, so leading/trailing gaps are never extrapolated.
wind_speed_col = 'Average Wind Speed (Period)'
rawdata[wind_speed_col] = rawdata[wind_speed_col].interpolate(
    method='linear', limit_area='inside'
)

In [16]:
# Re-check: is any cell in the frame still missing?
rawdata.isna().any().any()

False

In [17]:
# Target: the 'Power Generated' column as a Series.
y = rawdata['Power Generated']
y.shape

(2920,)

In [18]:
# Features: every column except the target (drop returns a new frame).
x = rawdata.drop(columns='Power Generated')
x.shape

(2920, 15)

In [19]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split reproducible across runs.
# NOTE(review): rows from different dates end up interleaved between train and
# test — confirm a random (non-chronological) split is intended for this data.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.2, random_state=42)

In [20]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period)
2651,210,2009,7,29,10,True,0.16,65,27,13.6,1,10.0,68,14.0,29.88
1709,92,2009,4,2,16,True,0.298292,54,27,25.1,2,10.0,64,38.0,29.89
869,353,2008,12,18,16,True,0.511344,47,20,8.7,4,10.0,69,11.0,30.14
2355,173,2009,6,22,10,True,0.148816,67,29,10.7,0,10.0,70,5.0,29.83
2915,243,2009,8,31,10,True,0.166453,63,27,13.9,4,10.0,75,10.0,29.93


In [21]:
# Preview the first values of the training target.
y_train.head()

2651    10750
1709    18575
869      3407
2355    29010
2915     6995
Name: Power Generated, dtype: int64

In [22]:
# Preview the first rows of the test features.
x_test.head()

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period)
2437,183,2009,7,2,16,True,0.255946,64,29,12.3,1,10.0,59,21.0,29.86
2470,187,2009,7,6,19,True,0.459705,62,27,15.8,1,10.0,72,18.0,29.97
2359,173,2009,6,22,22,False,0.662909,67,29,10.7,0,10.0,72,13.0,29.75
789,343,2008,12,8,16,True,0.515571,50,33,4.9,2,10.0,71,0.0,30.07
1642,84,2009,3,25,7,True,0.425876,55,27,16.1,1,10.0,86,5.0,30.16


In [23]:
# Preview the first values of the test target.
y_test.head()

2437    21804
2470     2065
2359        0
789      5654
1642     3621
Name: Power Generated, dtype: int64

In [24]:
# Re-attach the target to the training features by aligning both on the row
# index (inner join), then preview the first ten rows.
train_data = x_train.join(y_train, how='inner')
train_data.head(10)

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
2651,210,2009,7,29,10,True,0.16,65,27,13.6,1,10.0,68,14.0,29.88,10750
1709,92,2009,4,2,16,True,0.298292,54,27,25.1,2,10.0,64,38.0,29.89,18575
869,353,2008,12,18,16,True,0.511344,47,20,8.7,4,10.0,69,11.0,30.14,3407
2355,173,2009,6,22,10,True,0.148816,67,29,10.7,0,10.0,70,5.0,29.83,29010
2915,243,2009,8,31,10,True,0.166453,63,27,13.9,4,10.0,75,10.0,29.93,6995
1561,74,2009,3,15,4,False,0.695955,55,18,8.2,4,6.0,93,5.0,30.04,0
605,320,2008,11,15,16,True,0.504119,69,14,5.0,1,10.0,27,0.0,30.05,6614
2817,231,2009,8,19,4,False,0.610149,65,30,8.6,4,10.0,87,6.0,29.82,0
911,358,2008,12,23,22,False,1.136126,47,13,3.6,4,10.0,77,6.0,29.89,0
1395,53,2009,2,22,10,True,0.124438,56,15,9.4,4,4.0,93,7.0,29.99,133


In [25]:
# (rows, columns) of the combined training frame.
train_data.shape

(2336, 16)

In [26]:
# Re-attach the target to the test features by aligning both on the row index
# (inner join), then preview the first ten rows.
test_data = x_test.join(y_test, how='inner')
test_data.head(10)

Unnamed: 0,Day of Year,Year,Month,Day,First Hour of Period,Is Daylight,Distance to Solar Noon,Average Temperature (Day),Average Wind Direction (Day),Average Wind Speed (Day),Sky Cover,Visibility,Relative Humidity,Average Wind Speed (Period),Average Barometric Pressure (Period),Power Generated
2437,183,2009,7,2,16,True,0.255946,64,29,12.3,1,10.0,59,21.0,29.86,21804
2470,187,2009,7,6,19,True,0.459705,62,27,15.8,1,10.0,72,18.0,29.97,2065
2359,173,2009,6,22,22,False,0.662909,67,29,10.7,0,10.0,72,13.0,29.75,0
789,343,2008,12,8,16,True,0.515571,50,33,4.9,2,10.0,71,0.0,30.07,5654
1642,84,2009,3,25,7,True,0.425876,55,27,16.1,1,10.0,86,5.0,30.16,3621
196,269,2008,9,25,13,True,0.081944,67,29,9.3,2,10.0,47,15.0,29.91,27072
1543,71,2009,3,12,22,False,0.819464,51,28,9.5,1,10.0,86,13.0,30.15,0
1270,37,2009,2,6,19,True,0.72381,53,13,7.5,4,10.0,86,6.0,29.8,0
2473,188,2009,7,7,4,True,0.56314,62,27,12.9,4,10.0,80,9.0,29.93,0
2073,138,2009,5,18,4,True,0.565774,59,29,15.6,2,10.0,90,10.0,29.87,0


In [27]:
# Build both export paths inside EXPORT_DIR in one pass.
train_data_path, test_data_path = (
    os.path.join(EXPORT_DIR, filename)
    for filename in ('train_data.csv', 'test_data.csv')
)

In [28]:
# Show the two export paths, one per line.
print(train_data_path, test_data_path, sep='\n')

data-CLEAN/train_data.csv
data-CLEAN/test_data.csv


In [29]:
# to_csv does not create missing parent directories, so make sure the export
# directory exists first; exist_ok keeps re-runs of this cell harmless.
os.makedirs(EXPORT_DIR, exist_ok=True)

# Write both splits without the index column (index=False), so the original
# row labels are not stored in the exported files.
train_data.to_csv(train_data_path, index=False)
test_data.to_csv(test_data_path, index=False)