In [4]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
%matplotlib inline

In [5]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set.
    # On Kaggle this is always the case.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved; use the default strategy below.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    strategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    strategy = tf.distribute.get_strategy()

# The training cells open `tpustrategy.scope()`. Bind that name here as well so
# they can run top-to-bottom on any hardware instead of raising NameError.
tpustrategy = strategy

print("REPLICAS: ", strategy.num_replicas_in_sync)

In [13]:
# Shell command: fetch the hyw200199/beijing-air-polution dataset with the Kaggle CLI.
# NOTE(review): presumably needs Kaggle API credentials in the environment — confirm.
!kaggle datasets download -d hyw200199/beijing-air-polution

In [5]:
# Ask Kaggle for the GCS path of the dataset and keep it in GCS_DS_PATH.
from kaggle_datasets import KaggleDatasets
GCS_DS_PATH = KaggleDatasets().get_gcs_path('hyw200199/beijing-air-polution') # you can list the bucket with "!gsutil ls $GCS_DS_PATH"
# List what is stored under that path.
!gsutil ls $GCS_DS_PATH

In [6]:
# 3. Load the CSV file into a pandas DataFrame
import pandas as pd  # data analysis

csv_path = "../input/beijing-air-polution/beijing_air_2010.1.1-2014.12.31.csv"
data = pd.read_csv(filepath_or_buffer=csv_path)
# Display the loaded frame as the cell output.
data

In [7]:
# Summary of the frame as reported by pandas.
data.info()

In [8]:
# First rows of the frame.
data.head()

In [9]:
# Last rows of the frame.
data.tail()

In [10]:
# Column labels.
data.columns

In [19]:
# Boolean mask marking the rows where pm2.5 is missing.
data['pm2.5'].isna()

In [11]:
# Number of rows where pm2.5 is missing.
data['pm2.5'].isna().sum()

In [12]:
# Skip the first 24 rows, then fill every remaining gap with the last value seen
# above it. `.ffill()` replaces `fillna(method='ffill')`, which newer pandas
# releases deprecate.
# NOTE(review): the first 24 rows are presumably dropped because they have no
# earlier value to fill from — confirm against the raw file.
data=data.iloc[24:].ffill()

In [50]:
# Call info(); the bare attribute `data.info` only echoes the bound method
# instead of printing the summary.
data.info()

In [51]:
# Remove the 'No' column in place. errors='ignore' keeps the cell re-runnable
# once the column is already gone.
data.drop('No',axis=1,inplace=True,errors='ignore')
# Call info(); the bare attribute `data.info` only echoes the bound method.
data.info()

In [13]:
# Count the rows where pm2.5 is still missing.
data['pm2.5'].isna().sum()

In [14]:
import datetime

# Scratch check: build one datetime from year / month / day / hour components.
datetime.datetime(2022, 1, 1, hour=1)

In [15]:
# Build one timestamp per row from the year / month / day / hour columns.
# pd.to_datetime assembles datetimes from a frame with these column names in a
# single vectorized call, replacing the row-by-row apply(axis=1).
data['tm']=pd.to_datetime(data[['year','month','day','hour']])

In [16]:
# Look at the first rows again.
data.head()

In [17]:
data=data.set_index('tm') # use tm as the index

In [18]:
# Display the re-indexed frame.
data

In [52]:

# Remove the date-part columns in place. errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
data.drop(columns=['year', 'month', 'day', 'hour'], inplace=True, errors='ignore')
# Call info(); the bare attribute `data.info` only echoes the bound method.
data.info()

## 分界线 (Divider): consolidated windowing and training experiments

In [53]:
import numpy as np

# NOTE: `seq_length`, `delay` and `data` are not defined in this cell; they must
# already exist in the kernel when it runs.
window = seq_length + delay

# Cut one window of `window` consecutive rows per start position. Converting the
# frame to an array once avoids building a separate DataFrame for every window.
values = data.values
data_ = np.array([values[i:i + window] for i in range(len(values) - window)])
print("data_shape:"+str(data_.shape))

# Shuffle the windows in place (along the first axis) before splitting.
np.random.shuffle(data_)

# Inputs: the first seq_length rows of every window. Uses seq_length rather than
# a hard-coded 5*24 so the slice follows the configured window length.
x = data_[:, :seq_length, :]
# Target: column 0 of each window's last row.
# NOTE(review): assumes column 0 holds pm2.5 — confirm against data.columns.
y = data_[:, -1, 0]

# First 80% of the shuffled windows train the model, the rest evaluate it.
split_b = int(data_.shape[0]*0.8)

train_x = x[:split_b]
train_y = y[:split_b]
test_x = x[split_b:]
test_y = y[split_b:]

print("train :"+str(train_x.shape))
print("test  :"+str(test_x.shape))

# Standardize with statistics taken over the training windows only (axis 0),
# and apply the same statistics to the test windows.
mean = train_x.mean(axis=0)
std = train_x.std(axis=0)

train_x = (train_x-mean)/std
test_x = (test_x-mean)/std


In [54]:

batch_size = 128
with tpustrategy.scope():
    # Dense baseline: flatten each input window, one hidden layer, one output.
    model = keras.Sequential([
        layers.Flatten(input_shape=train_x.shape[1:]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])
    model.compile(loss='mae', optimizer='adam')
    # Train for 50 epochs, validating on the test split after each one.
    history = model.fit(
        train_x, train_y,
        validation_data=(test_x, test_y),
        epochs=50,
        batch_size=batch_size,
    )



In [55]:
# Training and validation loss per epoch, one curve each.
for key, fmt, label in (('loss', 'y', 'train loss'), ('val_loss', 'b', 'test loss')):
    plt.plot(history.epoch, history.history.get(key), fmt, label=label)
plt.legend()

In [56]:
with tpustrategy.scope():
    # One LSTM layer followed by a single-unit output layer.
    model = keras.Sequential([
        layers.LSTM(32, input_shape=train_x.shape[1:]),
        layers.Dense(1),
    ])
    model.compile(loss='mae', optimizer='adam')
    # Train for 50 epochs, validating on the test split after each one.
    history = model.fit(
        train_x, train_y,
        validation_data=(test_x, test_y),
        epochs=50,
        batch_size=batch_size,
    )



In [57]:
# Training loss (yellow) against validation loss (blue), per epoch.
epochs_axis = history.epoch
curves = history.history
plt.plot(epochs_axis, curves.get('loss'), 'y', label='train loss')
plt.plot(epochs_axis, curves.get('val_loss'), 'b', label='test loss')
plt.legend()

In [61]:
with tpustrategy.scope():
    # Three stacked LSTM layers. The first two return the full sequence so the
    # next LSTM receives one vector per time step.
    model2=keras.Sequential()
    model2.add(layers.LSTM(32,input_shape=(train_x.shape[1:]),return_sequences=True))
    model2.add(layers.LSTM(32,return_sequences=True))
    model2.add(layers.LSTM(32))
    model2.add(layers.Dense(1))
    model2.compile(optimizer='adam',
           loss='mae')

    # Halve the learning rate when val_loss has not improved for 3 epochs,
    # never going below min_lr.
    lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', patience=3, factor=0.5, min_lr=0.00001)

    # Train model2 itself. The previous call was model.fit(...), which trained
    # the earlier `model` and left model2 untrained.
    history2=model2.fit(train_x,train_y,
              batch_size=batch_size,
              epochs=50,
              validation_data=(test_x,test_y),callbacks=[lr]
                     )



In [63]:
# Training and validation loss recorded in history2, one curve each.
for key, fmt, label in (('loss', 'y', 'train loss'), ('val_loss', 'b', 'test loss')):
    plt.plot(history2.epoch, history2.history.get(key), fmt, label=label)
plt.legend()

In [58]:

batch_size = 128
with tpustrategy.scope():
    # Dense baseline again, this time trained for 200 epochs.
    model = keras.Sequential([
        layers.Flatten(input_shape=train_x.shape[1:]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])
    model.compile(loss='mae', optimizer='adam')
    history = model.fit(
        train_x, train_y,
        validation_data=(test_x, test_y),
        epochs=200,
        batch_size=batch_size,
    )



In [60]:
# Training loss (red) against validation loss (blue), per epoch.
for key, fmt, label in (('loss', 'r', 'train loss'), ('val_loss', 'b', 'test loss')):
    plt.plot(history.epoch, history.history.get(key), fmt, label=label)
plt.legend()

In [65]:
with tpustrategy.scope():
    # Three stacked LSTM layers; only the last one collapses the sequence.
    model = keras.Sequential([
        layers.LSTM(32, input_shape=train_x.shape[1:], return_sequences=True),
        layers.LSTM(32, return_sequences=True),
        layers.LSTM(32),
        layers.Dense(1),
    ])
    model.compile(loss='mae', optimizer='adam')

    # Learning-rate schedule driven by val_loss (patience 3, factor 0.5).
    lr = keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=0.00001)

    history = model.fit(
        train_x, train_y,
        validation_data=(test_x, test_y),
        callbacks=[lr],
        epochs=200,
        batch_size=batch_size,
    )



In [66]:
# Training loss (red) against validation loss (blue), per epoch.
epochs_axis = history.epoch
curves = history.history
plt.plot(epochs_axis, curves.get('loss'), 'r', label='train loss')
plt.plot(epochs_axis, curves.get('val_loss'), 'b', label='test loss')
plt.legend()

In [67]:
# Write the current `model` to the file pm2.5_v2.h5.
model.save('pm2.5_v2.h5')

In [68]:
# Evaluate the model on the test split without progress output.
model.evaluate(test_x,test_y,verbose=0)

In [69]:
# Predict on the test split and compare the input and prediction shapes.
pre_test=model.predict(test_x)
test_x.shape , pre_test.shape

## 分界线 (Divider): step-by-step preprocessing and training cells

In [19]:
# cbwd is stored as Python objects with only a few distinct values, so it has to
# be converted to numbers.
data.cbwd.unique()

In [20]:
# One-hot encode cbwd and append the indicator columns to the frame.
# dtype=float keeps the indicators numeric: recent pandas versions default to
# bool, and mixing bool with numeric columns makes `data.values` an object array.
data=data.join(pd.get_dummies(data.cbwd, dtype=float))

In [21]:
# Display the frame after the join.
data

In [22]:
# Remove the original cbwd column, then display the frame.
del data['cbwd']
data

In [23]:
# Look at how pm2.5 changes over the last 1000 rows
data['pm2.5'][-1000:].plot()


In [24]:
# Look at how the temperature changes over the last 1000 rows
data['TEMP'][-1000:].plot()

In [25]:
# Use 5 days (24 rows per day) as the length of data needed for a prediction,
# and 1 day as the span to predict ahead.
seq_length, delay = 5 * 24, 1 * 24


In [26]:
# One DataFrame slice of seq_length + delay consecutive rows per start position.
data_ = [
    data.iloc[start:start + seq_length + delay]
    for start in range(len(data) - seq_length - delay)
]

In [27]:
# Number of rows in the first window.
data_[0].shape[0]

In [33]:
import numpy as np

In [34]:

# Replace the list of DataFrame slices with one array built from their values.
data_=np.array([df.values for df in data_])

In [35]:
# Shape of the stacked array.
data_.shape

In [36]:
# Shuffle the windows in place (along the first axis) before splitting.
np.random.shuffle(data_)
# Inputs: the first seq_length rows of every window. Uses seq_length rather than
# a hard-coded 5*24 so the slice follows the configured window length.
x=data_[ : ,:seq_length ,: ]
# Target: column 0 of each window's last row.
# NOTE(review): assumes column 0 holds pm2.5 — confirm against data.columns.
y=data_[ : ,-1 ,0]
# First 80% of the shuffled windows train the model, the rest evaluate it.
split_b=int(data_.shape[0]*0.8)

train_x=x[:split_b]
train_y=y[:split_b]
test_x=x[split_b:]
test_y=y[split_b:]

train_x.shape , train_y.shape

In [37]:
# Shapes of the training inputs and targets.
train_x.shape , train_y.shape

In [38]:
# Shapes of the test inputs and targets.
test_x.shape , test_y.shape

In [39]:
# Statistics are taken over the training windows only (axis 0).
mean = train_x.mean(axis=0)
std = train_x.std(axis=0)

# Apply the same training statistics to both splits.
train_x, test_x = ((split - mean) / std for split in (train_x, test_x))


In [40]:
# Detect hardware, return appropriate distribution strategy
try:
    # TPU detection. No parameters necessary if TPU_NAME environment variable is set.
    # On Kaggle this is always the case.
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
    print('Running on TPU ', tpu.master())
except ValueError:
    # No TPU could be resolved; use the default strategy below.
    tpu = None

if tpu:
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpustrategy = tf.distribute.experimental.TPUStrategy(tpu)
else:
    # Default distribution strategy in TensorFlow. Works on CPU and single GPU.
    cpustrategy = tf.distribute.get_strategy()
    # The training cells always open `tpustrategy.scope()`, so bind that name on
    # this path too; otherwise they raise NameError when no TPU is present.
    tpustrategy = cpustrategy

# Report the strategy selected by this cell rather than a `strategy` variable
# that this cell never assigns.
print("REPLICAS: ", tpustrategy.num_replicas_in_sync)

In [41]:

batch_size = 128
with tpustrategy.scope():
    # Dense baseline trained on MSE, with MAE tracked as a metric.
    model = keras.Sequential([
        layers.Flatten(input_shape=train_x.shape[1:]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])
    model.compile(loss='mse', metrics=['mae'], optimizer='adam')
    history = model.fit(
        train_x, train_y,
        validation_data=(test_x, test_y),
        epochs=100,
        batch_size=batch_size,
    )



In [42]:
# Validation loss per epoch.
plt.plot(history.epoch,history.history['val_loss'])

In [43]:
# Validation MAE per epoch.
plt.plot(history.epoch,history.history['val_mae'])

In [None]:
# Both validation curves drawn in the same cell.
plt.plot(history.epoch,history.history['val_loss'])
plt.plot(history.epoch,history.history['val_mae'])

In [44]:
with tpustrategy.scope():
    # Build and compile the dense baseline inside the strategy scope.
    model = keras.Sequential([
        layers.Flatten(input_shape=train_x.shape[1:]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])
    model.compile(loss='mse', metrics=['mae'], optimizer='adam')
# The fit call itself sits outside the `with` block.
history = model.fit(
    train_x, train_y,
    validation_data=(test_x, test_y),
    epochs=100,
    batch_size=batch_size,
)

In [47]:
with tpustrategy.scope():
    # Build and compile the dense baseline inside the strategy scope.
    model = keras.Sequential([
        layers.Flatten(input_shape=train_x.shape[1:]),
        layers.Dense(32, activation='relu'),
        layers.Dense(1),
    ])
    model.compile(loss='mse', metrics=['mae'], optimizer='adam')
# Short 10-epoch run; the fit call sits outside the `with` block.
history = model.fit(
    train_x, train_y,
    validation_data=(test_x, test_y),
    epochs=10,
    batch_size=batch_size,
)

In [52]:
# Shell command: upgrade the standalone `keras` package.
# NOTE(review): no version is pinned, so the installed release can differ
# between runs — consider pinning one.
!pip install --upgrade keras


In [44]:
import tensorflow.keras.callbacks
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [45]:
with tpustrategy.scope():
    # Four stacked LSTM layers; all but the last return the full sequence so the
    # next LSTM receives one vector per time step.
    model=keras.Sequential()
    # Take the input shape from the data instead of hard-coding (120,16), so the
    # model still matches train_x when the window length or column set changes.
    model.add(layers.LSTM(32,input_shape=train_x.shape[1:],return_sequences=True))
    model.add(layers.LSTM(32,return_sequences=True))
    model.add(layers.LSTM(32,return_sequences=True))
    model.add(layers.LSTM(32))
    model.add(layers.Dense(1))
    # Lower the learning rate during training: when val_loss has not changed for
    # 10 epochs, multiply the learning rate by 0.4, but never go below min_lr.
    lr_reduce=ReduceLROnPlateau('val_loss',patience=10,factor=0.4,min_lr=0.0001)

    model.compile(optimizer='adam',
           loss='mse',
           metrics=['mae'])

    history=model.fit(train_x,train_y,
              batch_size=batch_size,
              epochs=10,
              callbacks=[lr_reduce],
              validation_data=(test_x,test_y))


In [46]:
# Names of the series recorded during training.
history.history.keys()

In [47]:
# Validation loss per epoch.
plt.plot(history.epoch,history.history['val_loss'])

In [48]:

# Validation MAE per epoch.
plt.plot(history.epoch,history.history['val_mae'])

In [49]:
# Learning rate per epoch. Older Keras releases record it under 'lr', Keras 3
# under 'learning_rate'; use whichever key this run produced.
lr_key = 'lr' if 'lr' in history.history else 'learning_rate'
plt.plot(history.epoch,history.history[lr_key])


In [69]:
# Look up the GCS path for the dataset by its short name and print it.
from kaggle_datasets import KaggleDatasets
GCS_PATH = KaggleDatasets().get_gcs_path('beijing-air-polution')
print(GCS_PATH)
# Put hyphens between the words of the path.


In [70]:
# Step 1: Get the Google Cloud credential through the Kaggle secrets client
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
user_credential = user_secrets.get_gcloud_credential()

# Step 2: Set the credentials
user_secrets.set_tensorflow_credential(user_credential)

# Step 3: Use a familiar call to get the GCS path of the dataset
from kaggle_datasets import KaggleDatasets
# NOTE(review): no dataset name is passed to get_gcs_path() here — confirm it
# resolves to the intended dataset.
GCS_DS_PATH = KaggleDatasets().get_gcs_path()

In [71]:
with tpustrategy.scope():
    # Five stacked LSTM layers; all but the last return the full sequence so the
    # next LSTM receives one vector per time step.
    model=keras.Sequential()
    # Take the input shape from the data instead of hard-coding (120,16), so the
    # model still matches train_x when the window length or column set changes.
    model.add(layers.LSTM(32,input_shape=train_x.shape[1:],return_sequences=True))
    model.add(layers.LSTM(32,return_sequences=True))
    model.add(layers.LSTM(32,return_sequences=True))
    model.add(layers.LSTM(32,return_sequences=True))
    model.add(layers.LSTM(32))
    model.add(layers.Dense(1))
    # Lower the learning rate during training: when val_loss has not changed for
    # 17 epochs, multiply the learning rate by 0.4, but never go below min_lr.
    lr_reduce=ReduceLROnPlateau('val_loss',patience=17,factor=0.4,min_lr=0.0001)

    model.compile(optimizer='adam',
           loss='mse',
           metrics=['mae'])

    history=model.fit(train_x,train_y,
              batch_size=batch_size,
              epochs=200,
              callbacks=[lr_reduce],
              validation_data=(test_x,test_y))


In [72]:
# Validation MAE per epoch.
plt.plot(history.epoch,history.history['val_mae'])

In [73]:
# Training MAE per epoch.
plt.plot(history.epoch,history.history['mae'])

In [75]:
# Learning rate per epoch. Older Keras releases record it under 'lr', Keras 3
# under 'learning_rate'; use whichever key this run produced.
lr_key = 'lr' if 'lr' in history.history else 'learning_rate'
plt.plot(history.epoch,history.history[lr_key])

In [2]:
# Training loss per epoch.
plt.plot(history.epoch,history.history['loss'])

In [3]:
# Names of the series recorded during training.
history.history.keys()