In [37]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

In [38]:
# Load the rainfall table (comma-separated, which is read_csv's default).
data = pd.read_csv("../data/kar.csv")
# Impute missing values with each column's mean. numeric_only=True restricts
# the mean to numeric columns: the frame also holds the text column
# SUBDIVISION, and averaging it raises TypeError on pandas >= 2.0.
data = data.fillna(data.mean(numeric_only=True))
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 345 entries, 0 to 344
Data columns (total 19 columns):
SUBDIVISION    345 non-null object
YEAR           345 non-null int64
JAN            345 non-null float64
FEB            345 non-null float64
MAR            345 non-null float64
APR            345 non-null float64
MAY            345 non-null float64
JUN            345 non-null float64
JUL            345 non-null float64
AUG            345 non-null float64
SEP            345 non-null float64
OCT            345 non-null float64
NOV            345 non-null float64
DEC            345 non-null float64
ANNUAL         345 non-null float64
Jan-Feb        345 non-null float64
Mar-May        345 non-null float64
Jun-Sep        345 non-null float64
Oct-Dec        345 non-null float64
dtypes: float64(17), int64(1), object(1)
memory usage: 51.3+ KB


In [39]:
# Preview the first rows. A bare last expression lets the notebook render the
# frame as a rich table instead of wrapped plain text.
data.head()

         SUBDIVISION  YEAR  JAN  FEB   MAR   APR    MAY     JUN     JUL  \
0  COASTAL KARNATAKA  1901  1.8  0.6  10.7  52.4   81.6   960.9   991.2   
1  COASTAL KARNATAKA  1902  3.2  0.3   4.9  10.2   54.6   698.4  1401.6   
2  COASTAL KARNATAKA  1903  0.7  0.0   0.0   4.1  202.8   536.5  1405.5   
3  COASTAL KARNATAKA  1904  2.4  0.0   4.8  23.7   93.2  1108.2  1070.0   
4  COASTAL KARNATAKA  1905  0.0  0.2   0.0   6.4   83.1   767.3   777.3   

     AUG    SEP    OCT    NOV    DEC  ANNUAL  Jan-Feb  Mar-May  Jun-Sep  \
0  606.4  108.0  120.5  104.9   17.8  3056.9      2.4    144.7   2666.6   
1  454.2  708.4  180.4   50.8  132.2  3699.2      3.5     69.7   3262.6   
2  593.8  304.4  185.0   79.3    5.3  3317.4      0.7    206.9   2840.1   
3  465.6  245.3  127.2    0.7    0.0  3141.1      2.4    121.7   2889.1   
4  586.9  172.9  222.2   36.1    0.0  2652.3      0.2     89.4   2304.3   

   Oct-Dec  
0    243.3  
1    363.5  
2    269.6  
3    127.9  
4    258.4  


In [40]:
# Summary statistics per column. A bare last expression lets the notebook
# render the result as a rich table instead of wrapped plain text.
data.describe()

              YEAR         JAN         FEB         MAR         APR  \
count   345.000000  345.000000  345.000000  345.000000  345.000000   
mean   1958.000000    2.628488    2.951304    7.655362   32.499420   
std      33.244601    5.145426    6.140374   14.510653   22.164135   
min    1901.000000    0.000000    0.000000    0.000000    0.000000   
25%    1929.000000    0.000000    0.000000    0.600000   15.700000   
50%    1958.000000    0.300000    0.300000    3.500000   27.100000   
75%    1987.000000    2.600000    2.700000    8.500000   43.800000   
max    2015.000000   28.500000   44.300000  161.400000  127.700000   

              MAY          JUN          JUL          AUG         SEP  \
count  345.000000   345.000000   345.000000   345.000000  345.000000   
mean    87.307826   361.245507   498.973043   335.772174  193.302319   
std     83.063360   359.645261   480.124823   305.475266  122.724962   
min      3.500000    38.200000    37.000000    28.000000   27.700000   
25%     4

In [41]:
def plot_graphs(groundtruth, prediction, title,
                labels=('APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC')):
    """Show a grouped bar chart comparing ground-truth and predicted values.

    Parameters
    ----------
    groundtruth : array-like
        Observed values, one per group. Flattened to 1-D before plotting.
    prediction : array-like
        Predicted values, one per group. Flattened to 1-D before plotting,
        so an ``(n, 1)`` column vector is accepted.
    title : str
        Text used as the figure title.
    labels : sequence of str, optional
        One x-tick label per group. Defaults to the nine labels APR..DEC.

    Raises
    ------
    ValueError
        If ``groundtruth``, ``prediction`` and ``labels`` do not all contain
        the same number of entries.
    """
    groundtruth = np.asarray(groundtruth, dtype=float).ravel()
    prediction = np.asarray(prediction, dtype=float).ravel()
    n_groups = groundtruth.size
    if prediction.size != n_groups or len(labels) != n_groups:
        raise ValueError(
            "groundtruth (%d), prediction (%d) and labels (%d) must have the same length"
            % (n_groups, prediction.size, len(labels))
        )

    ind = np.arange(n_groups)  # the x locations for the groups
    width = 0.27               # the width of the bars

    fig = plt.figure()
    fig.suptitle(title, fontsize=12)
    ax = fig.add_subplot(111)
    rects1 = ax.bar(ind, groundtruth, width, color='r', label='Ground truth')
    rects2 = ax.bar(ind + width, prediction, width, color='g', label='Prediction')

    ax.set_ylabel("Amount of rainfall")
    # Bars are centre-aligned at ind and ind + width, so the middle of each
    # pair is ind + width / 2.
    ax.set_xticks(ind + width / 2)
    ax.set_xticklabels(labels)
    ax.legend()

    # Write the integer-truncated height just above every bar of both series.
    for rects in (rects1, rects2):
        for rect in rects:
            h = rect.get_height()
            ax.text(rect.get_x() + rect.get_width() / 2., 1.05 * h, '%d' % int(h),
                    ha='center', va='bottom')

    plt.show()

In [42]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

# Monthly columns of `data` as a plain 2-D array (one row per record).
month_columns = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
                 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
division_data = np.asarray(data[month_columns])

# Sliding windows: three consecutive months are the features and the month
# right after them is the target. Windows are stacked offset by offset
# (every row for the first offset, then every row for the next, ...).
window_starts = range(division_data.shape[1] - 3)
X = np.concatenate([division_data[:, s:s + 3] for s in window_starts], axis=0)
y = np.concatenate([division_data[:, s + 3] for s in window_starts], axis=0)

# Hold out 10% of the windows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42
)

In [43]:
# Build the same three-months -> next-month windows for the year 2015 only.
month_cols = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL',
              'AUG', 'SEP', 'OCT', 'NOV', 'DEC']

# Rows for 2015, keeping the subdivision label next to the monthly values.
temp = data.loc[data['YEAR'] == 2015, ['SUBDIVISION'] + month_cols]

# NOTE(review): this rebinds `data` from the full DataFrame to a NumPy array,
# so this cell cannot be re-run without first re-running the load cell.
# NOTE(review): the filter is an exact match on 'KARNATAKA'; the preview of
# the frame shows 'COASTAL KARNATAKA' - confirm the intended subdivision name.
data = np.asarray(temp.loc[temp['SUBDIVISION'] == 'KARNATAKA', month_cols])

# Stack the windows offset by offset, features and targets in the same order.
year_starts = range(data.shape[1] - 3)
X_year = np.concatenate([data[:, k:k + 3] for k in year_starts], axis=0)
y_year = np.concatenate([data[:, k + 3] for k in year_starts], axis=0)

In [44]:
from sklearn.svm import SVR

# Support-vector regression: fit on the training windows, then report the
# mean absolute error on the held-out windows.
clf = SVR(gamma='auto', C=0.1, epsilon=0.2).fit(X_train, y_train)
y_pred = clf.predict(X_test)
svr_mae = mean_absolute_error(y_test, y_pred)
print(svr_mae)

136.92362419434863


In [45]:
from keras.models import Model
from keras.layers import Dense, Input, Conv1D, Flatten

# Neural network: two 1-D convolutions over the 3-step input, then a stack of
# dense layers ending in a single linear output.
inputs = Input(shape=(3, 1))

x = inputs
for n_filters in (64, 128):
    x = Conv1D(n_filters, 2, padding='same', activation='elu')(x)

x = Flatten()(x)
for n_units in (128, 64, 32):
    x = Dense(n_units, activation='elu')(x)
x = Dense(1, activation='linear')(x)

model = Model(inputs=[inputs], outputs=[x])
# Trained on squared error; mean absolute error is tracked as an extra metric.
model.compile(loss='mean_squared_error', optimizer='adamax', metrics=['mae'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 3, 1)              0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 3, 64)             192       
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 3, 128)            16512     
_________________________________________________________________
flatten_2 (Flatten)          (None, 384)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 128)               49280     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 32)                2080      
__________

In [46]:
# Append a trailing axis so each sample has the (3, 1) shape the model's
# Input layer declares.
X_train_seq = np.expand_dims(X_train, axis=2)
X_test_seq = np.expand_dims(X_test, axis=2)

model.fit(
    x=X_train_seq,
    y=y_train,
    batch_size=64,
    epochs=10,
    verbose=1,
    validation_split=0.1,
    shuffle=True,
)

# Note: this rebinds `y_pred`, replacing any predictions stored under that name.
y_pred = model.predict(X_test_seq)
print(mean_absolute_error(y_test, y_pred))

Train on 2514 samples, validate on 280 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
94.21667860328576
