In [1]:
import pandas as pd
import numpy as np

In [2]:
data = pd.read_csv("austin_weather.csv")

In [3]:
data = data.drop(['Events', 'Date', 'SeaLevelPressureHighInches', 'SeaLevelPressureLowInches'], axis = 1)

In [6]:
data.head()

Unnamed: 0,TempHighF,TempAvgF,TempLowF,DewPointHighF,DewPointAvgF,DewPointLowF,HumidityHighPercent,HumidityAvgPercent,HumidityLowPercent,SeaLevelPressureAvgInches,VisibilityHighMiles,VisibilityAvgMiles,VisibilityLowMiles,WindHighMPH,WindAvgMPH,WindGustMPH,PrecipitationSumInches
0,74,60,45,67,49,43,93,75,57,29.68,10,7,2,20,4,31,0.46
1,56,48,39,43,36,28,93,68,43,30.13,10,10,5,16,6,25,0.0
2,58,45,32,31,27,23,76,52,27,30.49,10,10,10,8,3,12,0.0
3,61,46,31,36,28,21,89,56,22,30.45,10,10,7,12,4,20,0.0
4,58,50,41,44,40,36,86,71,56,30.33,10,10,7,10,2,16,0.0


In [5]:
# some values have 'T' which denotes trace rainfall, we need to replace all occurrences of T with 0
# so that we can use the data in our model

data = data.replace('T', 0.0)

In [7]:
# data also contain '-'  where values are nill, we also need to remove them
data = data.replace('-', 0.0)

In [8]:
# save the data in a csv file
data.to_csv('austin_final.csv')

In [9]:
# reading the cleaned data again
data = pd.read_csv("austin_final.csv")

In [10]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,TempHighF,TempAvgF,TempLowF,DewPointHighF,DewPointAvgF,DewPointLowF,HumidityHighPercent,HumidityAvgPercent,HumidityLowPercent,SeaLevelPressureAvgInches,VisibilityHighMiles,VisibilityAvgMiles,VisibilityLowMiles,WindHighMPH,WindAvgMPH,WindGustMPH,PrecipitationSumInches
0,0,74,60,45,67.0,49.0,43.0,93.0,75.0,57.0,29.68,10.0,7.0,2.0,20.0,4.0,31.0,0.46
1,1,56,48,39,43.0,36.0,28.0,93.0,68.0,43.0,30.13,10.0,10.0,5.0,16.0,6.0,25.0,0.0
2,2,58,45,32,31.0,27.0,23.0,76.0,52.0,27.0,30.49,10.0,10.0,10.0,8.0,3.0,12.0,0.0


In [11]:
# here rain has been represented by the term precipitation, it will serve as the label column
X = data.drop(['PrecipitationSumInches'], axis = 1)
y= data['PrecipitationSumInches']

In [12]:
y= y.values.reshape(-1, 1)

In [13]:
# consider a random day in the dataset we shall plot a graph and observe this day

day_index = 798
days = [i for i in range(y.size)]

In [15]:
from sklearn.linear_model import LinearRegression

In [16]:
model3 = LinearRegression()

In [17]:
model3.fit(X, y)

LinearRegression()

In [18]:
y_pred=model3.predict(X)

In [19]:
y_pred

array([[ 0.87119778],
       [ 0.01445548],
       [-0.25299718],
       ...,
       [-0.01400966],
       [ 0.05332136],
       [ 0.00446742]])

In [20]:
import pickle

In [22]:
# Save the trained model as a pickle string.
saved_model = pickle.dumps(model3)
 
# Load the pickled model
model3_from_pickle = pickle.loads(saved_model)
 
# Use the loaded pickled model to make predictions
model3_from_pickle.predict(X)

array([[ 0.87119778],
       [ 0.01445548],
       [-0.25299718],
       ...,
       [-0.01400966],
       [ 0.05332136],
       [ 0.00446742]])

In [24]:
# or we can use joblib to save our model

import joblib
 
# Save the model as a pickle in a file
joblib.dump(model3, 'model3.pkl')
 
# Load the model from the file
rainpred_from_joblib = joblib.load('model3.pkl')
 
# Use the loaded model to make predictions
rainpred_from_joblib.predict(X)

array([[ 0.87119778],
       [ 0.01445548],
       [-0.25299718],
       ...,
       [-0.01400966],
       [ 0.05332136],
       [ 0.00446742]])