In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import OneHotEncoder,PowerTransformer, StandardScaler, MinMaxScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge,Lasso
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split,RandomizedSearchCV, GridSearchCV
import pickle

In [18]:
df = pd.read_csv('forest_outliers_removed1')
df.drop(columns=['Unnamed: 0'], inplace=True)
df['classes']=df['classes'].apply(lambda x :1 if x == 'fire' else 0)

In [19]:
df.head()

Unnamed: 0,day,month,year,Temperature,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,classes
0,1,6,2012,29,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0
1,2,6,2012,29,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0
2,3,6,2012,26,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0
3,4,6,2012,25,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0
4,5,6,2012,27,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0


In [20]:
X = df.drop(columns=['Temperature','year'],axis=1)
y = df['Temperature']

In [21]:
X.head()

Unnamed: 0,day,month,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,classes
0,1,6,57,18,0.0,65.7,3.4,7.6,1.3,3.4,0.5,0
1,2,6,61,13,1.3,64.4,4.1,7.6,1.0,3.9,0.4,0
2,3,6,82,22,13.1,47.1,2.5,7.1,0.3,2.7,0.1,0
3,4,6,89,13,2.5,28.6,1.3,6.9,0.0,1.7,0.0,0
4,5,6,77,16,0.0,64.8,3.0,14.2,1.2,3.9,0.5,0


In [22]:
y.head()

0    29
1    29
2    26
3    25
4    27
Name: Temperature, dtype: int64

In [23]:
scale = ColumnTransformer(transformers=[
    ('scale', PowerTransformer(),slice(0,11))
],remainder='passthrough')
tnf = ColumnTransformer(transformers=[
    ('ohe', OneHotEncoder(sparse=False, drop= 'first'),[11])
],remainder='passthrough')

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=256)
pipe = Pipeline([
        ('scale',scale),
        ('model',RandomForestRegressor(random_state=96))
    ])
pipe.fit(X_train.values, y_train.values)
y_pred = pipe.predict(X_test.values)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6891599593234852
MAE 1.6516374269005845


In [25]:
pipe = Pipeline([
    ('scale',scale),
    ('model',RandomForestRegressor(random_state=96,n_jobs=-1))
])
# Number of trees in random forest
n_estimators = [100, 200, 300, 1000]
# Maximum number of levels in tree
max_depth = [80, 90, 100, 110]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [8, 10, 12],
# Minimum number of samples required at each leaf node
min_samples_leaf =[3, 4, 5]
# Method of selecting samples for training each tree
criterion =['mse', 'mae']
max_features=[2,3]
# Create the random grid
random_grid = {'model__n_estimators': n_estimators,
               'model__max_depth': max_depth,
               'model__min_samples_split': min_samples_split,
               'model__min_samples_leaf': min_samples_leaf,
               'model__max_features': max_features 
            }

In [26]:
gs = GridSearchCV(estimator=pipe, param_grid=random_grid, n_jobs=-1)

In [27]:
### Linear Regression

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5767218435130868
MAE 1.8027155958860517


In [69]:
X_train.head()

Unnamed: 0,day,month,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,classes
54,25,7,65,18,0.0,84.3,12.5,88.7,4.8,18.5,7.3,1
41,12,7,75,13,0.1,75.1,7.9,27.7,1.5,9.2,0.9,0
238,25,9,70,15,0.0,79.9,13.8,36.1,2.4,14.1,3.0,0
199,17,8,24,9,0.0,96.0,30.3,76.4,15.7,30.4,24.0,1
206,24,8,38,15,0.0,92.1,51.3,147.7,12.2,54.9,26.9,1


In [71]:
y_train.head()

225    31
9      28
208    33
171    36
112    31
Name: Temperature, dtype: int64

In [29]:
### Ridge Regression

In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=198)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',LinearRegression(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5767218435130868
MAE 1.8027155958860517


In [31]:
lambdas=np.linspace(1,100,100)
params={'alpha':lambdas}
grid_search=GridSearchCV(pipe,param_grid=params,cv=10,)

In [32]:
### Lasso Regression

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=728)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',Lasso(fit_intercept=True))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5133180180558128
MAE 1.9174982080033702


In [34]:
### SVR

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=480)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',SVR(C=1,kernel='linear',gamma='auto',max_iter=5e4))
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.6006623453002449
MAE 1.786610542310271


In [36]:
rf_params = {
    'C': [1,10, 100],
    "kernel":['poly','rbf','sigmoid'],
    "epsilon":[0.01,0.1,1]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [37]:
### KNeighborsRegressor

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=458)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',KNeighborsRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.556369026239775
MAE 1.8923976608187136


In [39]:
rf_params = {
    'n_neighbors': [2, 3, 5, 7, 10]
}
grid_search=GridSearchCV(pipe,param_grid=rf_params,cv=10,)

In [40]:
## DecisionTreeRegressor

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.3, random_state=657)
pipe = Pipeline([
        ('tnf',tnf),
        ('scale',scale),
        ('model',DecisionTreeRegressor())
    ])
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)
print('R2 SCORE', r2_score(y_test,y_pred))
print('MAE', mean_absolute_error(y_test,y_pred))

R2 SCORE 0.5864063376332025
MAE 1.8245614035087718


In [42]:
results = pd.DataFrame({
    'Model': ['Linear Regression','Lasso Regression', 'Ridge Regression','SVR' ,'Decision Tree','Random Forest'],
    'Score': [0.63,0.51,0.63,0.61,0.56,0.70]})

result_df = results.sort_values(by='Score', ascending=False)
result_df = result_df.set_index('Score')
result_df.head(9)

Unnamed: 0_level_0,Model
Score,Unnamed: 1_level_1
0.7,Random Forest
0.63,Linear Regression
0.63,Ridge Regression
0.61,SVR
0.56,Decision Tree
0.51,Lasso Regression


In [43]:
### Creating pickle file
pickle.dump(pipe,open('models/pipe_reg1.pkl','wb'))

In [44]:
### Preparing data to create batch prediction
import json
result = X_test.to_json(orient="records")
parsed = json.loads(result)

In [45]:
# cREATING DATABASE CONNCETION
import pymongo
client = pymongo.MongoClient("mongodb+srv://mongodb:mongodb@cluster0.4frquud.mongodb.net/?retryWrites=true&w=majority")

In [46]:
db = client.get_database('batch_data')
print(db)

Database(MongoClient(host=['ac-ate4ei4-shard-00-02.4frquud.mongodb.net:27017', 'ac-ate4ei4-shard-00-00.4frquud.mongodb.net:27017', 'ac-ate4ei4-shard-00-01.4frquud.mongodb.net:27017'], document_class=dict, tz_aware=False, connect=True, retrywrites=True, w='majority', authsource='admin', replicaset='atlas-z9dmvo-shard-0', tls=True), 'batch_data')


In [47]:
coll = db['regression_batch_data']

In [48]:
db.list_collection_names()

['classification_batch', 'regression_batch_data']

In [49]:
coll.insert_many(parsed)

<pymongo.results.InsertManyResult at 0x7f9b18b044a8>

In [50]:
db.list_collection_names()

['classification_batch', 'regression_batch_data']

In [51]:
# Testing created Pipe
pickle_model = pickle.load(open('models/pipe_reg1.pkl','rb'))

In [52]:
test_input = np.array([1, 6, 26, 57, 18.0, 0.0, 65.7, 3.4, 7.6, 1.3, 3.4,11 ]).reshape(1,12)
test_input.shape

(1, 12)

In [63]:
X_test[0:1]

Unnamed: 0,day,month,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,classes
54,25,7,65,18,0.0,84.3,12.5,88.7,4.8,18.5,7.3,1


In [59]:
y_test.shape

(171,)

In [100]:
X_test.shape

(171, 12)

In [74]:
type(X_test)

pandas.core.frame.DataFrame

In [105]:
dict_test ={'day':1, 'month':6, 'Temperature':26,'RH':57, 'Ws':18.0, 'Rain':0.00, 'FFMC':65.7000,
       'DMC':3.4, 'DC':7.6, 'ISI':1.3, 'BUI':3.4, 'FWI':1}

list(dict_test.values())

input = np.array(list(dict_test.values())).reshape(1,12)
input

array([[ 1. ,  6. , 26. , 57. , 18. ,  0. , 65.7,  3.4,  7.6,  1.3,  3.4,
         1. ]])

In [106]:
input.shape

(1, 12)

In [111]:
input = pd.DataFrame(input)
input.columns=['day','month','RH','Ws','Rain','FFMC','DMC','DC','ISI','BUI','FWI','classes']
input

Unnamed: 0,day,month,RH,Ws,Rain,FFMC,DMC,DC,ISI,BUI,FWI,classes
0,1.0,6.0,26.0,57.0,18.0,0.0,65.7,3.4,7.6,1.3,3.4,1.0


In [112]:
pipe.predict(input)

array([26.])

l = ['2','3']
