In [50]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import preprocessing

In [51]:
from sklearn.linear_model import LinearRegression, Ridge, SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn import svm

In [52]:
data  = pd.read_csv('problem_2_train.csv',parse_dates=['memcreateddate', 'mobilepurchasedate','claimintdate','pickupStartdate'])
data.head()

Unnamed: 0,srno,mobmake,hubid,hubpincode,pickuppartnerid,deliverypartnerid,insuranceCompanyId,custpincodecategory,claimtype,custpincode,memcreateddate,mobilepurchasedate,claimintdate,servicecntrid,pickupStartdate,tat_in_days
0,2851218,AN,7,500004,233,233,131,A,ADLD,500001,2017-04-27,2016-12-02,2018-02-08,4403,2018-02-13,5
1,2838330,AJ,10,226010,233,233,131,A,ADLD,226022,2017-03-06,2017-03-05,2018-02-04,4444,2018-02-08,12
2,2835781,AJ,2,110018,233,233,228,A,ADLD,110018,2018-02-01,2018-01-27,2018-02-03,4388,2018-02-12,9
3,2838589,AM,2,110018,233,233,228,B,ADLD,124001,2018-01-17,2018-01-31,2018-02-04,4092,2018-02-14,9
4,2855214,AN,21,110015,233,233,131,A,ADLD,201301,2018-01-23,2018-01-20,2018-02-09,4580,2018-02-14,8


### Making new features as diffrerence of dates

In [53]:
data['mem-purchase'] = (data.mobilepurchasedate - data.memcreateddate).dt.days
data['pickup-claim'] = (data.pickupStartdate - data.claimintdate).dt.days

### Making maps for categorical data....
### Alternative way for Label encoding

In [54]:
claim = list(data.claimtype.unique())
mob = list(data.mobmake.unique())
pin_cat  = list(data.custpincodecategory.unique())

claim_map = {k:v for v,k in enumerate(claim)}
mob_map = {k:v for v,k in enumerate(mob)}
pin_map = {k:v for v,k in enumerate(pin_cat)}

In [55]:

data.claimtype = data.claimtype.map(claim_map)
data.mobmake = data.mobmake.map(mob_map)
data.custpincodecategory = data.custpincodecategory.map(pin_map)

data.head()

{'A': 0, 'B': 1, 'C': 2, '0': 3, 'D': 4}


Unnamed: 0,srno,mobmake,hubid,hubpincode,pickuppartnerid,deliverypartnerid,insuranceCompanyId,custpincodecategory,claimtype,custpincode,memcreateddate,mobilepurchasedate,claimintdate,servicecntrid,pickupStartdate,tat_in_days,mem-purchase,pickup-claim
0,2851218,0,7,500004,233,233,131,0,0,500001,2017-04-27,2016-12-02,2018-02-08,4403,2018-02-13,5,-146,5
1,2838330,1,10,226010,233,233,131,0,0,226022,2017-03-06,2017-03-05,2018-02-04,4444,2018-02-08,12,-1,4
2,2835781,1,2,110018,233,233,228,0,0,110018,2018-02-01,2018-01-27,2018-02-03,4388,2018-02-12,9,-5,9
3,2838589,2,2,110018,233,233,228,1,0,124001,2018-01-17,2018-01-31,2018-02-04,4092,2018-02-14,9,14,10
4,2855214,0,21,110015,233,233,131,0,0,201301,2018-01-23,2018-01-20,2018-02-09,4580,2018-02-14,8,-3,5


In [56]:
data.columns

Index(['srno', 'mobmake', 'hubid', 'hubpincode', 'pickuppartnerid',
       'deliverypartnerid', 'insuranceCompanyId', 'custpincodecategory',
       'claimtype', 'custpincode', 'memcreateddate', 'mobilepurchasedate',
       'claimintdate', 'servicecntrid', 'pickupStartdate', 'tat_in_days',
       'mem-purchase', 'pickup-claim'],
      dtype='object')

In [57]:
X = data[['mobmake', 'hubid', 'hubpincode', 'insuranceCompanyId', 'custpincodecategory',
       'claimtype', 'custpincode','servicecntrid','mem-purchase', 'pickup-claim']]
y = data.tat_in_days

In [58]:
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.3)

### Linear Regression

In [59]:
lr = LinearRegression()
lr.fit(X_train,y_train)
lr_pred = lr.predict(X_test)
mse = metrics.mean_squared_error(y_test,lr_pred)

print('Training Score:',lr.score(X_train, y_train))
print('Testing Score:',lr.score(X_test, y_test))
print('Root Mean squared Error:',np.sqrt(mse))

Training Score: 0.1756932215296435
Testing Score: 0.1786140195545891
Root Mean squared Error: 4.7113178191518505


### Random Forrest Regressor

In [60]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
rf_pred = rf.predict(X_test)
mse = metrics.mean_squared_error(y_test,rf_pred)

print('Training Score:',rf.score(X_train, y_train))
print('Testing Score:',rf.score(X_test, y_test))
print('Root Mean squared Error:',np.sqrt(mse))

Training Score: 0.8413565624963819
Testing Score: 0.1752297369630379
Root Mean squared Error: 4.721013651199795




### KNN Regressor

In [61]:
knn = KNeighborsRegressor()
knn.fit(X_train,y_train)
knn_pred = knn.predict(X_test)
mse = metrics.mean_squared_error(y_test,knn_pred)

print('Training Score:',knn.score(X_train, y_train))
print('Testing Score:',knn.score(X_test, y_test))
print('Root Mean squared Error:',np.sqrt(mse))

Training Score: 0.37751848067630545
Testing Score: 0.08683460416236854
Root Mean squared Error: 4.967564124732899


### Ridge Regressor

In [62]:
ridge = Ridge()
ridge.fit(X_train,y_train)
ridge_pred = ridge.predict(X_test)
mse = metrics.mean_squared_error(y_test,ridge_pred)

print('Training Score:',ridge.score(X_train, y_train))
print('Testing Score:',ridge.score(X_test, y_test))
print('Root Mean squared Error:',np.sqrt(mse))

Training Score: 0.1752930908154919
Testing Score: 0.17864766375645377
Root Mean squared Error: 4.711221329709188


### SVM Regressor

In [63]:
svc = svm.SVR()
svc.fit(X_train,y_train)
svc_pred = svc.predict(X_test)
mse = metrics.mean_squared_error(y_test,svc_pred)

print('Training Score:',svc.score(X_train, y_train))
print('Testing Score:',svc.score(X_test, y_test))
print('Root Mean squared Error:',np.sqrt(mse))



Training Score: 0.2252224101128374
Testing Score: -0.03144048154232504
Root Mean squared Error: 5.279476308904672


In [64]:
models = pd.DataFrame({
    'Model': ['Linear Regressor','Random Forest Regressor','KNN Regressor','Ridge Regressor','SVM Regressor'],
    'Training Score': [ lr.score(X_train, y_train),rf.score(X_train, y_train), knn.score(X_train, y_train), ridge.score(X_train, y_train),svc.score(X_train, y_train)],
    'Test Score': [ lr.score(X_test, y_test),rf.score(X_test, y_test), knn.score(X_test, y_test), ridge.score(X_test, y_test),svc.score(X_test, y_test),]
    })
models.sort_values(by='Test Score', ascending=False)

Unnamed: 0,Model,Training Score,Test Score
3,Ridge Regressor,0.175293,0.178648
0,Linear Regressor,0.175693,0.178614
1,Random Forest Regressor,0.841357,0.17523
2,KNN Regressor,0.377518,0.086835
4,SVM Regressor,0.225222,-0.03144


#### Random Forrest is giving maximum Test Score, so using GridSearch to find best hyper-parameters for the same

In [65]:
parameters = [{'max_depth' : [5, 7, 9, 11, 12],
               'max_features' : [2, 3, 4, 5],
               'n_estimators' : [10, 15, 20, 17, 22, 25],
               'random_state' : [7, 8, 10, 12]}]

from sklearn.model_selection import GridSearchCV
grid_search = GridSearchCV(estimator = rf,param_grid = parameters,
                           scoring = 'neg_mean_squared_error',n_jobs = -1, cv = 10)

grid_search.fit(X_train, y_train)

best_accuracy = grid_search.best_score_
print("Best accuracy of the model for the training set is:", best_accuracy)

best_params = grid_search.best_params_
print("Best parameters of the model for the training set is:", best_params)

Best accuracy of the model for the training set is: -19.221058106378536
Best parameters of the model for the training set is: {'max_depth': 9, 'max_features': 3, 'n_estimators': 25, 'random_state': 12}


### Using GridCV Parameters

In [66]:
regressor = RandomForestRegressor(max_depth = 11, max_features = 3,
                                  n_estimators = 25, random_state = 7)
regressor.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=25, n_jobs=None, oob_score=False, random_state=7,
           verbose=0, warm_start=False)

### Applying k-fold cross validation again to check mean accuracies in dataset


In [67]:

from sklearn.model_selection import cross_val_score

accuracies = cross_val_score(estimator = regressor, X = X_train, 
                             y = y_train, cv = 10, n_jobs = -1)

print("Showing all 10 of K-Fold Cross Validation accuracies:\n", accuracies)
accuracies_mean = accuracies.mean()
print("\nMean of accuracies:\n", accuracies_mean)
accuracies_std = accuracies.std()
print("\nStandard Deviation:\n", accuracies_std)

Showing all 10 of K-Fold Cross Validation accuracies:
 [0.26864368 0.2433265  0.18080777 0.16960346 0.20801577 0.21701593
 0.20276045 0.28876539 0.29722248 0.19926379]

Mean of accuracies:
 0.22754252226635066

Standard Deviation:
 0.04241342669522615


## Now training our classifier on full training set

In [68]:
regressor.fit(X,y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=11,
           max_features=3, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=25, n_jobs=None, oob_score=False, random_state=7,
           verbose=0, warm_start=False)

### According to above findings we use the following classifier for our predictions

In [69]:
test = pd.read_csv('problem_2_test.csv',parse_dates=['memcreateddate', 'mobilepurchasedate',
       'claimintdate','pickupStartdate'])
test.head()

test['mem-purchase'] = (test.mobilepurchasedate - test.memcreateddate).dt.days
test['pickup-claim'] = (test.pickupStartdate - test.claimintdate).dt.days

In [70]:
test.claimtype = test.claimtype.map(claim_map)
test.mobmake = test.mobmake.map(mob_map)
test.custpincodecategory = test.custpincodecategory.map(pin_map)
test.head()

Unnamed: 0,srno,mobmake,hubid,hubpincode,pickuppartnerid,deliverypartnerid,insuranceCompanyId,custpincodecategory,claimtype,custpincode,memcreateddate,mobilepurchasedate,claimintdate,servicecntrid,pickupStartdate,mem-purchase,pickup-claim
0,2829088,0.0,3,560011,233,233,131,0,0,560093,2017-07-30,2017-07-27,2018-02-01,4467,2018-02-06,-3,5
1,2829866,1.0,2,110018,233,233,228,0,0,110034,2017-07-17,2017-01-25,2018-02-01,4388,2018-02-08,-173,7
2,2894489,1.0,3,560011,233,233,228,2,0,590014,2018-01-12,2018-01-11,2018-02-21,4460,2018-02-28,-1,7
3,2876708,1.0,2,110018,233,233,228,1,0,250110,2017-08-07,2017-05-21,2018-02-16,4388,2018-02-26,-78,10
4,2890283,0.0,22,400104,233,233,131,2,0,496001,2017-11-10,2017-09-22,2018-02-20,4581,2018-02-23,-49,3


### Checking for null values in test data



In [71]:
print(test.isna().sum())

srno                   0
mobmake                4
hubid                  0
hubpincode             0
pickuppartnerid        0
deliverypartnerid      0
insuranceCompanyId     0
custpincodecategory    0
claimtype              0
custpincode            0
memcreateddate         0
mobilepurchasedate     0
claimintdate           0
servicecntrid          0
pickupStartdate        0
mem-purchase           0
pickup-claim           0
dtype: int64


### Filling null values with median value of column



In [72]:
test.fillna(test.mean(),inplace=True)

### Creating a new file for saving the predicted labels for test data



In [73]:
test = test[X.columns]

In [74]:
answers = pd.DataFrame(regressor.predict(test))
answers.to_csv('test_data_answers.csv', index=False,header=['Dataset'])

### Appending to test data and saving file


In [75]:
test['Dataset'] = answers
test.head()
test.to_csv('test_data with answers.csv')