# Predictions for counterfeit medicine sales

In [1]:
import os
import warnings

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

In [2]:
# Read the train/test CSV files.
# The data directory defaults to the original location but can be overridden
# with the COUNTERFEIT_DATA_DIR environment variable, so the notebook also runs
# on machines where that absolute path does not exist.

DATA_DIR = os.environ.get("COUNTERFEIT_DATA_DIR", "/Users/Namdeo/Downloads")
datafile_train = os.path.join(DATA_DIR, "counterfeit_train.csv")
datafile_test = os.path.join(DATA_DIR, "counterfeit_test.csv")
bd_train = pd.read_csv(datafile_train)
bd_test = pd.read_csv(datafile_test)

In [3]:
# Preview the first rows of the training data
bd_train.head()

Unnamed: 0,Medicine_ID,Counterfeit_Weight,DistArea_ID,Active_Since,Medicine_MRP,Medicine_Type,SidEffect_Level,Availability_rating,Area_Type,Area_City_Type,Area_dist_level,Counterfeit_Sales
0,RRA15,13.1,Area046,1995,160.2366,Antimalarial,critical,0.070422,DownTown,Tier 1,Small,1775.5026
1,YVV26,,Area027,1983,110.4384,Mstablizers,mild,0.013,CityLimits,Tier 3,Medium,3069.152
2,LJC15,9.025,Area046,1995,259.4092,Cardiac,mild,0.060783,DownTown,Tier 1,Small,2603.092
3,GWC40,11.8,Area046,1995,99.983,OralContraceptives,mild,0.065555,DownTown,Tier 1,Small,1101.713
4,QMN13,,Area019,1983,56.4402,Hreplacements,critical,0.248859,MidTownResidential,Tier 1,Small,158.9402


In [4]:
# Dimensions of the training frame as (rows, columns)
bd_train.shape

(6818, 12)

In [5]:
# Data type of every column (object columns are the categorical ones encoded later)
bd_train.dtypes

Medicine_ID             object
Counterfeit_Weight     float64
DistArea_ID             object
Active_Since             int64
Medicine_MRP           float64
Medicine_Type           object
SidEffect_Level         object
Availability_rating    float64
Area_Type               object
Area_City_Type          object
Area_dist_level         object
Counterfeit_Sales      float64
dtype: object

In [6]:
# Summary statistics for the numeric columns
bd_train.describe()

Unnamed: 0,Counterfeit_Weight,Active_Since,Medicine_MRP,Availability_rating,Counterfeit_Sales
count,5652.0,6818.0,6818.0,6818.0,6818.0
mean,14.115057,1995.836316,151.401518,0.079174,2280.58348
std,4.649668,8.368979,62.203961,0.051481,1693.354404
min,5.855,1983.0,41.79,0.013,146.29
25%,9.995,1985.0,104.5094,0.040058,933.2656
50%,13.8,1997.0,153.1957,0.066955,1902.6704
75%,18.05,2002.0,196.14835,0.107697,3207.6384
max,22.65,2007.0,277.1884,0.341391,13199.9648


In [7]:
# Drop the identifier column(s): they have too many unique values to be
# useful as a feature.
# `columns=` is used instead of a positional axis argument, which pandas 2.0
# no longer accepts, and the frames are rebound rather than mutated in place.

ID_COLS = ['Medicine_ID']

bd_train = bd_train.drop(columns=ID_COLS)
bd_test = bd_test.drop(columns=ID_COLS)

In [8]:
# Create dummy (one-hot) columns for the character columns.
#
# The category list of each column is taken from the TRAINING data and applied
# to both frames, so train and test always end up with exactly the same dummy
# columns (and the same dropped reference level), even if a level is absent
# from, or only present in, the test file. A test value never seen in training
# is encoded as all zeros.

CATEGORICAL_COLS = ['Medicine_Type', 'SidEffect_Level', 'Area_Type',
                    'Area_City_Type', 'Area_dist_level', 'DistArea_ID']

for col in CATEGORICAL_COLS:
    # Sorted unique values reproduce the column order get_dummies would pick
    # on its own for the training frame.
    levels = pd.CategoricalDtype(sorted(bd_train[col].dropna().unique()))

    train_dummies = pd.get_dummies(bd_train[col].astype(levels),
                                   prefix=col, drop_first=True)
    # Dummies go in front of the remaining columns, as before.
    bd_train = pd.concat([train_dummies, bd_train.drop(columns=col)], axis=1)

    test_dummies = pd.get_dummies(bd_test[col].astype(levels),
                                  prefix=col, drop_first=True)
    bd_test = pd.concat([test_dummies, bd_test.drop(columns=col)], axis=1)

In [9]:
# Checking null values: number of missing entries in each column

bd_train.isnull().sum()

DistArea_ID_Area013                    0
DistArea_ID_Area017                    0
DistArea_ID_Area018                    0
DistArea_ID_Area019                    0
DistArea_ID_Area027                    0
DistArea_ID_Area035                    0
DistArea_ID_Area045                    0
DistArea_ID_Area046                    0
DistArea_ID_Area049                    0
Area_dist_level_Medium                 0
Area_dist_level_Small                  0
Area_dist_level_Unknown                0
Area_City_Type_Tier 2                  0
Area_City_Type_Tier 3                  0
Area_Type_DownTown                     0
Area_Type_Industrial                   0
Area_Type_MidTownResidential           0
SidEffect_Level_mild                   0
Medicine_Type_Antacids                 0
Medicine_Type_Antibiotics              0
Medicine_Type_Antifungal               0
Medicine_Type_Antimalarial             0
Medicine_Type_Antipyretics             0
Medicine_Type_Antiseptics              0
Medicine_Type_An

In [10]:
# Fill missing values in the training data with the mean of their column

missing_col = ['Counterfeit_Weight']

for col_name in missing_col:
    col_mean = bd_train[col_name].mean()
    bd_train[col_name] = bd_train[col_name].fillna(col_mean)

In [11]:
# Fill missing values in the test data with the TRAINING mean of the column,
# so the test set goes through the same preprocessing the model is fitted on
# instead of using statistics computed from the test set itself.
# (Mean-imputing the training column leaves its mean unchanged, so this value
# is the same whether or not the training imputation has already run.)
missing_col = ['Counterfeit_Weight']

for i in missing_col:
    bd_test[i] = bd_test[i].fillna(bd_train[i].mean())

# Preparing data to fit the model

In [12]:
# Name of the column to predict
target='Counterfeit_Sales'

In [13]:
# Separate the predictors from the target column.
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
x_train = bd_train.drop(columns=target)
y_train = bd_train[target]

# Fitting the model

In [14]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV

In [15]:
def report(results, n_top=3):
    """Print the best-ranked candidates of a hyper-parameter search.

    Parameters
    ----------
    results : mapping
        Search results indexed by 'rank_test_score', 'mean_test_score',
        'std_test_score' and 'params' (e.g. the ``cv_results_`` attribute of
        a fitted search object). 'rank_test_score' must support element-wise
        comparison with an integer, as a NumPy array does.
    n_top : int, default 3
        Ranks 1..n_top are printed; candidates tied on a rank are all shown.

    Returns
    -------
    None
        The summary is written to standard output.
    """
    for rank in range(1, n_top + 1):
        # Positions of every candidate holding this rank (ties included).
        tied = np.flatnonzero(results['rank_test_score'] == rank)
        for idx in tied:
            mean_score = results['mean_test_score'][idx]
            std_score = results['std_test_score'][idx]
            print("Model with rank: {0}".format(rank))
            print("Mean validation score: {0:.5f} (std: {1:.5f})".format(
                  mean_score, std_score))
            print("Parameters: {0}".format(results['params'][idx]))
            print("")

In [16]:
# Candidate values for each random-forest hyper-parameter; the randomized
# search samples combinations from these lists.
param_dist = dict(
    n_estimators=[50, 100, 200],
    max_features=[2, 4, 5, 6, 8],
    bootstrap=[True, False],
    max_depth=[None, 5, 10, 15, 20, 30, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
)

In [17]:
# Base estimator for the search. A fixed random_state makes the forests (and
# therefore the reported scores and predictions) reproducible across runs.
reg = RandomForestRegressor(random_state=42)

In [18]:
# Randomized hyper-parameter search: 10-fold cross-validation scored by
# negative mean absolute error (higher, i.e. closer to zero, is better).
n_iter_search = 10  # number of parameter combinations to sample
random_search = RandomizedSearchCV(
    reg,
    param_distributions=param_dist,
    n_iter=n_iter_search,  # previously hard-coded, leaving n_iter_search unused
    scoring='neg_mean_absolute_error',
    cv=10,
    n_jobs=-1,
    verbose=20,
    random_state=42,  # sample the same candidates on every run
)

In [19]:
# Run the randomized search on the training features and target
random_search.fit(x_train,y_train)

Fitting 10 folds for each of 10 candidates, totalling 100 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   3 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done   4 tasks      | elapsed:    5.8s
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   6 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   7 tasks      | elapsed:    5.9s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    6.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    6.4s
[Parallel(n_jobs=-1)]: Done  11 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done  12 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  13 tasks      | elapsed:    6.7s
[Parallel(n_jobs=-1)]: Done  14 tasks      | elapsed:    6.8s
[Parallel(n_jobs=-1)]: Done  15 tasks      | elapsed:   

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_st...


In [20]:
# Print the candidates ranked 1 to 5 by mean validation score
report(random_search.cv_results_,5)

Model with rank: 1
Mean validation score: -797.72811 (std: 20.73870)
Parameters: {'n_estimators': 50, 'min_samples_split': 20, 'min_samples_leaf': 5, 'max_features': 5, 'max_depth': 30, 'bootstrap': False}

Model with rank: 2
Mean validation score: -802.01069 (std: 21.42458)
Parameters: {'n_estimators': 50, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 4, 'max_depth': 50, 'bootstrap': False}

Model with rank: 3
Mean validation score: -821.17317 (std: 24.94259)
Parameters: {'n_estimators': 100, 'min_samples_split': 10, 'min_samples_leaf': 10, 'max_features': 5, 'max_depth': 15, 'bootstrap': True}

Model with rank: 4
Mean validation score: -828.78737 (std: 21.93841)
Parameters: {'n_estimators': 200, 'min_samples_split': 10, 'min_samples_leaf': 2, 'max_features': 2, 'max_depth': 70, 'bootstrap': False}

Model with rank: 5
Mean validation score: -865.86070 (std: 33.94565)
Parameters: {'n_estimators': 50, 'min_samples_split': 2, 'min_samples_leaf': 15, 'max_features': 4, '

# Predictions

In [21]:
# Predict sales for the test set with the best model found by the search.
# The test frame is indexed by the training feature columns so the features
# are passed in exactly the order the model was fitted on; a column missing
# from the test frame raises a KeyError here instead of producing a shape
# mismatch or silently misaligned predictions.
test_features = bd_test[x_train.columns]
Predictions = pd.DataFrame({target: random_search.predict(test_features)})