In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Locations of the training and test CSV files.
train_file = r'D:\DS_and_AI\ML_Python\Project\Project_3\counterfeit_train.csv'
test_file = r'D:\DS_and_AI\ML_Python\Project\Project_3\counterfeit_test.csv'

data_train, data_test = pd.read_csv(train_file), pd.read_csv(test_file)

# Tag every row with its origin so the combined frame can be split apart again
# after preprocessing; the test rows get a NaN placeholder for Counterfeit_Sales.
data_train['data'] = 'train'
data_test['Counterfeit_Sales'] = np.nan
data_test['data'] = 'test'

# Stack train on top of test and discard the columns that are not used as
# features. SidEffect_Level is kept; it is dummy-encoded further down.
dataset = pd.concat([data_train, data_test]).drop(
    columns=['Active_Since', 'Medicine_ID', 'DistArea_ID', 'Area_dist_level']
)


# Impute missing Counterfeit_Weight values with the column mean.
# The column is coerced to numeric first (unparseable entries become NaN) so the
# mean is well defined and those entries are imputed too. The fill is applied to
# the column itself: assigning the result of a whole-frame fillna() to a single
# column is rejected by current pandas.
# NOTE: the mean is computed over the combined train + test rows.
dataset['Counterfeit_Weight'] = pd.to_numeric(dataset['Counterfeit_Weight'], errors='coerce')
dataset['Counterfeit_Weight'] = dataset['Counterfeit_Weight'].fillna(dataset['Counterfeit_Weight'].mean())

# Replace each categorical column with 0/1 indicator columns.
cat_col = ['Medicine_Type', 'Area_Type', 'Area_City_Type', 'SidEffect_Level']

for col in cat_col:
    # Level frequencies, most frequent first; missing values count as a level.
    k = dataset[col].value_counts(dropna=False)
    # Keep the levels seen more than 50 times; the last (least frequent) of
    # those gets no indicator column.
    cats = k.index[k > 50][:-1]
    for cat in cats:
        # A missing-value level cannot be matched with == and is not a string,
        # so it is detected with isna() and its label is stringified.
        if pd.isna(cat):
            indicator = dataset[col].isna()
        else:
            indicator = dataset[col] == cat
        name = col + '_' + str(cat)
        dataset[name] = indicator.astype(int)

    # The original categorical column is no longer needed once encoded.
    del dataset[col]

# Separate the training and testing rows again now that preprocessing is done.
# The helper 'data' column is removed from both frames; the placeholder
# Counterfeit_Sales column is removed from the test rows only.
# drop() without inplace=True returns new frames, so nothing is mutated on a
# filtered selection of `dataset` (in-place edits of such selections can raise
# SettingWithCopyWarning, which this notebook's warnings filter would hide).
dataset_train = dataset.loc[dataset['data'] == 'train'].drop(columns=['data'])
dataset_test = dataset.loc[dataset['data'] == 'test'].drop(columns=['data', 'Counterfeit_Sales'])

# Hold out 20% of the training rows for evaluation (fixed seed for a repeatable split).
from sklearn.model_selection import train_test_split
train, test = train_test_split(dataset_train, test_size=0.2, random_state=1)

# Features are every column except the target; y_test is kept as a one-column DataFrame.
x_train, y_train = train.drop(columns=['Counterfeit_Sales']), train['Counterfeit_Sales']
x_test, y_test = test.drop(columns=['Counterfeit_Sales']), test['Counterfeit_Sales'].to_frame()

#writing y_test (hold-out target) values to excel
#with pd.ExcelWriter('test_predict_original_1.xlsx') as writer:
#    y_test.to_excel(writer,"Sheet1",index=False)
#    y_test.to_excel(writer,"Sheet2")

In [2]:
# Inspect the training feature matrix (Counterfeit_Sales removed) after preprocessing.
x_train

Unnamed: 0,Counterfeit_Weight,Medicine_MRP,Availability_rating,Medicine_Type_Antibiotics,Medicine_Type_Hreplacements,Medicine_Type_Antiseptics,Medicine_Type_OralContraceptives,Medicine_Type_Antipyretics,Medicine_Type_Cardiac,Medicine_Type_Mstablizers,...,Medicine_Type_Statins,Medicine_Type_MuscleRelaxants,Medicine_Type_Antifungal,Medicine_Type_Stimulants,Area_Type_DownTown,Area_Type_MidTownResidential,Area_Type_CityLimits,Area_City_Type_Tier 3,Area_City_Type_Tier 2,SidEffect_Level_mild
2490,10.495000,69.2246,0.128354,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,1
5270,11.495000,153.4154,0.013000,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1
82,14.157645,199.7872,0.177966,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1
3960,12.300000,230.0798,0.146353,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
2362,20.650000,179.2816,0.078508,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
905,7.825000,97.0882,0.103279,0,0,1,0,0,0,0,...,0,0,0,0,1,0,0,0,1,1
5192,16.600000,114.3332,0.051460,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1
3980,12.800000,111.1358,0.138603,0,0,0,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
235,15.450000,207.2110,0.027719,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,1,0,1


In [3]:
from sklearn.model_selection import RandomizedSearchCV

In [4]:
# Candidate hyper-parameter values handed to the randomized search.
params = dict(
    max_depth=[None, 5, 10, 15, 20, 30, 50, 70],
    min_samples_leaf=[1, 2, 5, 10, 15, 20],
    min_samples_split=[2, 5, 10, 15, 20],
)

In [5]:
# Echo the search space to confirm its contents.
params

{'max_depth': [None, 5, 10, 15, 20, 30, 50, 70],
 'min_samples_leaf': [1, 2, 5, 10, 15, 20],
 'min_samples_split': [2, 5, 10, 15, 20]}

In [6]:
from sklearn.ensemble import RandomForestRegressor

In [7]:
# Base estimator for the hyper-parameter search.
# A fixed random_state makes the fitted forest reproducible across runs;
# 1 matches the seed already used for the train/test split.
reg = RandomForestRegressor(random_state=1)

In [8]:
# Randomized hyper-parameter search over `params`: 10 sampled candidates,
# each evaluated with 10-fold cross-validation.
# random_state pins which candidates are sampled so reruns compare the same set.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score, while the hold-out evaluation further down uses
# mean absolute error -- consider scoring='neg_mean_absolute_error'.
random_search = RandomizedSearchCV(
    reg,
    param_distributions=params,
    n_iter=10,
    cv=10,
    random_state=1,
)

In [9]:
# Show the configured (not yet fitted) search object.
random_search

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_state=

In [10]:
# Run the search on the training split of the labelled data.
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_state=

In [11]:
# Predict Counterfeit_Sales for the hold-out split with the fitted search object.
y_pred_rf = random_search.predict(x_test)

In [12]:
# Inspect the hold-out predictions.
y_pred_rf

array([2645.50554915, 1653.26913558, 3213.40197263, ...,  762.52020597,
       1813.67090454,  762.52020597])

In [13]:
# Wrap the prediction array in a DataFrame; note this rebinds y_pred_rf from an
# array to a DataFrame.
y_pred_rf = pd.DataFrame(y_pred_rf)

In [14]:
#with pd.ExcelWriter('test_predict_rf.xlsx') as writer:
#    y_pred_rf.to_excel(writer,"Sheet1",index=False)
#    y_pred_rf.to_excel(writer,"Sheet2")

In [15]:
from sklearn.metrics import mean_absolute_error
# Mean absolute error of the predictions on the 20% hold-out split.
mean_absolute_error(y_test,y_pred_rf)

717.1416404767402

In [16]:
# Predict Counterfeit_Sales for the preprocessed rows that came from the test file.
final_pred_rf = random_search.predict(dataset_test)

In [17]:
# Inspect the predictions for the test file.
final_pred_rf

array([2256.5472315 , 4147.12012722, 1593.41723162, ..., 2900.70452007,
       3666.96323477, 4071.12895516])

In [18]:
# Wrap the prediction array in a DataFrame. Medicine_ID was dropped during
# preprocessing, so the rows here are identified only by their position.
final_pred_rf = pd.DataFrame(final_pred_rf)

In [19]:
#with pd.ExcelWriter('final_predict_rf_new.xlsx') as writer:
 #   final_pred_rf.to_excel(writer,"Sheet1",index=False,columns=['Medicine_ID'])
  #  final_pred_rf.to_excel(writer,"Sheet2")