In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

train_file = r'D:\DS_and_AI\ML_Python\Project\Project_3\counterfeit_train.csv'
test_file = r'D:\DS_and_AI\ML_Python\Project\Project_3\counterfeit_test.csv'

data_train = pd.read_csv(train_file)
data_test = pd.read_csv(test_file)

#adding Counterfeit_Sales column to test data
data_test['Counterfeit_Sales'] = np.nan

#adding data column to test and train columns
data_test['data'] = 'test'
data_train['data'] = 'train'

#combining test and train data
dataset = pd.concat([data_train,data_test])

#dropping columns - Active_Since,Medicine_ID,DistArea_ID,SidEffect_Level,Area_dist_level
dataset.drop(['Active_Since','Medicine_ID','DistArea_ID','SidEffect_Level','Area_dist_level'],axis=1,inplace=True)

#filling NAs with mean
dataset['Counterfeit_Weight'] = dataset.fillna(dataset.mean())

#creating dummies
cat_col=['Medicine_Type' , 'Area_Type', 'Area_City_Type']

for col in cat_col :
    k=dataset[col].value_counts(dropna=False)
    cats=k.index[k>50][:-1]
    for cat in cats:
            name=col+'_'+cat
            dataset[name]=(dataset[col]==cat).astype(int)
        
    del dataset[col]

# converting datatype of Counterfeit_Weight - to numeric
dataset['Counterfeit_Weight'] = pd.to_numeric(dataset['Counterfeit_Weight'],errors='coerce')

#seperating  the training  and testing data after data preprocessing
dataset_train = dataset.loc[dataset['data']=='train']
dataset_test = dataset.loc[dataset['data']=='test']

#dropping data column from test and train dataset
dataset_train.drop(['data'],axis=1,inplace=True)
dataset_test.drop(['data'],axis=1,inplace=True)

#dropping dummy Counterfeit_Sales sales column from dataset_test
dataset_test.drop(['Counterfeit_Sales'],axis=1,inplace=True)

#splitting the data into test and train
from sklearn.model_selection import train_test_split
train,test = train_test_split(dataset_train,test_size=0.2,random_state=1)

x_train = train.drop(['Counterfeit_Sales'],axis=1)
y_train = train['Counterfeit_Sales']
x_test = test.drop(['Counterfeit_Sales'],axis=1)
y_test = test['Counterfeit_Sales']

y_test = pd.DataFrame(y_test)

#writing y_pred values to excel
with pd.ExcelWriter('test_predict_original_2.xlsx') as writer:
    y_test.to_excel(writer,"Sheet1",index=False)
    y_test.to_excel(writer,"Sheet2")

In [8]:
gbm_params={'n_estimators':[50,100,200],
'learning_rate': [0.01,.05,0.1,0.4,0.8,1],
'max_depth':[1,2,3,4,5,6],
'subsample':[0.5,0.8,1],
'max_features':[5,6,7,8]
}

In [9]:
gbm_params

{'n_estimators': [50, 100, 200],
 'learning_rate': [0.01, 0.05, 0.1, 0.4, 0.8, 1],
 'max_depth': [1, 2, 3, 4, 5, 6],
 'subsample': [0.5, 0.8, 1],
 'max_features': [5, 6, 7, 8]}

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV

In [5]:
model = GradientBoostingRegressor()

In [10]:
random_search=RandomizedSearchCV(model,scoring='neg_mean_absolute_error',
param_distributions=gbm_params,
cv=10,n_iter=10,
n_jobs=-1,verbose=False)

In [11]:
random_search.fit(x_train,y_train)

RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=GradientBoostingRegressor(alpha=0.9,
                                                       criterion='friedman_mse',
                                                       init=None,
                                                       learning_rate=0.1,
                                                       loss='ls', max_depth=3,
                                                       max_features=None,
                                                       max_leaf_nodes=None,
                                                       min_impurity_decrease=0.0,
                                                       min_impurity_split=None,
                                                       min_samples_leaf=1,
                                                       min_samples_split=2,
                                                       min_weight_fraction_leaf=0.0,
                           

In [12]:
y_pred_gb = random_search.predict(x_test)

In [13]:
y_pred_gb

array([2722.15927582, 1557.42992174, 3409.7227596 , ...,  881.67409059,
       1696.95256878,  689.76901475])

In [21]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred_gb)

740.2255173897836

In [14]:
y_pred_gb = pd.DataFrame(y_pred_gb)

In [16]:
with pd.ExcelWriter('test_predict_gb.xlsx') as writer:
    y_pred_gb.to_excel(writer,"Sheet1",index=False)
    y_pred_gb.to_excel(writer,"Sheet2")

In [17]:
final_pred_gb = random_search.predict(dataset_test)

In [18]:
final_pred_gb

array([2518.26002682, 3588.36951253, 1479.81952975, ..., 3001.37034884,
       3127.6249207 , 4051.87154397])

In [19]:
final_pred_gb = pd.DataFrame(final_pred_gb)

In [20]:
with pd.ExcelWriter('final_predict_gb.xlsx') as writer:
    final_pred_gb.to_excel(writer,"Sheet1",index=False)
    final_pred_gb.to_excel(writer,"Sheet2")

# XGB

In [23]:
from xgboost.sklearn import XGBRegressor


In [24]:
xgb_params={
"gamma":[0,2,5,8,10],
"max_depth": [2,3,4,5,6,7,8],
"min_child_weight":[0.5,1,2,5,10]
}

xgb2=XGBRegressor(n_estimators=25,subsample=0.8,
colsample_bylevel=0.8,colsample_bytree=0.8)
random_search = RandomizedSearchCV( xgb2, param_distributions = xgb_params,
n_iter = 20, cv= 10,
scoring ='neg_mean_absolute_error',
n_jobs =-1, verbose=False)

In [25]:
random_search.fit(x_train,y_train)



RandomizedSearchCV(cv=10, error_score='raise-deprecating',
                   estimator=XGBRegressor(base_score=0.5, booster='gbtree',
                                          colsample_bylevel=0.8,
                                          colsample_bynode=1,
                                          colsample_bytree=0.8, gamma=0,
                                          importance_type='gain',
                                          learning_rate=0.1, max_delta_step=0,
                                          max_depth=3, min_child_weight=1,
                                          missing=None, n_estimators=25,
                                          n_jobs=1, nthread=None,
                                          objective='reg:linear',
                                          rando...lpha=0,
                                          reg_lambda=1, scale_pos_weight=1,
                                          seed=None, silent=None, subsample=0.8,
                         

In [26]:
y_pred_xgb = random_search.predict(x_test)

In [27]:
y_pred_xgb

array([2590.4648 , 1553.4476 , 2978.978  , ...,  875.9131 , 1883.9407 ,
        731.29865], dtype=float32)

In [28]:
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test,y_pred_xgb)

730.761241171909

In [29]:
y_pred_xgb = pd.DataFrame(y_pred_xgb)

In [35]:
with pd.ExcelWriter('test_predict_xgb.xlsx') as writer:
    y_pred_xgb.to_excel(writer,"Sheet1",index=False)
    y_pred_xgb.to_excel(writer,"Sheet2")

In [31]:
final_pred_xgb = random_search.predict(dataset_test)

In [32]:
final_pred_xgb = pd.DataFrame(final_pred_xgb)

In [36]:
with pd.ExcelWriter('final_predict_xgb.xlsx') as writer:
    final_pred_xgb.to_excel(writer,"Sheet1",index=False)
    final_pred_xgb.to_excel(writer,"Sheet2")