# ~MegaMachine~

**Copyright@~MEGAMACHINE~**; 

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. You may obtain a copy of the License at https://www.apache.org/licenses/LICENSE-2.0. Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

**Disclosure: Nothing in this lecture should be considered investment or business advice. Past performance is not necessarily indicative of future returns. The Predatory Pricing Dataset and Customers Dataset used in these lectures serve only as general examples to show how one can do data analysis using pandas (in Python). I AM NOT RESPONSIBLE FOR ANY KIND OF LOSS/PROFIT IN YOUR BUSINESS/STOCK RETURNS. Consult a financial adviser before investing, or invest at your own risk.**

In [1]:

import numpy as np, pandas as pd, matplotlib.pyplot as plt


# Report the library versions (and the pyplot module) this notebook runs against.
for label, value in (
    ('numpy version:', np.__version__),
    ('pandas version: ', pd.__version__),
    ('pyplot: ', plt),
):
    print(label, value)

numpy version: 1.23.1
pandas version:  1.4.3
pyplot:  <module 'matplotlib.pyplot' from 'C:\\Users\\ankit.k\\AppData\\Local\\Programs\\Python\\Python310\\lib\\site-packages\\matplotlib\\pyplot.py'>


# About Dataset – Predatory Pricing Dataset from Amazon

Description: Data on the unfair pricing and discounting practices of sellers on Amazon during the COVID-19 crisis. Sellers on Amazon are using the COVID-19 crisis to exploit consumers. This data contains information on product details, prices, discounts, reviews, listing dates, and other key data points for understanding these unfair practices.

Dataset source ([data.world](https://data.world/)) https://data.world/data-hut/predatory-pricing-data-from-amazon 

Dataset download links ([amazon-final.csv](https://query.data.world/s/mswpa2fvoyy4fsycneexuogbahgaxm), [amazon_2020-03-09(1).csv](https://query.data.world/s/y5m7cu66tmmmfpmm3wgh6hbvkapmaa)), direct link: [direct_drive_link](https://drive.google.com/drive/folders/1HoaQ_ijXxNx0h3VHDnUY5o5867YhJMI4?usp=sharing)

**NOTE: The data is available under an educational license only. Do not use the dataset for anything other than educational purposes.**


In [None]:
#!wget -O predatory_pricing_processed https://query.data.world/s/oh2altdijecuodbyfb4xqumnwgnecq 
#!wget -O predatory_pricing_unprocessed https://query.data.world/s/cogmqzeegvpziizs7nahhvbbj3uhdq

# Download links for the two variants of the dataset.
processed_data_file_link_csv = 'https://query.data.world/s/oh2altdijecuodbyfb4xqumnwgnecq'
unprocessed_data_file_link_csv = 'https://query.data.world/s/cogmqzeegvpziizs7nahhvbbj3uhdq'

# The processed file is read with ';' as the separator, the raw one with ','.
data = pd.read_csv(processed_data_file_link_csv, sep=';')
data_raw = pd.read_csv(unprocessed_data_file_link_csv, sep=',')

## processed data 

In [None]:
# Peek at the first three rows of the processed data.
data.head(n=3)

In [None]:
# Replace the spaces in every column name with underscores.
column_dict = {name: name.replace(' ', '_') for name in data.columns}
data.rename(columns=column_dict, inplace=True)

# Keep an untouched copy (all columns, original values) for later look-ups.
data_original = data.copy()

# Drop the free-text / URL columns that are not needed for the analysis.
unwanted_columns = [
    'product_name',
    'product_url',
    'image_url',
    'product_description',
    'date_first_available',
]
data.drop(columns=unwanted_columns, inplace=True)

# Move the last column to position 2, keeping the order of all others.
ordered_columns = list(data.columns)
ordered_columns.insert(2, ordered_columns.pop())
data = data[ordered_columns]

data.head(3)

In [None]:
# data size: (number of rows, number of columns)
data.shape

In [None]:
# Get info: print the per-column summary of the cleaned frame
data.info()

In [None]:
# Count the missing entries in every column.
data.isnull().sum()

### change data type

In [None]:
# Convert discount_percentage from 'NN%' strings to fractions (e.g. '25%' -> 0.25).
# Vectorised string operations replace the per-row ``apply``: they avoid the
# ``convert_dtype`` argument (deprecated in newer pandas releases) and let
# missing values pass through as NaN instead of raising.
data['discount_percentage'] = (
    data['discount_percentage']
    .str.split('%')   # keep only the text before the first '%'
    .str[0]
    .astype(float)
    / 100
)

In [None]:
# info: re-print the column summary to check the dtype of discount_percentage
data.info()

### processed strings 

In [None]:
# Collect the distinct brand names once, report how many there are, then show them.
unique_brand_names = data.brand_name.unique()
print('unique value counts: ', len(unique_brand_names), 'out of', data.shape[0])
unique_brand_names

In [None]:
# Preview of the brand names after normalisation (nothing is written back yet).
from string import punctuation

# str.maketrans('', '', chars) builds a table that deletes every char in `chars`;
# str.translate applies that table to a string.
punctuation_remover = str.maketrans('', '', punctuation)

data.brand_name.apply(
    lambda value: value.lower().translate(punctuation_remover).replace(' ', '_')
).unique()

In [None]:
# Normalise brand_name and seller_name in place and report how many distinct
# values each column has before and after.
from string import punctuation  # local import keeps this cell runnable on its own

# Built once instead of once per row.
_PUNCTUATION_TABLE = str.maketrans('', '', punctuation)


def _normalize_name(value):
    """Lower-case *value*, delete punctuation and replace spaces with underscores."""
    return value.lower().translate(_PUNCTUATION_TABLE).replace(' ', '_')


print('intinal brand_name counts:', len(data.brand_name.unique()))
print('intinal seller_name counts:', len(data.seller_name.unique()))

# Both columns go through the same helper so they cannot drift apart.
data.brand_name = data.brand_name.apply(_normalize_name)
data.seller_name = data.seller_name.apply(_normalize_name)

print('final brand_name counts:', len(data.brand_name.unique()))
print('final seller_name counts:', len(data.seller_name.unique()))

### under value brand_name, seller_name

In [None]:
# Summary statistics of the three numeric price columns, one row per column.
price_columns = ['mrp', 'sale_price', 'discount_percentage']
data.describe().loc[:, price_columns].transpose()

In [None]:
# "Under value" rows: discount strictly above the mean discount.
data_uv = data[data.discount_percentage > data.discount_percentage.mean()]

# Distinct brands among those rows.
print('number of brand_name under value', len(data_uv.brand_name.unique()))
print(sorted(data_uv.brand_name.unique()))
# Distinct sellers among those rows (the label previously said brand_name by mistake).
print('number of seller_name under value', len(data_uv.seller_name.unique()))
print(sorted(data_uv.seller_name.unique()))

In [None]:
# "High under value" rows: discount above 1.5 times the mean discount.
data_huv = data[data.discount_percentage > data.discount_percentage.mean() * 1.5]

# Distinct brands among those rows.
print('number of brand_name under value', len(data_huv.brand_name.unique()))
print(sorted(data_huv.brand_name.unique()))
# Distinct sellers among those rows (the label previously said brand_name by mistake).
print('number of seller_name under value', len(data_huv.seller_name.unique()))
print(sorted(data_huv.seller_name.unique()))

## count plots 

In [None]:
# Bar chart of the number of distinct values in every column.
unique_fig, unique_ax = plt.subplots(figsize=(20, 7))
data.nunique().plot.bar(ax=unique_ax)
plt.show()

### two class

In [None]:
# make classes - two: 'uv' (under value) when the discount is strictly above the
# mean discount, otherwise 'nuv'.
# The mean is computed once; evaluating it inside a per-row comprehension
# re-scanned the whole column for every row.
mean_discount = data.discount_percentage.mean()
data['true_2_cls'] = np.where(data.discount_percentage > mean_discount, 'uv', 'nuv')

In [None]:
# import
import seaborn as sns

# Pair plot of the price features, coloured by the two-class label.
sns.pairplot(data[['mrp', 'sale_price', 'discount_percentage', 'true_2_cls']], hue='true_2_cls')
# plt.show() renders the figure; the former plt.plot() call drew nothing.
plt.show()

### three class

In [None]:
# make classes - three:
#   'nuv' - discount at or below the mean discount
#   'huv' - discount above 1.5 times the mean discount
#   'uv'  - everything in between
# The mean is computed once instead of once (or twice) per row.
# The lower boundary uses <= so that a discount exactly equal to the mean is
# 'nuv', matching the two-class rule ('uv' only when strictly above the mean).
mean_discount = data.discount_percentage.mean()
data['true_3_cls'] = np.where(
    data.discount_percentage <= mean_discount,
    'nuv',
    np.where(data.discount_percentage > mean_discount * 1.5, 'huv', 'uv'),
)

In [None]:
# Pair plot of the price features, coloured by the three-class label.
sns.pairplot(data[['mrp', 'sale_price', 'discount_percentage', 'true_3_cls']], hue='true_3_cls')
# plt.show() renders the figure; the former plt.plot() call drew nothing.
plt.show()

#### worst five products 

In [None]:
# Rank the products by discount, largest first, and show the URLs of the top five.
# data_original was copied before discount_percentage was converted to numbers,
# so the sort key parses the text in front of '%' as a float; sorting the raw
# strings would be lexicographic (e.g. '9%' would rank above '85%').
worse_sorted = data_original.sort_values(
    'discount_percentage',
    ascending=False,
    na_position='last',
    key=lambda column: pd.to_numeric(
        column.astype(str).str.split('%').str[0], errors='coerce'
    ),
)
# Read from the sorted frame; the unsorted one only gives the first five rows of the file.
worse_sorted.iloc[:5]['product_url'].values

## prediction (sample)

In [None]:
# doc -> https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html
from sklearn.cluster import KMeans
# doc -> https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
from sklearn.preprocessing import normalize

# Single-feature matrix (n_rows x 1) holding the discount fraction.
X = data[['discount_percentage']].values

# Scale the column to unit L2 norm; the norm itself is kept in norms_of_x.
X_norm, norms_of_x = normalize(X, norm='l2', axis=0, copy=True, return_norm=True)

# Two-cluster KMeans with a fixed seed; fit() returns the fitted estimator.
clusterer_ = KMeans(n_clusters=2, random_state=10, n_init=3, max_iter=3).fit(X_norm)
print('cluster_centers_:', clusterer_.cluster_centers_)

In [None]:
# Scatter every row's discount, coloured by its KMeans cluster, with the mean
# discount drawn as a horizontal reference line.
from sklearn.preprocessing import LabelEncoder

encoder_cls_2_ = LabelEncoder()

discounts = data.discount_percentage
n_rows = discounts.shape[0]
# Cluster 0 -> red, any other cluster -> green.
cluster_colours = ['g' if label != 0 else 'r' for label in clusterer_.labels_]

plt.figure(figsize=(15, 10))
# Alternative colouring by the rule-based labels:
#   c=encoder_cls_2_.fit_transform(data.true_2_cls.values)
plt.scatter(range(n_rows), discounts, c=cluster_colours, marker='1')
plt.plot([0, n_rows], [discounts.mean()] * 2)
plt.grid()
plt.show()

## feature_selection

In [None]:
# List the current column labels before slicing features by position.
data.columns

In [None]:
# Imports are repeated here so the cell can run on its own.
# doc -> https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html#sklearn.preprocessing.Normalizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import normalize

# Features: every column except the first one and the last three.
X_fs = data.values[:, 1:-3]
# Labels: the three-class column.
y_fs = data.true_3_cls.values

# One encoder per encoded column, plus one for the labels.
# NOTE(review): assumes columns 0 and 1 of X_fs are brand_name and seller_name - confirm.
encoder_cls_3_brand_name = LabelEncoder()
encoder_cls_3_seller_name = LabelEncoder()
encoder_cls_labels = LabelEncoder()

# Replace the first two feature columns by their integer codes.
for column_index, column_encoder in enumerate(
    (encoder_cls_3_brand_name, encoder_cls_3_seller_name)
):
    X_fs[:, column_index] = column_encoder.fit_transform(X_fs[:, column_index])
y_fs = encoder_cls_labels.fit_transform(y_fs)

# Work on a copy so X_fs keeps the un-normalised values.
X_fs_norm = X_fs.copy()

# L1-normalise feature columns 2 and 3 (column-wise) and keep the norms.
X_fs_norm[:, 2:4], norms_of_x = normalize(
    X_fs_norm[:, 2:4], norm='l1', axis=0, copy=True, return_norm=True
)

X_fs[:2], X_fs_norm[:2], set(y_fs)

In [None]:
# source - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html
# source - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html
# source - https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
# source - https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html
from sklearn.feature_selection import RFE
# source - https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
from sklearn.tree import DecisionTreeClassifier

# 5-fold cross-validation with shuffling and a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=100)

# Candidate numbers of features to keep: 1 up to and including all of them.
# range() excludes its end, so "+ 1" is needed to evaluate the full feature set.
hyper_params = [{'n_features_to_select': list(range(1, X_fs.shape[1] + 1))}]

# Base estimator ranked by RFE: a decision tree *classifier*.
# It is seeded so repeated runs give the same scores, like the seeded folds.
test_tree_single = DecisionTreeClassifier(random_state=100)
# Stand-alone fit on all features.
test_tree_single.fit(X_fs_norm, y_fs)

# Grid search over the number of features kept by RFE.
# scoring=None falls back to the estimator's own score method.
# score - source: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
model_f_select = GridSearchCV(
    estimator=RFE(test_tree_single),
    param_grid=hyper_params,
    scoring=None,
    cv=folds,
    verbose=1,
    return_train_score=True,
)

# fit the model
model_f_select.fit(X_fs_norm, y_fs)

In [None]:
# Tabulate the grid-search results, one row per candidate setting.
results_ = pd.DataFrame.from_dict(model_f_select.cv_results_)
results_

In [None]:
# Plot mean test and train score against the number of selected features.
plt.figure(figsize=(16, 6))

plt.plot(results_["param_n_features_to_select"], results_["mean_test_score"])
plt.plot(results_["param_n_features_to_select"], results_["mean_train_score"])
plt.xlabel('number of features')
# The grid search uses scoring=None on a classifier, i.e. the estimator's
# default score, which is mean accuracy - not r-squared.
plt.ylabel('accuracy')
plt.title("Optimal Number of Features")
plt.legend(['test score', 'train score'], loc='upper left')
plt.show()

## Classifier comparison 

In [None]:

# Classifiers to compare, plus the train/test split they are evaluated on.

#from sklearn.neural_network import MLPClassifier
# Source: https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

# (display name, estimator) pairs, split into the two parallel lists used later.
named_classifiers = (
    ("Nearest Neighbors", KNeighborsClassifier(n_neighbors=3)),
    ("Linear SVM", SVC(kernel='linear')),
    ("Decision Tree", DecisionTreeClassifier()),
)
names = [display_name for display_name, estimator in named_classifiers]
classifiers_ = [estimator for display_name, estimator in named_classifiers]

# Integer-encode the two-class labels.
label_encoder_2_cls = LabelEncoder()
y_fs_2_cls = label_encoder_2_cls.fit_transform(data.true_2_cls.values)

# 70/30 split on the first two feature columns only
# (the original notes these as 'brand_name' and 'seller_name').
X_train, X_test, y_train, y_test = train_test_split(
    X_fs_norm[:, :2], y_fs_2_cls, test_size=0.30, random_state=8
)

# shapes of the four pieces and the set of encoded labels
X_train.shape, X_test.shape, y_train.shape, y_test.shape, set(y_fs_2_cls)

In [None]:
# Decision-boundary comparison: a 2 x (number of classifiers + 1) grid.
# Row 1 shows the training points, row 2 the test points; column 1 shows the raw
# points, every further column one classifier's decision surface.
from matplotlib.colors import ListedColormap


def _draw_points(axis, points, labels):
    """Scatter *points* coloured by *labels* and apply the shared limits/ticks."""
    axis.scatter(points[:, 0], points[:, 1], c=labels, cmap=cmap, edgecolors='k')
    axis.set_xlim(xx.min(), xx.max())
    axis.set_ylim(yy.min(), yy.max())
    axis.set_xticks(())
    axis.set_yticks(())


# Mesh covering both features with a 0.5 margin on every side.
# NOTE(review): if the encoded features span a wide range of integer codes, a
# 0.02 step can produce a very large mesh - confirm it fits in memory.
h = .02  # step size in the mesh
x_min, x_max = X_fs_norm[:, 0].min() - .5, X_fs_norm[:, 0].max() + .5
y_min, y_max = X_fs_norm[:, 1].min() - .5, X_fs_norm[:, 1].max() + .5
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
# Flattened mesh points, built once and reused for every classifier.
mesh_points = np.c_[xx.ravel(), yy.ravel()]

# Two-colour map for the points, diverging map for the decision surface.
cmap = ListedColormap(['#FF0000', '#0000FF'])
cm = plt.cm.RdBu

# Columns per row; derived from the classifier list so the second-row index
# stays correct for any number of classifiers (it used to be hard-coded as 4).
n_cols = len(classifiers_) + 1

fig = plt.figure(figsize=(15, 10))

# First column: raw training points (row 1) and raw test points (row 2).
_draw_points(fig.add_subplot(2, n_cols, 1), X_train, y_train)
_draw_points(fig.add_subplot(2, n_cols, n_cols + 1), X_test, y_test)

# Remaining columns: one classifier each, starting at subplot index 2.
for plt_number, (name, clf) in enumerate(zip(names, classifiers_), 2):
    # train, then score on the held-out data
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)

    # Value for every mesh point: the decision function where the estimator
    # has one, otherwise the predicted probability of class 1.
    if hasattr(clf, "decision_function"):
        Z = clf.decision_function(mesh_points)
    else:
        Z = clf.predict_proba(mesh_points)[:, 1]
    Z = Z.reshape(xx.shape)

    # Row 1: decision surface with the training points on top.
    ax = fig.add_subplot(2, n_cols, plt_number)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    _draw_points(ax, X_train, y_train)

    # Row 2 (same column): decision surface with the test points on top and
    # the test score printed in the lower-right corner.
    ax = fig.add_subplot(2, n_cols, n_cols + plt_number)
    ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
    _draw_points(ax, X_test, y_test)
    ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
            size=15, horizontalalignment='right')

plt.tight_layout()
plt.show()