In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#Basic Dependencies
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load the cleaned training set and preview its first rows.
train_csv_path = "/kaggle/input/retailcleanfull/Train_clean.csv"
df_train = pd.read_csv(train_csv_path)
df_train.head()

In [None]:
# Load the cleaned test set and preview its first rows.
test_csv_path = "/kaggle/input/retailcleanfull/Test_clean.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Column overview of the training frame (DataFrame.info prints its summary directly).
df_train.info()

In [None]:
# Bind a second name, `df`, to the training DataFrame (plain assignment, no copy is made).
df=df_train

In [None]:
from numpy.random import seed
from numpy.random import randn
from numpy import percentile
# Seed NumPy's global random number generator.
seed(1)
# Work on the 'Quantity' column under its own name. Rebinding `df` to the
# column made this cell fail when re-run, because `df` was then a Series
# with no 'Quantity' key.
quantity = df_train['Quantity']
# Interquartile range of the column
q25, q75 = percentile(quantity, 25), percentile(quantity, 75)
iqr = q75 - q25
print('Percentiles: 25th=%.3f, 75th=%.3f, IQR=%.3f' % (q25, q75, iqr))
# Outlier cutoffs: 1.5 * IQR below the 25th and above the 75th percentile
cut_off = iqr * 1.5
lower, upper = q25 - cut_off, q75 + cut_off
# Boolean masks replace the element-by-element Python loops.
outliers = quantity[(quantity < lower) | (quantity > upper)]
print('Identified outliers: %d' % len(outliers))
# Values inside the cutoffs. This cell only reports counts; df_train itself is not filtered.
outliers_removed = quantity[(quantity >= lower) & (quantity <= upper)]
print('Non-outlier observations: %d' % len(outliers_removed))
quantity.shape

In [None]:
# Descriptive statistics of the training frame, as returned by DataFrame.describe()
df_train.describe()

In [None]:
# Report the shape of the training frame, then of the test frame.
for frame in (df_train, df_test):
    print(frame.shape)

In [None]:
# `data` is another name for the training DataFrame (plain assignment, no copy is made).
data = df_train

In [None]:
# Number of missing values in each column
data.isna().sum()

In [None]:
# Split column names by dtype: object/category columns first, then int64/float64 ones.
cat_cols = list(data.select_dtypes(include=['object', 'category']).columns)
print(cat_cols)

num_cols = list(data.select_dtypes(include=['int64', 'float64']).columns)
print(num_cols)

In [None]:
# Count the distinct values in every categorical column.
object_nunique = [data[name].nunique() for name in cat_cols]
d = dict(zip(cat_cols, object_nunique))

# Show (column, distinct count) pairs from fewest to most distinct values.
sorted(d.items(), key=lambda pair: pair[1])

In [None]:
# Preview the first rows of the training frame.
data.head()

In [None]:
# Drop the columns that are not used as model features.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and removed in 2.0. A new frame is returned; `data`
# is left untouched.
data1 = data.drop(columns=['AM/PM_PM_sin','AM/PM_PM_cos','InvoiceNo','CustomerID','Time_sin','Time_cos','Year_cos'])
data1.head()

In [None]:
# Keep only rows whose UnitPrice lies strictly between its 0.1% and 99.9% quantiles.
# NOTE(review): data1 is reassigned from itself, so re-running this cell trims
# the already-trimmed frame again.
col = 'UnitPrice'
min_thresold, max_thresold = data1[col].quantile([0.001, 0.999])
within_thresholds = data1[col].gt(min_thresold) & data1[col].lt(max_thresold)
data1 = data1[within_thresholds]


In [None]:
# Shape of the feature frame after the preceding filtering
data1.shape

In [None]:
# Skewness of every column in data1
data1.skew()

In [None]:
# Histogram of the target column (25 bins), a blank line, then its skewness.
data1['UnitPrice'].hist(bins=25)
print()
data1['UnitPrice'].skew()
# Author's note: highly skewed, so scaling needs to be done before applying any regression model.

In [None]:
# Pairwise Pearson correlations between the columns of data1 (Pearson is DataFrame.corr's default method).
corr_matrix = data1.corr()
corr_matrix

In [None]:
# Correlation of every column with UnitPrice, strongest positive first
corr_matrix.UnitPrice.sort_values(ascending=False)

In [None]:
from sklearn.preprocessing import PowerTransformer

# Features are every column except the target.
X = data1.drop(columns='UnitPrice')

# Target as a single-column 2-D array, the layout handed to the transformer.
y = data1['UnitPrice'].to_numpy().reshape(-1, 1)

# Yeo-Johnson power transform with standardisation, fitted on the whole target column.
# NOTE(review): the transformer is fitted before the train/test split, so the
# held-out targets influence it — confirm this is acceptable.
power = PowerTransformer(method='yeo-johnson', standardize=True)
y = power.fit_transform(y)

In [None]:
# Hold out 20% of the rows for evaluation.
from sklearn.model_selection import train_test_split
# A fixed random_state makes the split identical on every run. With
# random_state=None the split was drawn from NumPy's global RNG, so it depended
# on which cells had been executed before this one.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score


import warnings
# Suppress all warnings from here on (no category or module filter is given).
warnings.filterwarnings("ignore")

In [None]:
from sklearn.pipeline import Pipeline

# Candidate regressors; each one is evaluated behind a MinMaxScaler.
# ElasticNet, MLPRegressor, SVR and KNeighborsRegressor are not included in
# this comparison.
candidate_models = [
    ('LR', LinearRegression()),
    ('LASSO', Lasso()),
    ('DT', DecisionTreeRegressor()),
    ('RF', RandomForestRegressor()),
    ('ET', ExtraTreesRegressor()),
    ('GBM', GradientBoostingRegressor()),
    ('XGB', XGBRegressor()),
]
pipelines = [
    ('Scaled' + tag, Pipeline([('Scaler', MinMaxScaler()), (tag, estimator)]))
    for tag, estimator in candidate_models
]

# One splitter shared by every model, so all are scored on the same folds.
# shuffle=True is required for random_state to be accepted: scikit-learn >= 0.24
# raises ValueError when random_state is set while shuffle is False.
kfold = KFold(n_splits=10, shuffle=True, random_state=10)

results = []
names = []
for name, model in pipelines:
    # Scores are negated MSE, so values closer to zero are better.
    cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring='neg_mean_squared_error', n_jobs=-1)
    results.append(cv_results)
    names.append(name)
    print("%s: %f (%f)" % (name, cv_results.mean(), cv_results.std()))

In [None]:
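# Recorded output from an earlier run of the model comparison
# (kept as comments so that running this cell does not raise a SyntaxError):
# ScaledLR: -0.987738 (0.006656)
# ScaledLASSO: -0.999019 (0.005722)
# ScaledEN: -0.999019 (0.005722)
# ScaledDT: -0.056389 (0.003337)
# ScaledRF: -0.032720 (0.001946)
# ScaledET: -0.015866 (0.000895)
# ScaledGBM: -0.526374 (0.006337)
# ScaledXGB: -0.120653 (0.002417)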

In [None]:
# ScaledET is best performing

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
# Extra-Trees regressor with default hyperparameters
ET_model = ExtraTreesRegressor()
# Repeated k-fold: 25 splits x 25 repeats = 625 fits
cv = RepeatedKFold(n_splits=25, n_repeats=25, random_state=2)
# Pass ET_model explicitly. `model` is the loop variable left over from the
# pipeline comparison and refers to a different estimator, so scoring it here
# would report an MAE that does not belong to Extra-Trees.
n_scores = cross_val_score(ET_model, X_train, y_train, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1, error_score='raise',verbose=1)
# Scores are negated MAE (closer to zero is better); print their mean and standard deviation.
print('MAE: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))

In [None]:
# Fit on the training split only, so that scoring on X_test measures
# performance on rows the model has not seen. Fitting on (X, y) includes the
# held-out rows and makes that score meaningless.
ET_model.fit(X_train, y_train)

In [None]:
# n_scores holds negated MAE values from cross-validation, so the largest entry
# is the fold with the smallest error. Report it as an MAE rather than as a
# "K"/"accuracy" pair, and leave k_max alone (it is the KNN neighbour count).
best_fold_mae = -max(n_scores)
print("Best fold MAE = {:.6f}".format(best_fold_mae))

In [None]:
from sklearn.metrics import mean_squared_error
# Score the fitted Extra-Trees model on the held-out split.
predictions = ET_model.predict(X_test)
MSE = mean_squared_error(y_test , predictions)
# The metric computed is mean squared error, so the label says MSE, not MAE.
print('ExtraTrees validation MSE = ',MSE)

In [None]:
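# Recorded output from an earlier run (the value comes from mean_squared_error, despite the "MAE" label):
# ExtraTrees validation MAE =  0.00018089177342840292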

In [None]:
# Decision Tree

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
# Decision-tree regressor with default hyperparameters
DT_model = DecisionTreeRegressor()
# 30-fold cross-validation repeated 10 times (300 fits), with a fixed seed for the splits
cv = RepeatedKFold(n_repeats=10, n_splits=30, random_state=21)
n_scores = cross_val_score(
    DT_model,
    X_train,
    y_train,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    error_score='raise',
    verbose=1,
)
# Mean and standard deviation of the negated-MAE fold scores
mae_mean, mae_std = mean(n_scores), std(n_scores)
print('MAE: %.3f (%.3f)' % (mae_mean, mae_std))

In [None]:
# Fit on the training split only, so that scoring on X_test measures
# performance on unseen rows (fitting on (X, y) includes the held-out rows).
# DT_model is also used later to predict the competition test set; refit it on
# (X, y) before submitting if a model trained on all rows is wanted.
DT_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# Score the fitted decision tree on the held-out split.
predictions = DT_model.predict(X_test)
MSE = mean_squared_error(y_test , predictions)
# Label the output with the model and metric actually used here.
print('DecisionTree validation MSE = ',MSE)

In [None]:
# Random Forest

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
# Random-forest regressor with default hyperparameters
RF_model = RandomForestRegressor()
# 50-fold cross-validation repeated 50 times (2500 fits), with a fixed seed for the splits
cv = RepeatedKFold(n_repeats=50, n_splits=50, random_state=50)
n_scores = cross_val_score(
    RF_model,
    X_train,
    y_train,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    error_score='raise',
    verbose=1,
)
# Mean and standard deviation of the negated-MAE fold scores
mae_mean, mae_std = mean(n_scores), std(n_scores)
print('MAE: %.3f (%.3f)' % (mae_mean, mae_std))


In [None]:
# Fit on the training split only, so that scoring on X_test measures
# performance on unseen rows (fitting on (X, y) includes the held-out rows).
RF_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# Score the fitted random forest on the held-out split.
predictions = RF_model.predict(X_test)
MSE = mean_squared_error(y_test , predictions)
# Label the output with the model and metric actually used here.
print('RandomForest validation MSE = ',MSE)

In [None]:
# XGBoost

In [None]:
from numpy import mean
from numpy import std
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
# XGBoost regressor with default hyperparameters
XGB_model = XGBRegressor()
# 10-fold cross-validation repeated 10 times (100 fits), with a fixed seed for the splits
cv = RepeatedKFold(n_repeats=10, n_splits=10, random_state=1)
n_scores = cross_val_score(
    XGB_model,
    X_train,
    y_train,
    cv=cv,
    scoring='neg_mean_absolute_error',
    n_jobs=-1,
    error_score='raise',
    verbose=1,
)
# Mean and standard deviation of the negated-MAE fold scores
mae_mean, mae_std = mean(n_scores), std(n_scores)
print('MAE: %.3f (%.3f)' % (mae_mean, mae_std))

In [None]:
# Fit on the training split only, so that scoring on X_test measures
# performance on unseen rows (fitting on (X, y) includes the held-out rows).
XGB_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import mean_squared_error
# Score the fitted XGBoost model on the held-out split.
predictions = XGB_model.predict(X_test)
MSE = mean_squared_error(y_test , predictions)
# Label the output with the model and metric actually used here.
print('XGBoost validation MSE = ',MSE)

In [None]:
# K-nearest-neighbours regressor: try k = 1..19 and record one score per k.
# Each k is scored by 5-fold cross-validation on the training split (mean of the
# regressor's default score, R^2). Scoring each k on X_test instead would choose
# k with the same rows used for the final evaluation, inflating that result.
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
score = []
for k in range(1,20):
    clf = KNeighborsRegressor(n_neighbors = k,  weights = 'distance', p=1)
    score.append(cross_val_score(clf, X_train, y_train, cv=5, n_jobs=-1).mean())

In [None]:
# score[i] belongs to k = i + 1, so shift the position of the best score by one.
best_position = max(range(len(score)), key=score.__getitem__)
k_max = best_position + 1
print( "At K = {}, Max Accuracy = {}".format(k_max, score[best_position]*100))

In [None]:
# Refit KNN with the selected k, then score and predict on the held-out split.
clf = KNeighborsRegressor(p=1, weights='distance', n_neighbors=k_max)
clf.fit(X_train, y_train)
holdout_score = clf.score(X_test, y_test)
print(holdout_score)
y_pred = clf.predict(X_test)

In [None]:
import joblib
# Persist the fitted KNN regressor to disk.
filename = 'KNN_Regressor_MachineHack.sav'
# Dump `clf`, the KNN model. `model` is the loop variable left over from the
# pipeline comparison, which is a different estimator from the one the file name promises.
joblib.dump(clf, filename)

In [None]:
# Load the cleaned test set again and preview its first rows.
test_csv_path = "/kaggle/input/retailcleanfull/Test_clean.csv"
df_test = pd.read_csv(test_csv_path)
df_test.head()

In [None]:
# Drop from the test frame the same columns that were dropped from the training frame.
# `columns=` replaces the positional axis argument (`drop(labels, 1)`), which
# pandas deprecated in 1.3 and removed in 2.0.
data_test = df_test.drop(columns=['AM/PM_PM_sin','AM/PM_PM_cos','InvoiceNo','CustomerID','Time_sin','Time_cos','Year_cos'])
data_test.head()

In [None]:
# Preview the training feature frame.
X.head()

In [None]:
# Predict the power-transformed unit price for the competition test set with
# the decision-tree model (no scaling pipeline is involved here).
# Selecting X.columns gives the model exactly the training features in training
# order, and raises a KeyError early if a training column is missing from the
# test file.
results = DT_model.predict(data_test[X.columns])

In [None]:
# Display the raw predictions (still on the power-transformed scale).
results

In [None]:
# Wrap the predictions in a DataFrame and preview it.
result = pd.DataFrame(results)
result.head()

In [None]:
# Apply the inverse of the fitted power transform so the predictions are
# expressed on the original UnitPrice scale again.
a_inverse_transformed = power.inverse_transform(result)
a_inverse_transformed

In [None]:
# Back-transformed predictions as a DataFrame, rounded to two decimals.
result = pd.DataFrame(a_inverse_transformed).round(decimals=2)
result.head()

In [None]:
# Rename the column labelled 0 to 'UnitPrice'.
result=result.rename(columns={0:'UnitPrice'})

In [None]:
# Preview the renamed prediction frame.
result.head()

In [None]:
# Write the predictions to the submission CSV without the index column.
result.to_csv("my_submission_file.csv",index=False)