In [1]:
import pandas as pd
import numpy as np
import imblearn
import scipy.stats as stats
from datetime import datetime 
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_score

from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline


imblearn.__version__

'0.10.1'

In [39]:
df = pd.read_csv("merged-normalized.csv")
df = df.drop(["Date"],axis=1)
df["Time"]=pd.to_datetime(df["Time"], format='%H:%M:%S').dt.hour
#df["Time"] = stats.zscore(df["Time"])
df.head()

Unnamed: 0,Time,Race,Gender,Body_Size,With_Kids,Kids_Category,Basket_Size,Basket_colour,Attire,Shirt_Colour,...,buyDrinks,TotalSpent_RM,latitude,longitude,Num_of_Baskets,tempmax,tempmin,humidity,description,icon
0,20,malay,male,moderate,yes,young,big,red,casual,blue,...,-0.306655,-1.175289,-1.974354,0.219731,-1.24844,-0.804993,0.954235,0.51219,Cloudy skies throughout the day.,cloudy
1,20,,male,thin,no,no_kids,big,,casual,white,...,0.873818,1.620044,-1.844641,0.072282,1.25697,-0.804993,0.954235,0.51219,Cloudy skies throughout the day.,cloudy
2,20,malay,female,moderate,no,no_kids,big,blue,casual,red,...,-0.306655,-0.243511,-1.678802,1.008022,-1.24844,-0.804993,0.954235,0.51219,Cloudy skies throughout the day.,cloudy
3,21,indian,male,thin,no,no_kids,,black,casual,black,...,-0.306655,0.455322,-1.495611,0.011978,-1.24844,-0.804993,0.954235,0.51219,Cloudy skies throughout the day.,cloudy
4,21,indian,male,,no,no_kids,big,blue,casual,blue,...,-0.896892,-0.243511,-2.086209,0.252582,1.25697,-0.804993,0.954235,0.51219,Cloudy skies throughout the day.,cloudy


In [43]:
#Label encode
label_encoder = preprocessing.LabelEncoder()
for col in df:
    if df[col].dtype == "object":
        df[col]=label_encoder.fit_transform(df[col])
df.head()

Unnamed: 0,Time,Race,Gender,Body_Size,With_Kids,Kids_Category,Basket_Size,Basket_colour,Attire,Shirt_Colour,...,buyDrinks,TotalSpent_RM,latitude,longitude,Num_of_Baskets,tempmax,tempmin,humidity,description,icon
0,20,3,1,1,1,4,0,8,0,2,...,-0.306655,-1.175289,-1.974354,0.219731,-1.24844,-0.804993,0.954235,0.51219,3,0
1,20,4,1,2,0,1,0,11,0,10,...,0.873818,1.620044,-1.844641,0.072282,1.25697,-0.804993,0.954235,0.51219,3,0
2,20,3,0,1,0,1,0,1,0,9,...,-0.306655,-0.243511,-1.678802,1.008022,-1.24844,-0.804993,0.954235,0.51219,3,0
3,21,2,1,2,0,1,2,0,0,0,...,-0.306655,0.455322,-1.495611,0.011978,-1.24844,-0.804993,0.954235,0.51219,3,0
4,21,2,1,3,0,1,0,1,0,2,...,-0.896892,-0.243511,-2.086209,0.252582,1.25697,-0.804993,0.954235,0.51219,3,0


In [118]:
X = df.drop(["TotalSpent_RM"],axis=1)
y = df["TotalSpent_RM"]
colnames = X.columns

In [152]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

In [163]:
lr = LinearRegression()
rfe = RFECV(lr, min_features_to_select=5, cv=3)

In [164]:
rfe.fit(X,y)

In [165]:
def ranking(ranks, names, order=1):
    minmax = MinMaxScaler() # everything will be between 0 and 1
    ranks = minmax.fit_transform(order*np.array([ranks]).T).T[0]
    ranks = map(lambda x: round(x,2), ranks)
    return dict(zip(names, ranks))

In [166]:
rfe_score = ranking(list(map(float, rfe.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
rfe_score

Unnamed: 0,Features,Score
14,Spectacles,1.0
12,pants_type,1.0
2,Gender,1.0
22,Num_of_Baskets,1.0
5,Kids_Category,1.0
15,Age_Range,1.0
1,Race,1.0
10,shirt_type,0.95
20,latitude,0.9
19,buyDrinks,0.86


In [167]:
for i in range(X.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe.support_[i], rfe.ranking_[i]))


Column: 0, Selected False, Rank: 17.000
Column: 1, Selected True, Rank: 1.000
Column: 2, Selected True, Rank: 1.000
Column: 3, Selected False, Rank: 10.000
Column: 4, Selected False, Rank: 20.000
Column: 5, Selected True, Rank: 1.000
Column: 6, Selected False, Rank: 14.000
Column: 7, Selected False, Rank: 9.000
Column: 8, Selected False, Rank: 16.000
Column: 9, Selected False, Rank: 21.000
Column: 10, Selected False, Rank: 2.000
Column: 11, Selected False, Rank: 13.000
Column: 12, Selected True, Rank: 1.000
Column: 13, Selected False, Rank: 5.000
Column: 14, Selected True, Rank: 1.000
Column: 15, Selected True, Rank: 1.000
Column: 16, Selected False, Rank: 15.000
Column: 17, Selected False, Rank: 8.000
Column: 18, Selected False, Rank: 12.000
Column: 19, Selected False, Rank: 4.000
Column: 20, Selected False, Rank: 3.000
Column: 21, Selected False, Rank: 11.000
Column: 22, Selected True, Rank: 1.000
Column: 23, Selected False, Rank: 22.000
Column: 24, Selected False, Rank: 18.000
Colum

In [151]:
from sklearn.model_selection import ShuffleSplit

model = LinearRegression()
pipeline = Pipeline(steps=[('s',rfe),('m',model)])
# evaluate model
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
n_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error')
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: -0.999 (0.019)


In [168]:
lasso = Lasso()
rfe2 = RFECV(lasso, cv=3, min_features_to_select=5)

In [169]:
rfe2.fit(X,y)

In [170]:
rfe_score = ranking(list(map(float, rfe2.ranking_)), colnames, order=-1)
rfe_score = pd.DataFrame(list(rfe_score.items()), columns=['Features', 'Score'])
rfe_score = rfe_score.sort_values("Score", ascending = False)
rfe_score

Unnamed: 0,Features,Score
27,icon,1.0
26,description,1.0
25,humidity,1.0
24,tempmin,1.0
23,tempmax,1.0
22,Num_of_Baskets,0.96
21,longitude,0.91
20,latitude,0.87
19,buyDrinks,0.83
18,TimeSpent_minutes,0.78


In [171]:
for i in range(X.shape[1]):
 print('Column: %d, Selected %s, Rank: %.3f' % (i, rfe2.support_[i], rfe2.ranking_[i]))


Column: 0, Selected False, Rank: 24.000
Column: 1, Selected False, Rank: 23.000
Column: 2, Selected False, Rank: 22.000
Column: 3, Selected False, Rank: 21.000
Column: 4, Selected False, Rank: 20.000
Column: 5, Selected False, Rank: 19.000
Column: 6, Selected False, Rank: 18.000
Column: 7, Selected False, Rank: 17.000
Column: 8, Selected False, Rank: 16.000
Column: 9, Selected False, Rank: 15.000
Column: 10, Selected False, Rank: 14.000
Column: 11, Selected False, Rank: 13.000
Column: 12, Selected False, Rank: 12.000
Column: 13, Selected False, Rank: 11.000
Column: 14, Selected False, Rank: 10.000
Column: 15, Selected False, Rank: 9.000
Column: 16, Selected False, Rank: 8.000
Column: 17, Selected False, Rank: 7.000
Column: 18, Selected False, Rank: 6.000
Column: 19, Selected False, Rank: 5.000
Column: 20, Selected False, Rank: 4.000
Column: 21, Selected False, Rank: 3.000
Column: 22, Selected False, Rank: 2.000
Column: 23, Selected True, Rank: 1.000
Column: 24, Selected True, Rank: 1.0

In [144]:
model2 = Lasso()
pipeline = Pipeline(steps=[('s',rfe2),('m',model2)])
# evaluate model
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
n_scores = cross_val_score(pipeline, X, y, cv=cv, scoring='neg_mean_squared_error',n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (np.mean(n_scores), np.std(n_scores)))

Accuracy: -0.998 (0.020)
