In [68]:
import pandas as pd
import numpy as np
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from pylab import rcParams
import seaborn as sb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR


In [69]:
# --- Data location -----------------------------------------------------------
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this folder exists.
input_folder_path = r"D:\Masters\SJSU\Semester 2\256\Project\GIT\aravinda_regression\Walmart--Search-Optimization-and-recommendations"
filename = "/WalmartDatasset.txt"

# --- Module-level accumulators filled in by the helper functions --------------
search_string_counts = dict()
highfreq_searchStrings = list()

# --- Column layouts -----------------------------------------------------------
# Names given to the columns of the tab-separated input file.
header_names = [
    "term", "product_id", "language",
    "product_impressions", "Product_clicks",
    "cart_adds", "cart_start", "checkout", "order",
]
# Subset of the raw columns kept after the language filter.
essential_features = [
    'term', 'product_id',
    'product_impressions', 'Product_clicks',
    'cart_adds', 'checkout', 'order',
]
# Layout of the merged frame (raw counts followed by the derived rates).
columns_all = [
    'searchTerm', 'product_id',
    'product_impressions', 'Product_clicks',
    'cart_adds', 'order',
    'CTR', 'ATR', 'conv',
]
# Layout of the per-model score table.
column_scores = ['searchTerm', 'CTR', 'ATR', 'conv', 'score', 'Model']

models = ['Linear', 'RandomForest']

# --- Load the raw data and create the empty score table -----------------------
df = pd.read_csv(
    input_folder_path + filename,
    sep='\t',
    names=header_names,
)
df_scores = pd.DataFrame(columns=column_scores)

# Not read anywhere in the visible cells; the regression helpers assign their
# own local `location`.
location = 0

In [70]:
def filter_data_by_language_and_features(language,dataframe,feature_list):
    """Select the rows of one language and project them onto chosen columns.

    Parameters
    ----------
    language : value compared for equality against the ``language`` column.
    dataframe : frame that contains a ``language`` column.
    feature_list : column labels to keep in the result.

    Returns
    -------
    The matching rows of ``dataframe`` restricted to ``feature_list``.
    """
    language_mask = dataframe['language'] == language
    rows_in_language = dataframe[language_mask]
    return rows_in_language[feature_list]
    

In [71]:
def compute_ATR_CTR_Conv(dataframe):
    """Derive the three per-row rates from the raw counters.

    Each rate is a counter column divided by ``product_impressions``:
    ATR uses ``cart_adds``, CTR uses ``Product_clicks`` and Conv uses
    ``order``.

    Returns
    -------
    tuple
        ``(ATR, CTR, Conv)`` -- note the order, ATR comes first.
    """
    print('Computing ATR, CTR and Conv...')
    impressions = dataframe['product_impressions']
    add_to_cart_rate = dataframe['cart_adds'] / impressions
    click_through_rate = dataframe['Product_clicks'] / impressions
    conversion_rate = dataframe['order'] / impressions
    return add_to_cart_rate, click_through_rate, conversion_rate
    

In [72]:
def merge_dataframe_with_computed_values(dataframe_english,columns):
    """Attach ``searchTerm``, ``ATR``, ``CTR`` and ``conv`` and keep ``columns``.

    Parameters
    ----------
    dataframe_english : frame holding ``term`` plus the counter columns that
        ``compute_ATR_CTR_Conv`` reads.
    columns : iterable of column labels to keep, in the order wanted.

    Returns
    -------
    A new frame with a fresh 0..n-1 index containing exactly ``columns``.
    The frame passed in is left untouched.
    """
    # Work on a copy: the argument is typically a filtered slice of a larger
    # frame, and adding columns to it in place both mutates the caller's data
    # and triggers pandas' SettingWithCopyWarning.
    working = dataframe_english.copy()
    working['searchTerm'] = working['term']
    working['ATR'], working['CTR'], working['conv'] = compute_ATR_CTR_Conv(dataframe = working)
    print('Merging Dataframe with ATR, CTR and Conv..')
    # drop=True discards the old index directly instead of materialising it as
    # a column and selecting it away again.
    final_merged_dataset = working[list(columns)].reset_index(drop=True)
    print('Merging Completed')
    # info() prints its report itself and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" line).
    final_merged_dataset.info()
    return final_merged_dataset

In [73]:
def replace_NAN_with_zero(dataframe):
    """Return a copy of ``dataframe`` in which every NaN has become 0.

    The status message is printed before the replacement is performed.
    """
    print('Replaced NAN with zeros..')
    zero_filled = dataframe.replace(to_replace=np.nan, value=0)
    return zero_filled

In [74]:
def remove_duplicates(dataframe):
    """Return ``dataframe`` without repeated rows (first occurrence is kept).

    The status message is printed before the rows are dropped.
    """
    print('Duplicates are Dropped..')
    unique_rows = dataframe.drop_duplicates(keep='first')
    return unique_rows

In [75]:
def calculate_search_string_occurrences(dataframe):
    """Tally how often each value of ``dataframe['searchTerm']`` occurs.

    The tallies are added to the module-level ``search_string_counts`` dict;
    counts already present there are incremented, not reset. Returns nothing.
    """
    for term in dataframe['searchTerm']:
        search_string_counts[term] = search_string_counts.get(term, 0) + 1

In [76]:
def compute_high_frequency_search_strings(frequency):
    """Collect the search strings counted strictly more than ``frequency`` times.

    Reads the module-level ``search_string_counts`` dict and appends every
    qualifying key to the module-level ``highfreq_searchStrings`` list
    (existing entries are kept). Prints progress, the resulting list length
    and the list itself. Returns nothing.
    """
    print('Computing number of search strings with occurrences greater than ',frequency)
    highfreq_searchStrings.extend(
        term
        for term, occurrences in search_string_counts.items()
        if occurrences > frequency
    )
    print('Total number of high frequency search strings are ',len(highfreq_searchStrings))
    print(highfreq_searchStrings)

In [77]:
def compute_ATR_CTR_metrics(dataframe,search_string):
    """Summarise the ATR, CTR and conv columns of ``dataframe``.

    Prints the ATR and CTR mean / standard deviation, labelled with
    ``search_string`` (the conv statistics are computed but not printed).

    Returns
    -------
    tuple
        ``(ATR mean, ATR std, CTR mean, CTR std, conv mean, conv std)``.
    """
    rates = dataframe[['ATR','CTR','conv']]
    deviations = rates.std()
    averages = rates.mean()

    atr_mean, atr_std = averages['ATR'], deviations['ATR']
    ctr_mean, ctr_std = averages['CTR'], deviations['CTR']
    conv_mean, conv_std = averages['conv'], deviations['conv']

    print('ATR mean for search string',search_string,'is',atr_mean)
    print('ATR standard deviation for search string',search_string,'is',atr_std)
    print('CTR mean for search string',search_string,'is',ctr_mean)
    print('CTR standard deviation for search string',search_string,'is',ctr_std)
    return atr_mean,atr_std,ctr_mean,ctr_std,conv_mean,conv_std

In [78]:
def filter_outliers(dfsearch,ATR_mean,ATR_std,CTR_mean,CTR_std,CONV_mean,CONV_std):
    """Drop rows whose ATR, CTR or conv value lies outside its accepted band.

    For each of the three columns a row is kept only when its value is within
    ``[mean - 1*std, mean + 2*std]`` (the band is deliberately kept
    asymmetric, as in the original thresholds).

    Parameters
    ----------
    dfsearch : frame with ``ATR``, ``CTR`` and ``conv`` columns.
    ATR_mean, ATR_std, CTR_mean, CTR_std, CONV_mean, CONV_std : the mean and
        standard deviation that define each column's band.

    Returns
    -------
    The rows of ``dfsearch`` that fall inside all three bands.
    """
    def _within_band(values, mean, std):
        # Inclusive on both ends: at most 2 std above, at most 1 std below.
        return (values <= (mean + 2 * std)) & (values >= (mean - 1 * std))

    # Build one mask on the unfiltered frame and apply it once. Applying masks
    # computed from `dfsearch` to an already-filtered frame makes pandas
    # reindex the mask and warn. The conv band uses the CONV_mean / CONV_std
    # arguments rather than same-named module globals.
    keep = (
        _within_band(dfsearch['ATR'], ATR_mean, ATR_std)
        & _within_band(dfsearch['CTR'], CTR_mean, CTR_std)
        & _within_band(dfsearch['conv'], CONV_mean, CONV_std)
    )
    return dfsearch[keep]

In [79]:
def plot_3d(dataframe):
    """Draw exploratory plots of the CTR, ATR and conv columns of ``dataframe``.

    In order: a seaborn pair plot, a pandas scatter plot of ATR against conv,
    and a matplotlib 3-D scatter with CTR / ATR / conv on the x / y / z axes.
    Ends with ``plt.show()`` and returns nothing.
    """
    # Assigns into the shared rcParams mapping imported from pylab, so the
    # figure size is not restored when this function returns.
    rcParams['figure.figsize'] = 5,5
    sb.set_style('whitegrid')
    #plt.hist(dfsearch['CTR'])
    #plt.plot()
    # NOTE(review): both `vars` and `y_vars` are passed; confirm against the
    # seaborn docs which of the two takes effect.
    sb.pairplot(dataframe,vars=['CTR','ATR','conv'], y_vars=['conv'])
    dataframe.plot(kind='scatter',x='ATR',y='conv',c=['darkgray'],s=150)
    # New figure for the 3-D view: CTR on x, ATR on y, conv on z.
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dataframe['CTR'], dataframe['ATR'], dataframe['conv'], c='r', marker='o')
    ax.set_xlabel('CTR')
    ax.set_ylabel('ATR')
    ax.set_zlabel('CONV')
    plt.show()


In [80]:
def test_train_split(dataframe,attr1,attr2,testSize):
    return train_test_split(dataframe[[attr1]],dataframe[[attr2]],test_size=int(testSize))

In [81]:
def perform_Regression_Analysis(CTR,ATR,CONV,dataframe):
    """Invoke ``Regression`` for the requested model kinds.

    Calls ``Regression('MultipleLinear')`` when ``CTR & ATR & CONV`` is
    truthy, then calls ``Regression(name)`` for every entry of the
    module-level ``models`` list. Returns nothing.

    NOTE(review): no ``Regression`` callable is defined in the visible cells,
    so calling this function is expected to raise NameError unless it is
    defined elsewhere -- confirm. The ``dataframe`` argument is not used.
    """
    # `&` is the bitwise operator; it acts as a logical AND only for bool
    # arguments (presumably what callers pass -- verify).
    if(CTR & ATR & CONV):
        Regression('MultipleLinear')
    for i in models:
        Regression(i)
    

In [82]:
def Linear_Regression(dataframe,attribute1,attribute2,testSize,string):
    """Fit a one-feature linear regression and record its test score.

    The (feature, target) pair is chosen from the two attribute names:

    * neither is ``'CTR'``            -> ATR predicts conv
    * otherwise neither is ``'ATR'``  -> CTR predicts conv
    * both ``'CTR'`` and ``'ATR'``    -> CTR predicts ATR

    Parameters
    ----------
    dataframe : frame with ``CTR``, ``ATR`` and ``conv`` columns.
    attribute1, attribute2 : names used only to pick the pair above.
    testSize : forwarded to ``train_test_split`` as ``test_size``.
    string : search term stored in the first cell of the score row.

    Side effects
    ------------
    Prints the score on the held-out part and appends the row
    ``[string, uses CTR, uses ATR, uses conv, score, 'LinearRegression']``
    to the module-level ``df_scores`` frame under label ``len(df_scores) + 1``.
    Returns nothing.
    """
    requested = {str(attribute1), str(attribute2)}
    if 'CTR' not in requested:
        feature, target = 'ATR', 'conv'
    elif 'ATR' not in requested:
        feature, target = 'CTR', 'conv'
    else:
        feature, target = 'CTR', 'ATR'

    # Honour the caller's testSize; it used to be ignored in favour of a
    # hard-coded 0.3.
    X_train, X_test, y_train, y_test = train_test_split(
        dataframe[[feature]], dataframe[[target]], test_size=testSize
    )
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)

    # Score once and reuse the value for both the printout and the row.
    score = regressor.score(X_test, y_test)
    print(score)

    involved = {feature, target}
    row = [string, 'CTR' in involved, 'ATR' in involved, 'conv' in involved, score, 'LinearRegression']
    df_scores.loc[len(df_scores) + 1] = row
    
    
                                                                                                         

In [83]:
def Multiple_Linear_Regression(dataframe,testSize,string):
    """Fit conv ~ CTR + ATR with linear regression and record the test score.

    Parameters
    ----------
    dataframe : frame with ``CTR``, ``ATR`` and ``conv`` columns.
    testSize : forwarded to ``train_test_split`` as ``test_size``.
    string : search term stored in the first cell of the score row.

    Side effects
    ------------
    Appends ``[string, True, True, True, score, 'MultipleLinearRegression']``
    to the module-level ``df_scores`` frame under label
    ``len(df_scores) + 1``. Returns nothing.
    """
    # Select by label: the positional slices 6:-1 and 8 only pointed at
    # CTR/ATR and conv for one exact 9-column layout.
    x = dataframe[['CTR', 'ATR']].values
    y = dataframe['conv'].values
    # The split is seeded (random_state=0) and now honours testSize instead
    # of a hard-coded 0.2.
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=testSize, random_state=0)
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    row = [string, True, True, True, score, 'MultipleLinearRegression']
    df_scores.loc[len(df_scores) + 1] = row
    

In [84]:
def Support_Vector_Regression(dataframe,testSize,string):
    """Fit an RBF-kernel SVR of conv on (CTR, ATR) and record the test score.

    Features and target are standardised with separate ``StandardScaler``
    instances before the train/test split.

    Parameters
    ----------
    dataframe : frame with ``CTR``, ``ATR`` and ``conv`` columns.
    testSize : forwarded to ``train_test_split`` as ``test_size``.
    string : search term stored in the first cell of the score row.

    Side effects
    ------------
    Appends ``[string, True, True, True, score, 'SupportVectorRegression']``
    to the module-level ``df_scores`` frame under label
    ``len(df_scores) + 1``. Returns nothing.
    """
    # Select by label: the positional slices 6:-1 and 8 only pointed at
    # CTR/ATR and conv for one exact 9-column layout.
    X = dataframe[['CTR', 'ATR']].values
    y = dataframe['conv'].values
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    # NOTE(review): both scalers are fitted on all rows before the split, so
    # the held-out rows influence the scaling; consider fitting on the
    # training part only.
    X = sc_X.fit_transform(X)
    # StandardScaler works on 2-D input: scale the target as a single column,
    # then flatten it back to the 1-D vector the regressor is fitted on.
    y = sc_y.fit_transform(y.reshape(-1, 1)).ravel()
    # The split is seeded (random_state=0) and now honours testSize instead
    # of a hard-coded 0.2.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=testSize, random_state=0)
    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    score = regressor.score(X_test, y_test)
    row = [string, True, True, True, score, 'SupportVectorRegression']
    df_scores.loc[len(df_scores) + 1] = row
    

In [85]:
# Keep the English rows and the columns needed downstream.
df_english = filter_data_by_language_and_features(language = 'English',dataframe = df,feature_list = essential_features)
# Add searchTerm / ATR / CTR / conv and reduce to the columns_all layout.
final_merged_dataset = merge_dataframe_with_computed_values (dataframe_english = df_english,columns =columns_all)
final_merged_dataset = replace_NAN_with_zero(final_merged_dataset)
final_merged_dataset = remove_duplicates(final_merged_dataset)
# info() prints its own report and returns None; wrapping it in print() only
# added a stray "None" line to the output.
final_merged_dataset.info()

# Only rows with a positive conv value take part in the evaluation.
df_for_evaluation = final_merged_dataset[final_merged_dataset['conv'] >0]
calculate_search_string_occurrences(df_for_evaluation)
print('Total number of search strings are', len(search_string_counts))
# Search strings seen more than 50 times count as "high frequency".
compute_high_frequency_search_strings(frequency = 50)  

Computing ATR, CTR and Conv...
Merging Dataframe with ATR, CTR and Conv..
Merging Completed
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330548 entries, 0 to 1330547
Data columns (total 9 columns):
searchTerm             1329176 non-null object
product_id             1330548 non-null int64
product_impressions    1330548 non-null int64
Product_clicks         1330548 non-null int64
cart_adds              1330548 non-null int64
order                  1330548 non-null int64
CTR                    1330548 non-null float64
ATR                    1330548 non-null float64
conv                   1330548 non-null float64
dtypes: float64(3), int64(5), object(1)
memory usage: 91.4+ MB
None
Replaced NAN with zeros..
Duplicates are Dropped..
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1330548 entries, 0 to 1330547
Data columns (total 9 columns):
searchTerm             1330548 non-null object
product_id             1330548 non-null int64
product_impressions    1330548 non-null int64
Produ

In [86]:
# Discard every high-frequency search string that contains a digit.
highfreq_searchStrings = [
    candidate
    for candidate in highfreq_searchStrings
    if all(not ch.isdigit() for ch in str(candidate))
]

# For each remaining term: summarise its rows, filter outliers, then score
# every regression variant on the filtered rows.
for raw_term in highfreq_searchStrings:
    term = str(raw_term)
    df_search = df_for_evaluation[df_for_evaluation['searchTerm'] == term]
    # These six names are module-level; keep them exactly as they are because
    # code outside this cell may read them.
    ATRmean,ATRstd,CTRmean,CTRstd,CONVmean,CONVstd = compute_ATR_CTR_metrics(
        dataframe = df_search,
        search_string = term,
    )
    dfsearchWoOutliers = filter_outliers(
        dfsearch = df_search,
        ATR_mean = ATRmean, ATR_std = ATRstd,
        CTR_mean = CTRmean, CTR_std = CTRstd,
        CONV_mean = CONVmean, CONV_std = CONVstd,
    )
    # plot_3d(dfsearchWoOutliers) can be enabled here for a visual check.
    for feature in ('CTR', 'ATR'):
        Linear_Regression(dataframe = dfsearchWoOutliers,attribute1 = feature,attribute2 = 'conv',testSize = 0.2,string = term)
    Multiple_Linear_Regression(dataframe = dfsearchWoOutliers,testSize = 0.2,string = term)
    Support_Vector_Regression(dataframe = dfsearchWoOutliers,testSize = 0.2,string = term)
    
    
    
    

ATR mean for search string crackers is 0.0286980991334
ATR standard deviation for search string crackers is 0.0379017715625
CTR mean for search string crackers is 0.0548879343713
CTR standard deviation for search string crackers is 0.0960281065826
-0.390299209552
0.0695974488648
ATR mean for search string baby food is 0.0474781570204
ATR standard deviation for search string baby food is 0.0654319433583
CTR mean for search string baby food is 0.0396878497539
CTR standard deviation for search string baby food is 0.0357953063185
0.630139041651
0.433951202242
ATR mean for search string cat food is 0.11742467563
ATR standard deviation for search string cat food is 0.329335850633
CTR mean for search string cat food is 0.0377344943598
CTR standard deviation for search string cat food is 0.0755841345146
0.0269116703701
0.089404633915
ATR mean for search string chips is 0.0281323594411
ATR standard deviation for search string 

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


chips is 0.0471042310895
CTR mean for search string chips is 0.0341549839502
CTR standard deviation for search string chips is 0.0308967803443
-7.5496737368
-9.53848805834
ATR mean for search string gluten free is 0.0331128037838
ATR standard deviation for search string gluten free is 0.0408392597053
CTR mean for search string gluten free is 0.0537090404942
CTR standard deviation for search string gluten free is 0.0554631072552
0.496614954916
0.516919010701
ATR mean for search string chocolate is 0.0301157473582
ATR standard deviation for search string chocolate is 0.0719330822157
CTR mean for search string chocolate is 0.0317547614108
CTR standard deviation for search string chocolate is 0.0424988438685
-0.864818334879
-3.31837032823
ATR mean for search string camping is 0.00319855555315
ATR standard deviation for search string camping is 0.00466198164097
CTR mean for search string camping is 0.0112879470414
CTR standard deviation for search string camping 



is 0.0153765561468
0.464071339602
-0.0167170290311
ATR mean for search string great value is 0.0934193879932
ATR standard deviation for search string great value is 0.120393421806
CTR mean for search string great value is 0.109826145996
CTR standard deviation for search string great value is 0.143904231287
-3.46774727633
0.5704655209
ATR mean for search string candy is 0.0168401056233
ATR standard deviation for search string candy is 0.0173565183389
CTR mean for search string candy is 0.0230518935515
CTR standard deviation for search string candy is 0.0243518044355
-0.216043244243
-0.143473758395
ATR mean for search string cookies is 0.0515233077305
ATR standard deviation for search string cookies is 0.127809739231
CTR mean for search string cookies is 0.0416100526983
CTR standard deviation for search string cookies is 0.0508054178516
-0.433068670711
-7.07606081266
ATR mean for search string shampoo is 0.117582769109
ATR standard deviation for search string shampoo is 0.734086035253
CT



is 0.197197111955
CTR mean for search string coffee is 0.0431565805931
CTR standard deviation for search string coffee is 0.133147245606
-0.150646069308
0.933881598212
ATR mean for search string barbie is 0.0227473013822
ATR standard deviation for search string barbie is 0.0922019086794
CTR mean for search string barbie is 0.0326837455329
CTR standard deviation for search string barbie is 0.0920449632576
0.809958915737
-14.8195997431
ATR mean for search string organic is 0.0842322379899
ATR standard deviation for search string organic is 0.187784667461
CTR mean for search string organic is 0.0692481883962
CTR standard deviation for search string organic is 0.0730984790428
0.140309210668
-21.6103143745
ATR mean for search string food is 0.300814740823
ATR standard deviation for search string food is 0.659010879093
CTR mean for search string food is 0.196106683418
CTR standard deviation for search string food is 0.309958037668
0.867552784645
0.775826705441
ATR mean for search string scho



In [None]:
def sample(a,b,c):
    """Print 'true' when ``a & b & c`` evaluates to a truthy value."""
    combined = a & b & c
    if not combined:
        return
    print('true')

# Scratch check: all three flags set, so this prints 'true'.
sample(a = True,b = True, c = True)

In [None]:
# Scratch cell: rebinds the module-level `column_scores` and `df_scores`.
# Note the fifth column is 'Accuracy' here, not 'score'.
column_scores = [
    'searchTerm',
    'CTR', 'ATR', 'conv',
    'Accuracy',
    'Model',
]
df_scores = pd.DataFrame(columns=column_scores)
# Assigning a scalar to a column of a frame without rows adds no rows.
df_scores['searchTerm'] = 'cat'

In [None]:
# NOTE(review): DataFrame.add performs element-wise addition and returns a new
# frame; it does not append a row and leaves df_scores unchanged. The list has
# five values -- presumably a row append via .loc was intended; confirm.
df_scores.add(['dog',True,True,False,0.85])

In [None]:
# Display the current contents of the score table.
df_scores

In [89]:
# Show only the runs whose recorded score is greater than 0.5.
score_above_half = df_scores['score'] > 0.5
df_scores[score_above_half]

Unnamed: 0,searchTerm,CTR,ATR,conv,score,Model
5,baby food,True,False,True,0.630139,LinearRegression
8,baby food,True,True,True,0.587663,SupportVectorRegression
12,cat food,True,True,True,0.693778,SupportVectorRegression
18,gluten free,False,True,True,0.516919,LinearRegression
30,great value,False,True,True,0.570466,LinearRegression
31,great value,True,True,True,0.58257,MultipleLinearRegression
32,great value,True,True,True,0.759431,SupportVectorRegression
39,cookies,True,True,True,0.689207,MultipleLinearRegression
40,cookies,True,True,True,0.975649,SupportVectorRegression
46,coffee,False,True,True,0.933882,LinearRegression
