In [1]:
import pandas as pd
import numpy as np
import pickle
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
from pandas.tools.plotting import scatter_matrix
from pylab import rcParams
import seaborn as sb
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.svm import SVR
from sklearn.externals import joblib
import datetime

In [2]:
input_folder_path= r"D:\Masters\SJSU\Semester 2\256\Project\GIT\aravinda_regression\Walmart--Search-Optimization-and-recommendations"
filename = "/WalmartDatasset.txt"

search_string_counts = {}
highfreq_searchStrings = []
model_map = {}
search_string_model_map = {}


header_names=["term","product_id", "language","product_impressions","Product_clicks","cart_adds","cart_start","checkout","order"]
essential_features = ['term', 'product_id', 'product_impressions', 'Product_clicks', 'cart_adds', 'checkout', 'order']
columns_all = ['searchTerm', 'product_id', 'product_impressions', 'Product_clicks', 'cart_adds', 'order', 'CTR', 'ATR','conv']
column_scores = ['searchTerm','CTR','ATR','conv','score','ModelName','kfold']#'Model','Xtest','Ytest']

models = ['Linear','RandomForest']

df = pd.read_csv(input_folder_path+filename, sep='\t', names = header_names)
df_scores = pd.DataFrame(columns=column_scores)

location = 0

In [3]:
def filter_data_by_language_and_features(language,dataframe,feature_list):
    dataframe = dataframe[dataframe['language'] == language]
    return (dataframe[feature_list])
    

In [4]:
def compute_ATR_CTR_Conv(dataframe):
    print('Computing ATR, CTR and Conv...')
    ATR = dataframe['cart_adds'] / dataframe['product_impressions']
    CTR = dataframe['Product_clicks'] / dataframe['product_impressions']
    Conv = dataframe['order'] / dataframe['product_impressions']
    return ATR,CTR,Conv
    

In [5]:
def merge_dataframe_with_computed_values(dataframe_english,columns):
    dataframe_english['searchTerm'] = dataframe_english['term']
    dataframe_english['ATR'],dataframe_english['CTR'],dataframe_english['conv'] = compute_ATR_CTR_Conv(dataframe = dataframe_english)
    print('Merging Dataframe with ATR, CTR and Conv..')
    dataframe_english = dataframe_english[list(columns)].reset_index()
    final_merged_dataset = dataframe_english[list(columns)]
    print('Merging Completed')
    print(final_merged_dataset.info())
    return final_merged_dataset

In [6]:
def replace_NAN_with_zero(dataframe):
    print('Replaced NAN with zeros..')
    return dataframe.replace(np.nan, 0)

In [7]:
def remove_duplicates(dataframe):
    print('Duplicates are Dropped..')
    return dataframe.drop_duplicates()

In [8]:
def calculate_search_string_occurrences(dataframe):
    for i in dataframe['searchTerm']:
        if i in search_string_counts:
            search_string_counts[i]+=1
        else:
            search_string_counts[i]=1

In [9]:
def compute_high_frequency_search_strings(frequency):
    print('Computing number of search strings with occurrences greater than ',frequency)
    for j in search_string_counts:
        if search_string_counts[j] > frequency:
            highfreq_searchStrings.append(j)
    print('Total number of high frequency search strings are ',len(highfreq_searchStrings))
    print(highfreq_searchStrings)

In [10]:
def compute_ATR_CTR_metrics(dataframe,search_string):
    ATRstd,CTRstd,CONVstd = dataframe[['ATR','CTR','conv']].std()
    ATRmean,CTRmean,CONVmean = dataframe[['ATR','CTR','conv']].mean()
    print('ATR mean for search string',search_string,'is',ATRmean)
    print('ATR standard deviation for search string',search_string,'is',ATRstd)
    print('CTR mean for search string',search_string,'is',CTRmean)
    print('CTR standard deviation for search string',search_string,'is',CTRstd)
    return ATRmean,ATRstd,CTRmean,CTRstd,CONVmean,CONVstd

In [11]:
def filter_outliers(dfsearch,ATR_mean,ATR_std,CTR_mean,CTR_std,CONV_mean,CONV_std):
    dfsearchWoOutliers = dfsearch[((dfsearch['ATR']<=(ATR_mean+2*ATR_std))& (dfsearch['ATR']>=(ATR_mean-1*ATR_std)))]
    dfsearchWoOutliers = dfsearchWoOutliers[((dfsearch['CTR']<=(CTR_mean+2*CTR_std))& (dfsearch['CTR']>=(CTR_mean-1*CTR_std)))]
    dfsearchWoOutliers = dfsearchWoOutliers[((dfsearch['conv']<=(CONVmean+2*CONVstd))& (dfsearch['conv']>=(CONVmean-1*CONVstd)))]
    return dfsearchWoOutliers

In [12]:
def plot_3d(dataframe):
    rcParams['figure.figsize'] = 5,5
    sb.set_style('whitegrid')
    #plt.hist(dfsearch['CTR'])
    #plt.plot()
    sb.pairplot(dataframe,vars=['CTR','ATR','conv'], y_vars=['conv'])
    dataframe.plot(kind='scatter',x='ATR',y='conv',c=['darkgray'],s=150)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(dataframe['CTR'], dataframe['ATR'], dataframe['conv'], c='r', marker='o')
    ax.set_xlabel('CTR')
    ax.set_ylabel('ATR')
    ax.set_zlabel('CONV')
    plt.show()


In [13]:
def test_train_split(dataframe,attr1,attr2,testSize):
    return train_test_split(dataframe[[attr1]],dataframe[[attr2]],test_size=int(testSize))

In [14]:
def perform_Regression_Analysis(CTR,ATR,CONV,dataframe):
    if(CTR & ATR & CONV):
        Regression('MultipleLinear')
    for i in models:
        Regression(i)
    

In [15]:
def Linear_Regression(dataframe,attribute1,attribute2,testSize,string):
    regressor = LinearRegression()
    
    if 'CTR' not in [str(attribute1),str(attribute2)]:
        X_train, X_test, y_train, y_test = train_test_split(dataframe[['ATR']],dataframe[['conv']],test_size=0.2)
        x,y = dataframe[['ATR']],dataframe[['conv']]
        regressor.fit(x, y)
       # print('dataframe[searchTerm]',dataframe['searchTerm'])
        print(regressor.score(X_test, y_test))
        scores = cross_val_score(regressor, x, y, cv=10)
        row = [string,False,True,True,regressor.score(X_test, y_test),'Linear_Regression',scores.mean()]#regressor,X_test,y_test]
        
    elif 'ATR' not in [str(attribute1),str(attribute2)]:
        X_train, X_test, y_train, y_test = train_test_split(dataframe[['CTR']],dataframe[['conv']],test_size=0.2)
        x,y = dataframe[['CTR']],dataframe[['conv']]
        regressor.fit(x, y)
      #  print('dataframe[searchTerm]',dataframe['searchTerm'])
        print(regressor.score(X_test, y_test))
        scores = cross_val_score(regressor, x, y, cv=10)
        row = [string,True,False,True,regressor.score(X_test, y_test),'Linear_Regression',scores.mean()]#regressor,X_test,y_test]
        
    else:
        X_train, X_test, y_train, y_test = train_test_split(dataframe[['CTR']],dataframe[['ATR']],test_size=0.2)
        x,y = dataframe[['CTR']],dataframe[['ATR']]
        regressor.fit(x, y)
        print('dataframe[searchTerm]',dataframe['searchTerm'])
        print(regressor.score(X_test, y_test))
        scores = cross_val_score(regressor, x, y, cv=10)
        row = [string,True,True,False,regressor.score(X_test, y_test),'Linear_Regression',scores.mean()]#regressor,X_test,y_test]
        
    location = len(df_scores) +1
    df_scores.loc[location] =row
    search_string_model_map['Linear_Regression'] = regressor
    
    
                                                                                                         

In [16]:
def Multiple_Linear_Regression(dataframe,testSize,string):
    x = dataframe.iloc[:, 6:-1].values
    y = dataframe.iloc[:, 8].values
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    regressor = LinearRegression()
    regressor.fit(X_train, y_train)
    scores = cross_val_score(regressor, x, y, cv=10)
    row = [string,True,True,True,regressor.score(X_test, y_test),'Multiple_Linear_Regression',scores.mean()]#regressor,X_test,y_test]   
    location = len(df_scores) +1
    df_scores.loc[location] =row
    search_string_model_map['Multiple_Linear_Regression'] = regressor
    

In [17]:
def Support_Vector_Regression(dataframe,testSize,string):
    X = dataframe.iloc[:, 6:-1].values
    y = dataframe.iloc[:, 8].values
    sc_X = StandardScaler()
    sc_y = StandardScaler()
    X = sc_X.fit_transform(X)
    y = sc_y.fit_transform(y)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
    regressor = SVR(kernel='rbf')
    regressor.fit(X_train, y_train)
    scores = cross_val_score(regressor, X, y, cv=10)
    row = [string,True,True,True,regressor.score(X_test, y_test),'Support_Vector_Regression',scores.mean()]#regressor,X_test,y_test]   
    location = len(df_scores) +1
    df_scores.loc[location] =row
    search_string_model_map['Support_Vector_Regression'] = regressor
    

In [18]:
def Polynomial_Regression_degree(dataframe,testSize,string,poly_degree):
    features = PolynomialFeatures(degree=poly_degree)
    X_Poly = features.fit_transform(dataframe[['CTR','ATR']])
    y = dataframe[['conv']]
    X_train, X_test, y_train, y_test = train_test_split(X_Poly, y, test_size=0.2, random_state=0)
    regressor = LinearRegression()
    regressor.fit(X_Poly,y)
    scores = cross_val_score(regressor, X_Poly, y, cv=10)
    row = [string,True,True,True,regressor.score(X_test, y_test),'Polynomial_Regression_degree'+str(poly_degree),scores.mean()]#regressor,X_test,y_test]   
    location = len(df_scores) +1
    df_scores.loc[location] =row
    search_string_model_map['Polynomial_Regression_degree'+str(poly_degree)] = regressor

In [19]:
def KNN(dataframe,testSize,string,neighbors):
    x = dataframe[['CTR','ATR']]
    y = dataframe[['conv']]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    regressor = KNeighborsRegressor(n_neighbors=neighbors)
    regressor.fit(x,y)
    scores = cross_val_score(regressor, x, y, cv=10)
    row = [string,True,True,True,regressor.score(X_test, y_test),'KNN',scores.mean()]#regressor,X_test,y_test]   
    location = len(df_scores) +1
    df_scores.loc[location] =row
    search_string_model_map['KNN'] = regressor

In [20]:
def RandomForest(dataframe,testSize,string):
    x = dataframe[['CTR']]
    y = dataframe[['conv']]
    X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
    regressor = RandomForestRegressor()
    regressor.fit(x,y)
    scores = cross_val_score(regressor, x, y, cv=10)
    row = [string,True,False,True,regressor.score(X_test, y_test),'RandomForest',scores.mean()]#regressor,X_test,y_test]   
    location = len(df_scores) +1
    df_scores.loc[location] =row
    search_string_model_map['RandomForest'] = regressor

In [21]:
def find_best_fit(search_string):
    search_string_scores = df_scores[df_scores.searchTerm == search_string]
    best_fit_score = search_string_scores[search_string_scores.kfold == search_string_scores.kfold.max()]
    best_fit_model = search_string_model_map[((best_fit_score.ModelName).values)[0]]
    return best_fit_model,((best_fit_score.kfold).values)[0]

In [22]:
def save_best_fit_model(best_fit_model,search_string,r2score):
    modelfilename = search_string+'.sav'
    joblib.dump(model, modelfilename)
    model_map[search_string] = [modelfilename,r2score]
    return modelfilename

In [23]:
def save_model_map():
    now = datetime.datetime.now()
    modelmapfilename = 'savedmodelsmap_'+'.sav'
    joblib.dump(model_map, modelmapfilename)
    return modelmapfilename

In [24]:
def top10Recommendations(search_string,dataframe,model):
    prodmap = {}
    recomm = []
    predictedCTR = model.predict(dataframe[['CTR']])
    actualCTR = (dataframe.CTR).values
    products = (dataframe.product_id).values
    print(len(predictedCTR))
    print(len(actualCTR))
    for i in range(len(predictedCTR)):
        if(predictedCTR[i] > actualCTR[i]):
            actualCTR[i] = predictedCTR[i]
    for j in range(len(actualCTR)):
        prodmap[products[j]] = actualCTR[j]
    print(len(prodmap))   
    actualCTR.sort()

    for k in range(len(actualCTR)):
        for key, value in prodmap.items():
            if(actualCTR[k] == value):
                recomm.append(key)
    return recomm

    #loaded_model_map = joblib.load('savedmodelsmap_.sav')
    #search_string_model_pat = loaded_model_map[search_string][0]
    #search_string_model = joblib.load(search_string_model_pat)
#    predictedCTR = model.predict((dataframe.CTR).values)
 #   actualCTR = (dataframe.CTR).values
  #  comparray = np.greater(predictedCTR,actualCTR)
   # for i in range(len(comparray)):
    #    if(comparray[i]):
     #       actualCTR[i] = predictedCTR[i]
    #dataframe['AdjCTR'] = actualCTR.tolist()
   # print(dataframe)
    
 



    
    

In [25]:
df_english = filter_data_by_language_and_features(language = 'English',dataframe = df,feature_list = essential_features)
final_merged_dataset = merge_dataframe_with_computed_values (dataframe_english = df_english,columns =columns_all)
final_merged_dataset = replace_NAN_with_zero(final_merged_dataset)
final_merged_dataset = remove_duplicates(final_merged_dataset)
#print(final_merged_dataset.info())

df_for_evaluation = final_merged_dataset[final_merged_dataset['conv'] >0]
calculate_search_string_occurrences(df_for_evaluation)
print('Total number of search strings are', len(search_string_counts))
compute_high_frequency_search_strings(frequency = 50)  

Computing ATR, CTR and Conv...
Merging Dataframe with ATR, CTR and Conv..
Merging Completed
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1330548 entries, 0 to 1330547
Data columns (total 9 columns):
searchTerm             1329176 non-null object
product_id             1330548 non-null int64
product_impressions    1330548 non-null int64
Product_clicks         1330548 non-null int64
cart_adds              1330548 non-null int64
order                  1330548 non-null int64
CTR                    1330548 non-null float64
ATR                    1330548 non-null float64
conv                   1330548 non-null float64
dtypes: float64(3), int64(5), object(1)
memory usage: 91.4+ MB
None
Replaced NAN with zeros..
Duplicates are Dropped..
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1330548 entries, 0 to 1330547
Data columns (total 9 columns):
searchTerm             1330548 non-null object
product_id             1330548 non-null int64
product_impressions    1330548 non-null int64
Produ

In [26]:
highfreq_searchStrings = [x for x in highfreq_searchStrings if not any(c.isdigit() for c in str(x))]
#print(highfreq_searchStrings)
for i in highfreq_searchStrings:
    df_search = df_for_evaluation[df_for_evaluation['searchTerm']==str(i)]
    ATRmean,ATRstd,CTRmean,CTRstd,CONVmean,CONVstd = compute_ATR_CTR_metrics(dataframe = df_search,search_string = str(i))
    dfsearchWoOutliers = filter_outliers(dfsearch = df_search, ATR_mean = ATRmean, ATR_std = ATRstd, CTR_mean = CTRmean, CTR_std = CTRstd,CONV_mean = CONVmean, CONV_std = CONVstd)
    #plot_3d(dfsearchWoOutliers)
    Linear_Regression(dataframe = dfsearchWoOutliers,attribute1 = 'CTR',attribute2 = 'conv',testSize = 0.2,string = str(i))
    Linear_Regression(dataframe = dfsearchWoOutliers,attribute1 = 'ATR',attribute2 = 'conv',testSize = 0.2,string = str(i))
    Multiple_Linear_Regression(dataframe = dfsearchWoOutliers,testSize = 0.2,string = str(i))
    Support_Vector_Regression(dataframe = dfsearchWoOutliers,testSize = 0.2,string = str(i))
    Polynomial_Regression_degree(dataframe = dfsearchWoOutliers,testSize = 0.2,string = str(i),poly_degree = 2)
    Polynomial_Regression_degree(dataframe = dfsearchWoOutliers,testSize = 0.2,string = str(i),poly_degree = 3)
    KNN(dataframe = dfsearchWoOutliers,testSize = 0.2,string = str(i),neighbors = 10)
    RandomForest(dataframe = dfsearchWoOutliers,testSize = 0.2,string = str(i))
    
    model1,r2 = find_best_fit(search_string = str(i))
    
    save_best_fit_model(best_fit_model = model1,search_string = str(i),r2score = r2)
    
    
    
save_model_map()

ATR mean for search string shampoo is 0.117582769109
ATR standard deviation for search string shampoo is 0.734086035253
CTR mean for search string shampoo is 0.0185532174273
CTR standard deviation for search string shampoo is 0.0400540809177
-0.979917950738
0.743714609335


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string organic is 0.0842322379899
ATR standard deviation for search string organic is 0.187784667461
CTR mean for search string organic is 0.0692481883962
CTR standard deviation for search string organic is 0.0730984790428
-0.173251603801
0.184217987476


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string cat food is 0.11742467563
ATR standard deviation for search string cat food is 0.329335850633
CTR mean for search string cat food is 0.0377344943598
CTR standard deviation for search string cat food is 0.0755841345146
-0.0886185714946
-0.759185660439


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string coffee is 0.0760145740502
ATR standard deviation for search string coffee is 0.197197111955
CTR mean for search string coffee is 0.0431565805931
CTR standard deviation for search string coffee is 0.133147245606
-0.073348983966
0.951021905754


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string school supplies is 0.0338313728961
ATR standard deviation for search string school supplies is 0.045222650482
CTR mean for search string school supplies is 0.0201537957223
CTR standard deviation for search string school supplies is 0.0271111948188
0.083463890213
-0.107622736324


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string camping is 0.00319855555315
ATR standard deviation for search string camping is 0.00466198164097
CTR mean for search string camping is 0.0112879470414
CTR standard deviation for search string camping is 0.0153765561468
0.246644002856
0.00445076082244


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string dog treats is 0.0328275905354
ATR standard deviation for search string dog treats is 0.0602535530672
CTR mean for search string dog treats is 0.0342643237548
CTR standard deviation for search string dog treats is 0.0369377697404
0.619112629309
-0.0103783335601


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string paw patrol is 0.00686551711199
ATR standard deviation for search string paw patrol is 0.0100019895583
CTR mean for search string paw patrol is 0.014156869848
CTR standard deviation for search string paw patrol is 0.0144078346618
0.430931492711
0.198620927358


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string baby food is 0.0474781570204
ATR standard deviation for search string baby food is 0.0654319433583
CTR mean for search string baby food is 0.0396878497539
CTR standard deviation for search string baby food is 0.0357953063185
0.725728812257
0.0863392341104


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string chips is 0.0281323594411
ATR standard deviation for search string chips is 0.0471042310895
CTR mean for search string chips is 0.0341549839502
CTR standard deviation for search string chips is 0.0308967803443
-6.96093424038
0.623477426031


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string chocolate is 0.0301157473582
ATR standard deviation for search string chocolate is 0.0719330822157
CTR mean for search string chocolate is 0.0317547614108
CTR standard deviation for search string chocolate is 0.0424988438685
-17.3105369985
-89.564473067


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string candy is 0.0168401056233
ATR standard deviation for search string candy is 0.0173565183389
CTR mean for search string candy is 0.0230518935515
CTR standard deviation for search string candy is 0.0243518044355
0.644535960791
-0.232272732814


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string great value is 0.0934193879932
ATR standard deviation for search string great value is 0.120393421806
CTR mean for search string great value is 0.109826145996
CTR standard deviation for search string great value is 0.143904231287
0.630032117711
-2.66972236206


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string gluten free is 0.0331128037838
ATR standard deviation for search string gluten free is 0.0408392597053
CTR mean for search string gluten free is 0.0537090404942
CTR standard deviation for search string gluten free is 0.0554631072552
0.904266455288
-1.37328499679


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string crackers is 0.0286980991334
ATR standard deviation for search string crackers is 0.0379017715625
CTR mean for search string crackers is 0.0548879343713
CTR standard deviation for search string crackers is 0.0960281065826
0.0246936538492
-14.7016499


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string cookies is 0.0515233077305
ATR standard deviation for search string cookies is 0.127809739231
CTR mean for search string cookies is 0.0416100526983
CTR standard deviation for search string cookies is 0.0508054178516
0.875554196867
-99.251482386


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.


ATR mean for search string gerber is 0.017144973368
ATR standard deviation for search string gerber is 0.0104602579671
CTR mean for search string gerber is 0.0186958607167
CTR standard deviation for search string gerber is 0.00776224168018
0.294295980379
0.121387277496


  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string cereal is 0.0400404199633
ATR standard deviation for search string cereal is 0.101056360091
CTR mean for search string cereal is 0.038581045133
CTR standard deviation for search string cereal is 0.0355027030253
0.854388833979
0.364385749305


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string food is 0.300814740823
ATR standard deviation for search string food is 0.659010879093
CTR mean for search string food is 0.196106683418
CTR standard deviation for search string food is 0.309958037668
0.548820585721
0.672136536417


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


ATR mean for search string barbie is 0.0227473013822
ATR standard deviation for search string barbie is 0.0922019086794
CTR mean for search string barbie is 0.0326837455329
CTR standard deviation for search string barbie is 0.0920449632576
0.141501250124
0.516615818849


  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


'savedmodelsmap_.sav'

In [None]:
type((df_scores[(df_scores.searchTerm == 'food') & (df_scores.kfold == df_scores.kfold.max())]['ModelName'].values)[0])

In [None]:
search_string_scores = df_scores[df_scores.searchTerm == 'food']
best_fit_score = search_string_scores[search_string_scores.kfold == search_string_scores.kfold.max()]
best_fit_score.index
#best_fit_model = search_string_model_map[[str(best_fit_score.ModelName)]
#type(best_fit_score)



In [None]:
def sample(a,b,c):
    if(a & b & c):
        print('true')
        
sample(a = True,b = True, c = True)

In [None]:
column_scores = ['searchTerm','CTR','ATR','conv','Accuracy','Model']
df_scores = pd.DataFrame(columns=column_scores)
df_scores['searchTerm'] = 'cat'

In [None]:
(df_scores[(df_scores['kfold']>0.2)])


In [None]:
len(df_scores[(df_scores['score']>0.5)])

In [None]:
model = df_scores[(df_scores['searchTerm']=='cereal') & (df_scores['ModelName']=='SupportVectorRegression')]['Model']

In [None]:
regressor1 = LinearRegression()
regressor2 = RandomForestRegressor()
#type(regressor1)
type(regressor1)

In [None]:
sample = {}
sample['p'] = [regressor1,1]
sample['q'] = [regressor2,2]

In [None]:
sample

In [None]:
type(sample['p'][0])

In [None]:
model = sample['p'][0]
type(model)

In [None]:
df_crack = df_scores[df_scores.searchTerm == 'crackers']

df_crack = df_crack[df_crack.kfold == df_crack.kfold.max()]
(float(df_crack.kfold))
#df_crack['kfold'].max()

In [35]:
df_scores

Unnamed: 0,searchTerm,CTR,ATR,conv,score,ModelName,kfold
1,shampoo,True,False,True,-0.979918,Linear_Regression,-10.091650
2,shampoo,False,True,True,0.743715,Linear_Regression,-69.186994
3,shampoo,True,True,True,0.390455,Multiple_Linear_Regression,-22.819078
4,shampoo,True,True,True,0.092430,Support_Vector_Regression,-3.293939
5,shampoo,True,True,True,0.993051,Polynomial_Regression_degree2,-26.528780
6,shampoo,True,True,True,0.994977,Polynomial_Regression_degree3,-1705.008296
7,shampoo,True,True,True,0.160684,KNN,-3.491335
8,shampoo,True,False,True,0.975192,RandomForest,-3.477302
9,organic,True,False,True,-0.173252,Linear_Regression,-0.027093
10,organic,False,True,True,0.184218,Linear_Regression,-0.177520


In [None]:
import datetime

now = datetime.datetime.now()
type(now.strftime("%Y-%m-%d %H:%M %S"))

In [None]:
type(now)

In [None]:
str(now)

In [None]:
search_string_model_map

In [None]:
from sklearn.externals import joblib

loaded_model = joblib.load('savedmodelsmap_.sav')

In [None]:
search_string_model_pat = loaded_model['great value'][0]
search_string_model = joblib.load(search_string_model_pat)

In [None]:
now = datetime.datetime.now()
modelmapfilename = 'savedmodels_'+str(now.strftime("%Y-%m-%d-%H:%M"))+'.sav'
joblib.dump(model_map, modelmapfilename)

In [None]:
(loaded_model['great value'])

In [47]:
#def top10Recommendations(search_string,dataframe,model):
    #loaded_model_map = joblib.load('savedmodelsmap_.sav')
    #search_string_model_pat = loaded_model_map[search_string][0]
    #search_string_model = joblib.load(search_string_model_pat)
prodmap = {}
recomm = []
model = LinearRegression()
model.fit(dfsearchWoOutliers[['CTR']],dfsearchWoOutliers[['conv']])
predictedCTR = model.predict(dfsearchWoOutliers[['CTR']])
actualCTR = (dfsearchWoOutliers.CTR).values
products = (dfsearchWoOutliers.product_id).values
print(len(predictedCTR))
print(len(actualCTR))
for i in range(len(predictedCTR)):
    if(predictedCTR[i] > actualCTR[i]):
        actualCTR[i] = predictedCTR[i]
for j in range(len(actualCTR)):
    prodmap[products[j]] = actualCTR[j]
print(len(prodmap))   
actualCTR.sort()

for k in range(len(actualCTR)):
    for key, value in prodmap.items():
        if(actualCTR[k] == value):
            recomm.append(key)
(recomm)
 


    
    
    
    
 #   model = LinearRegression()
#    model.fit(dfsearchWoOutliers[['CTR']],dfsearchWoOutliers[['conv']])

50
50
50


[6000196229284,
 6000196125227,
 6000196393051,
 6000196122676,
 6000196322187,
 6000196392597,
 6000196392627,
 6000195575668,
 6000197293688,
 6000195382904,
 6000196392657,
 6000196393054,
 6000196322193,
 6000196347925,
 6000196322151,
 6000196392482,
 6000196037848,
 6000196393036,
 6000196392660,
 6000196392928,
 6000196122679,
 6000195558330,
 6000196322265,
 6000196392675,
 6000196094094,
 6000196601905,
 6000196322133,
 6000196392621,
 6000196399510,
 6000196094115,
 6000196943085,
 6000196392612,
 6000197292626,
 6000196392603,
 6000196392633,
 6000188920646,
 6000194335439,
 6000196228394,
 6000196228464,
 6000196228461,
 6000196122673,
 6000196392943,
 6000195382949,
 6000195575674,
 6000196392879,
 6000196392636,
 6000196229287,
 6000195382937,
 6000195382952,
 6000196322235]

In [None]:
comparray = np.greater(predictedCTR,actualCTR)
for i in range(len(comparray)):
    if(comparray[i]):
        actualCTR[i] = predictedCTR[i]
dfsearchWoOutliers['AdjCTR'] = actualCTR.tolist()
print(dfsearchWoOutliers)