***

# Recommendation System

***

Importing libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances

***

## Datasets

History

In [2]:
# Load the raw visit history; the columns used below are userID, questionID and visits.
history = pd.read_csv("data/history.csv")

In [3]:
# Preview the raw visit counts before they are rescaled.
history.head()

Unnamed: 0,userID,questionID,visits
0,12345,3,1
1,12345,8,1
2,12345,10,1
3,12345,12,2
4,12345,20,3


***

Questions

In [4]:
# Load the question catalogue (questionID -> Name); it is merged in later to
# turn recommended questionIDs back into readable question text.
questions = pd.read_csv("data/questions.csv")
questions.head()

Unnamed: 0,questionID,Name
0,1,What is HPE Composable Ecosystem Program?
1,2,What VMware plug-ins are in the HPE OneView Ad...
2,3,How are the partner integrations licensed for ...
3,4,How is HPE OneView delivered?
4,5,Does HPE OneView replace the need for HPE Syst...


Ratings

***

In [5]:
# Min-max scale the visit counts to [0, 1].
# If every visit count is identical (max == min) the plain formula divides
# 0 by 0 and turns the whole column into NaN, so that case maps to 0.0.
visits_min = history['visits'].min()
visits_range = history['visits'].max() - visits_min
if visits_range == 0:
    history['visits'] = 0.0
else:
    history['visits'] = (history['visits'] - visits_min) / visits_range

In [6]:
# Mean scaled visit count per user; subtracted below to mean-centre each
# user's ratings and reused later as the user's baseline score.
userhistory = history.groupby('userID', as_index=False)['visits'].mean()

In [7]:
# Preview only the first rows, like the other cells, instead of dumping the whole frame.
userhistory.head()

Unnamed: 0,userID,visits
0,12345,0.166667
1,12346,0.214286
2,12347,0.090909
3,12348,0.0
4,12349,0.25


In [8]:
# Attach each user's mean to their rows. Both frames carry a 'visits' column,
# so the merge suffixes them: visits_x = per-question value, visits_y = user mean.
history_s = history.merge(userhistory, on='userID')
# Mean-centred rating: how far this question sits above/below the user's average.
history_s['norm_rating'] = history_s['visits_x'].sub(history_s['visits_y'])

In [9]:
# Preview: visits_x (scaled visits), visits_y (user mean) and norm_rating (their difference).
history_s.head()

Unnamed: 0,userID,questionID,visits_x,visits_y,norm_rating
0,12345,3,0.0,0.166667,-0.166667
1,12345,8,0.0,0.166667,-0.166667
2,12345,10,0.0,0.166667,-0.166667
3,12345,12,0.5,0.166667,0.333333
4,12345,20,1.0,0.166667,0.833333


In [10]:
# User x question matrix of the scaled (not mean-centred) visits. NaN marks a
# question the user never visited; used later to find what a user has already seen.
temp = history_s.pivot_table(values='visits_x', index='userID', columns='questionID')
temp.head()

questionID,0,1,2,3,4,5,6,7,8,10,...,14,15,16,17,18,19,20,21,22,23
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12345,,,,0.0,0.0,0.0,,,0.0,0.0,...,,,,,,,1.0,0.0,0.0,
12346,,,,,,,,,,,...,0.0,0.0,1.0,,,,0.0,,,
12347,,,,,,0.0,0.0,0.0,0.0,,...,0.0,0.5,0.5,0.0,0.0,,,,,
12348,,,,,,,,,,,...,,,,,0.0,0.0,0.0,0.0,,
12349,0.0,1.0,0.0,0.0,0.5,,,,,,...,,,,,,,,0.5,0.0,0.0


In [11]:
# User x question matrix of mean-centred ratings; NaN marks unvisited questions.
final = history_s.pivot_table(values='norm_rating', index='userID', columns='questionID')
final.head()

questionID,0,1,2,3,4,5,6,7,8,10,...,14,15,16,17,18,19,20,21,22,23
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12345,,,,-0.166667,-0.166667,-0.166667,,,-0.166667,-0.166667,...,,,,,,,0.833333,-0.166667,-0.166667,
12346,,,,,,,,,,,...,-0.214286,-0.214286,0.785714,,,,-0.214286,,,
12347,,,,,,-0.090909,-0.090909,-0.090909,-0.090909,,...,-0.090909,0.409091,0.409091,-0.090909,-0.090909,,,,,
12348,,,,,,,,,,,...,,,,,0.0,0.0,0.0,0.0,,
12349,-0.25,0.75,-0.25,-0.25,0.25,,,,,,...,,,,,,,,0.25,-0.25,-0.25


In [12]:
# Two ways of filling the unvisited (NaN) cells of `final`; both are kept
# because a similarity matrix is computed from each of them below.

# Fill by user: every gap in a row gets that user's own mean rating.
final_user = final.apply(lambda ratings: ratings.fillna(ratings.mean()), axis=1)

# Fill by question: every gap in a column gets that question's mean rating.
final_question = final.fillna(value=final.mean())

In [13]:
# Preview the matrix whose gaps were filled with per-question means.
final_question.head()

questionID,0,1,2,3,4,5,6,7,8,10,...,14,15,16,17,18,19,20,21,22,23
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12345,-0.25,0.75,-0.25,-0.166667,-0.166667,-0.166667,-0.090909,-0.090909,-0.166667,-0.166667,...,-0.152597,0.097403,0.597403,-0.090909,-0.045455,0.0,0.833333,-0.166667,-0.166667,-0.25
12346,-0.25,0.75,-0.25,-0.208333,0.041667,-0.128788,-0.090909,-0.090909,-0.128788,-0.166667,...,-0.214286,-0.214286,0.785714,-0.090909,-0.045455,0.0,-0.214286,0.027778,-0.208333,-0.25
12347,-0.25,0.75,-0.25,-0.208333,0.041667,-0.090909,-0.090909,-0.090909,-0.090909,-0.166667,...,-0.090909,0.409091,0.409091,-0.090909,-0.090909,0.0,0.206349,0.027778,-0.208333,-0.25
12348,-0.25,0.75,-0.25,-0.208333,0.041667,-0.128788,-0.090909,-0.090909,-0.128788,-0.166667,...,-0.152597,0.097403,0.597403,-0.090909,0.0,0.0,0.0,0.0,-0.208333,-0.25
12349,-0.25,0.75,-0.25,-0.25,0.25,-0.128788,-0.090909,-0.090909,-0.128788,-0.166667,...,-0.152597,0.097403,0.597403,-0.090909,-0.045455,0.0,0.206349,0.25,-0.25,-0.25


In [14]:
# Preview the matrix whose gaps were filled with per-user means.
final_user.head()

questionID,0,1,2,3,4,5,6,7,8,10,...,14,15,16,17,18,19,20,21,22,23
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12345,1.850372e-17,1.850372e-17,1.850372e-17,-0.1666667,-0.1666667,-0.1666667,1.850372e-17,1.850372e-17,-0.1666667,-0.1666667,...,1.850372e-17,1.850372e-17,1.850372e-17,1.850372e-17,1.850372e-17,1.850372e-17,0.8333333,-0.1666667,-0.1666667,1.850372e-17
12346,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,3.965082e-18,...,-0.2142857,-0.2142857,0.7857143,3.965082e-18,3.965082e-18,3.965082e-18,-0.2142857,3.965082e-18,3.965082e-18,3.965082e-18
12347,-1.51394e-17,-1.51394e-17,-1.51394e-17,-1.51394e-17,-1.51394e-17,-0.09090909,-0.09090909,-0.09090909,-0.09090909,-1.51394e-17,...,-0.09090909,0.4090909,0.4090909,-0.09090909,-0.09090909,-1.51394e-17,-1.51394e-17,-1.51394e-17,-1.51394e-17,-1.51394e-17
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12349,-0.25,0.75,-0.25,-0.25,0.25,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.25,-0.25,-0.25


In [15]:
# User-to-user cosine similarity, computed on the user-mean-filled matrix.
b = cosine_similarity(final_user)
# Zero the diagonal so a user's similarity with themselves is not 1.
np.fill_diagonal(b, 0)
similarity_with_user = pd.DataFrame(b, index=final_user.index, columns=final_user.index)
similarity_with_user.head()

userID,12345,12346,12347,12348,12349
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12345,0.0,-0.259437,6.809235e-19,0.0,2.312965e-18
12346,-0.2594373,0.0,0.4741047,0.0,0.0
12347,6.809235e-19,0.474105,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0
12349,2.312965e-18,0.0,0.0,0.0,0.0


In [16]:
# User-to-user cosine similarity, computed on the question-mean-filled matrix
# (final_question). This is the similarity the scoring functions read.
cosine = cosine_similarity(final_question)
# Zero the diagonal so a user's similarity with themselves is not 1.
np.fill_diagonal(cosine, 0)
# Label both axes from the frame the matrix was computed on, so rows and
# columns cannot drift apart if final_user and final_question ever differ.
similarity_with_question = pd.DataFrame(cosine, index=final_question.index)
similarity_with_question.columns = final_question.index
similarity_with_question.head()

userID,12345,12346,12347,12348,12349
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12345,0.0,0.609323,0.801872,0.782997,0.789708
12346,0.609323,0.0,0.774824,0.93537,0.868766
12347,0.801872,0.774824,0.0,0.927392,0.919419
12348,0.782997,0.93537,0.927392,0.0,0.951586
12349,0.789708,0.868766,0.919419,0.951586,0.0


In [17]:
def n_neighbours(df,n):
    """Return, for every row of a similarity matrix, its n highest-scoring column labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Square similarity matrix (rows and columns are the same ids).
    n : int
        Number of neighbours to keep per row.

    Returns
    -------
    pandas.DataFrame
        Same index as ``df``; columns ``top1`` .. ``top<n>`` hold the column
        labels ordered from most to least similar.

    Notes
    -----
    The row's own label is ranked like any other column (it is not removed).
    ``n`` must not exceed the number of columns: pandas raises ``ValueError``
    when the sliced labels and the ``top*`` names differ in length.
    """
    rank_labels = ['top{}'.format(i) for i in range(1, n+1)]

    def top_n(row):
        # Sort one row's similarities descending and keep the first n labels.
        best = row.sort_values(ascending=False).iloc[:n].index
        return pd.Series(best, index=rank_labels)

    return df.apply(top_n, axis=1)

In [18]:
# top 5 neighbours for each user, ranked on the user-mean-filled similarity
sim_user_u = n_neighbours(similarity_with_user,5)
sim_user_u.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12345,12349,12347,12348,12345,12346
12346,12347,12349,12348,12346,12345
12347,12346,12345,12349,12348,12347
12348,12349,12348,12347,12346,12345
12349,12345,12349,12348,12347,12346


In [19]:
# top 5 neighbours for each user, ranked on the question-mean-filled similarity;
# this is the neighbour table the scoring functions below read.
sim_user_m = n_neighbours(similarity_with_question,5)
sim_user_m.head()

Unnamed: 0_level_0,top1,top2,top3,top4,top5
userID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
12345,12347,12349,12348,12346,12345
12346,12348,12349,12347,12345,12346
12347,12348,12349,12345,12346,12347
12348,12349,12346,12347,12345,12348
12349,12348,12347,12346,12345,12349


In [20]:
def similar_questions(user1, user2):
    """Return the questions both users visited, with question names attached.

    Reads the notebook-level frames ``history_s`` and ``questions``.
    ``user1`` / ``user2`` are compared with ``==`` against
    ``history_s['userID']``, so they must have the same dtype as that column.

    In the result, columns suffixed ``_x`` come from ``user1`` and columns
    suffixed ``_y`` from ``user2`` (e.g. ``visits_x_x`` / ``visits_x_y``).
    """
    first_rows = history_s[history_s['userID'] == user1]
    second_rows = history_s[history_s['userID'] == user2]
    # Inner join keeps only questionIDs present in both users' histories.
    shared = first_rows.merge(second_rows, on="questionID", how="inner")
    return shared.merge(questions, on='questionID')

In [21]:
# Side-by-side scaled visits of two users on the questions they share
# (visits_x_x = first user, visits_x_y = second user), with the question text.
a = similar_questions(12345,12349)[['visits_x_x','visits_x_y','Name']]
a.head()

Unnamed: 0,visits_x_x,visits_x_y,Name
0,0.0,0.0,How are the partner integrations licensed for ...
1,0.0,0.5,Is a support case required for Remote Technici...
2,0.0,0.0,Do I need to add firewall rules?
3,0.0,0.5,How is HPE OneView delivered?


In [22]:
def User_item_score(user,item):
    """Predict a score for question ``item`` for ``user`` from the user's neighbours.

    The prediction is::

        avg_user + sum(sim(user, v) * rating(v, item)) / sum(sim(user, v))

    where ``v`` ranges over the user's neighbours, ``rating`` is the
    mean-centred, question-mean-filled rating and ``avg_user`` is the user's
    mean scaled visit count.

    Reads the notebook-level frames ``sim_user_m`` (neighbour table),
    ``final_question`` (ratings), ``similarity_with_question`` (weights) and
    ``userhistory`` (per-user means).

    Parameters
    ----------
    user : userID present in the frames above.
    item : questionID; must be a column of ``final_question``.

    Returns
    -------
    float
        The predicted score. When the neighbours' similarities sum to zero
        (no usable neighbour) the user's own mean is returned instead of
        NaN/inf.

    Raises
    ------
    KeyError
        If ``item`` is not a column of ``final_question``.
    IndexError
        If ``user`` has no row in ``userhistory``.
    """
    # ravel() always yields a 1-D list; squeeze() would collapse a single
    # neighbour to a bare scalar, which isin() below cannot accept.
    neighbours = sim_user_m[sim_user_m.index == user].values.ravel().tolist()

    item_ratings = final_question.loc[:, item]
    neighbour_ratings = item_ratings[item_ratings.index.isin(neighbours)].dropna()

    avg_user = userhistory.loc[userhistory['userID'] == user, 'visits'].values[0]

    # Similarity of `user` to each neighbour that has a rating for `item`;
    # indexing with the Index keeps the result a Series even for one neighbour.
    weights = similarity_with_question.loc[user, neighbour_ratings.index]

    nume = (neighbour_ratings * weights).sum()
    deno = weights.sum()
    if deno == 0:
        # No neighbour carries any weight: fall back to the user's baseline.
        return avg_user
    return avg_user + (nume / deno)

In [23]:
# Predicted score of question 19 for user 12349 (ids hard-coded for the demo).
score = User_item_score(12349,19)
print("score (u,i) is",score)

score (u,i) is 0.25


In [24]:
# Per user, a comma-separated string of the questionIDs they visited.
# The string casts are applied to a copy: casting history_s itself in place
# would leave its userID column as str, so earlier cells that filter it with
# integer ids (e.g. similar_questions) would match nothing when re-run.
question_user = (
    history_s.astype({'userID': str, 'questionID': str})
    .groupby('userID')['questionID']
    .apply(','.join)
)

In [25]:
# Make sure the index holds integer userIDs so it can be matched with isin()
# against the neighbour ids (presumably ints, as shown in the neighbour table).
question_user.index = question_user.index.astype(int)

In [26]:
def User_item_score1(user):
    """Recommend question names for ``user``, best predicted score first.

    Candidates are the questions visited by the user's neighbours that the
    user has not visited. Each candidate is scored with the same formula as
    ``User_item_score``::

        avg_user + sum(sim * rating) / sum(sim)

    Reads the notebook-level objects ``temp``, ``sim_user_m``,
    ``question_user``, ``final_question``, ``similarity_with_question``,
    ``userhistory`` and ``questions``.

    Parameters
    ----------
    user : userID to recommend for.

    Returns
    -------
    list of str
        Question names ordered by descending predicted score; ties keep
        ascending questionID order, so the result is the same on every run.
        Empty when there is no candidate question.
    """
    # Questions the user already visited: non-NaN cells in their row of `temp`.
    question_by_user = temp.columns[temp[temp.index == user].notna().any()].tolist()
    seen = {int(q) for q in question_by_user}

    # ravel() always yields a 1-D list, even for a single neighbour.
    neighbours = sim_user_m[sim_user_m.index == user].values.ravel().tolist()

    # question_user maps userID -> "q1,q2,..."; skip empty tokens so an empty
    # string never reaches int().
    neighbour_lists = question_user[question_user.index.isin(neighbours)]
    candidates = {
        int(token)
        for joined in neighbour_lists.values
        for token in joined.split(',')
        if token
    }

    # Sorting removes the dependence on set iteration order, which for
    # strings changes between interpreter runs.
    questionslist = sorted(candidates - seen)
    if not questionslist:
        return []

    # Loop-invariant: the user's baseline and the neighbour row mask.
    avg_user = userhistory.loc[userhistory['userID'] == user, 'visits'].values[0]
    neighbour_rows = final_question.index.isin(neighbours)

    score = []
    for item in questionslist:
        ratings = final_question.loc[neighbour_rows, item].dropna()
        weights = similarity_with_question.loc[user, ratings.index]
        deno = weights.sum()
        if deno == 0:
            # No neighbour carries any weight: fall back to the baseline.
            score.append(avg_user)
        else:
            score.append(avg_user + (ratings * weights).sum() / deno)

    data = pd.DataFrame({'questionID': questionslist, 'score': score})
    # mergesort is stable, so equal scores keep their questionID order.
    recommendations = data.sort_values(by='score', ascending=False, kind='mergesort')
    questionname = recommendations.merge(questions, how='inner', on='questionID')
    return questionname.Name.values.tolist()

In [27]:
# Demo: print the recommended question names for one hard-coded user.
user = 12345
predicted_questions = User_item_score1(user)
print(" ")
print("The Recommendations for User Id : " + str(user))
print("   ")
# The names come back ordered by predicted score, highest first.
for i in predicted_questions:
    print(i)

 
The Recommendations for User Id : 12345
   
What is HPE Composable Ecosystem Program? 
Does HPE OneView Global Dashboard support IPv6? 
How does HPE OneView Global Dashboard help customers manage at scale? 
How can I track the assignment of HPE OneView licensed across my data centers? 
Does Remote Technician require Remote Support? 
Does HPE OneView Global Dashboard display remote support status? 
Does HPE OneView Global Dashboard support interconnect modules? 
Are license upgrades available from Insight Management software to HPE OneView Advanced? 
How long will Hewlett Packard Enterprise support the current HPE SIM, product? 
Can I schedule an HPE OneView Global Dashboard report? 
How is HPE OneView Global Dashboard delivered?
Can I time-box access? Does it time out? 
What VMware plug-ins are in the HPE OneView Advanced 5.0 release? 
