In [4]:
import pandas as pd
import scipy.sparse as sparse
import numpy as np
import random
import implicit
from sklearn.preprocessing import MinMaxScaler

In [5]:
# Load the shared-article metadata and the user-interaction log.
# Both paths are relative, so the CSV files must be in the working directory.
articles_df = pd.read_csv('shared_articles.csv')
interactions_df = pd.read_csv('users_interactions.csv')


In [6]:
# Preview the first two rows of the articles table.
articles_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,authorPersonId,authorSessionId,authorUserAgent,authorRegion,authorCountry,contentType,url,title,text,lang
0,1459192779,CONTENT REMOVED,-6451309518266745024,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
1,1459193988,CONTENT SHARED,-4110354420726924665,4340306774493623681,8940341205206233829,,,,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en


In [7]:
# Preview the first two rows of the interactions table.
interactions_df.head(2)

Unnamed: 0,timestamp,eventType,contentId,personId,sessionId,userAgent,userRegion,userCountry
0,1465413032,VIEW,-3499919498720038879,-8845298781299428018,1264196770339959068,,,
1,1465412560,VIEW,8890720798209849691,-1032019229384696495,3621737643587579081,Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_2...,NY,US


In [8]:
# Report (rows, columns) for the articles table, then for the interactions table.
print(articles_df.shape, interactions_df.shape, sep='\n')

(3122, 13)
(72312, 8)


# Dropping unwanted columns 

In [9]:
# Remove the user-agent and location columns from both tables.
articles_df = articles_df.drop(columns=['authorUserAgent', 'authorRegion', 'authorCountry'])
interactions_df = interactions_df.drop(columns=['userAgent', 'userRegion', 'userCountry'])

In [10]:
# Count article rows per event type.
articles_df['eventType'].value_counts()

CONTENT SHARED     3047
CONTENT REMOVED      75
Name: eventType, dtype: int64

In [11]:
# Keep only the rows whose event is 'CONTENT SHARED', then discard the eventType
# column, which is constant after the filter.
# Filtering and dropping in one chained expression returns a new frame, so no
# in-place modification is performed on a filtered slice (the pattern pandas
# reports with SettingWithCopyWarning).
articles_df = (
    articles_df
    .loc[articles_df['eventType'] == 'CONTENT SHARED']
    .drop(columns='eventType')
)

In [12]:
 # Preview the articles table after the eventType filter and column drop.
 articles_df.head(2)

Unnamed: 0,timestamp,contentId,authorPersonId,authorSessionId,contentType,url,title,text,lang
1,1459193988,-4110354420726924665,4340306774493623681,8940341205206233829,HTML,http://www.nytimes.com/2016/03/28/business/dea...,"Ethereum, a Virtual Currency, Enables Transact...",All of this work is still very early. The firs...,en
2,1459194146,-7292285110016212249,4340306774493623681,8940341205206233829,HTML,http://cointelegraph.com/news/bitcoin-future-w...,Bitcoin Future: When GBPcoin of Branson Wins O...,The alarm clock wakes me at 8:00 with stream o...,en


In [13]:
# Attach each article's title to the interactions that reference it.
# The inner join keeps only interactions whose contentId appears in articles_df.
df = interactions_df[['contentId', 'personId', 'eventType']].merge(
    articles_df[['contentId', 'title']],
    on='contentId',
    how='inner',
)

In [15]:
# (rows, columns) of the merged interactions/articles frame.
df.shape

(72269, 4)

In [18]:
# Collapse fully identical rows (same article, person, event type and title) into one.
df = df.drop_duplicates()

In [19]:
# (rows, columns) of df at this point in the notebook.
df.shape

(50910, 4)

In [20]:
# Preview the first five rows of df.
df.head(5)

Unnamed: 0,contentId,personId,eventType,title
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem


In [21]:
# Count interaction rows per event type.
df['eventType'].value_counts()

VIEW               40213
LIKE                5670
BOOKMARK            2202
COMMENT CREATED     1427
FOLLOW              1398
Name: eventType, dtype: int64

In [22]:
# Numeric weight assigned to each interaction type.
event_type_strength = {
    'VIEW': 1.0,
    'LIKE': 2.0,
    'BOOKMARK': 3.0,
    'FOLLOW': 4.0,
    'COMMENT CREATED': 5.0,
}

# Look up the weight for every row's event type; an event type that is not in
# the table raises KeyError.
df['eventStrength'] = [event_type_strength[event] for event in df['eventType']]
df.head()

Unnamed: 0,contentId,personId,eventType,title,eventStrength
0,-3499919498720038879,-8845298781299428018,VIEW,Hiri wants to fix the workplace email problem,1.0
2,-3499919498720038879,-108842214936804958,VIEW,Hiri wants to fix the workplace email problem,1.0
3,-3499919498720038879,-1443636648652872475,VIEW,Hiri wants to fix the workplace email problem,1.0
6,-3499919498720038879,-8020832670974472349,VIEW,Hiri wants to fix the workplace email problem,1.0
8,-3499919498720038879,-9009798162809551896,LIKE,Hiri wants to fix the workplace email problem,2.0


In [26]:
# Total interaction strength per (person, article, title) combination.
# Only eventStrength is aggregated: a bare .sum() also aggregates the string
# eventType column, which on pandas >= 2.0 concatenates the event names into an
# extra column instead of leaving it out.
grouped_df = (
    df.groupby(['personId', 'contentId', 'title'])['eventStrength']
    .sum()
    .reset_index()
)

# Convert the identifier columns to categoricals so that their integer category
# codes can serve as compact row/column indices (person_id, content_id).
grouped_df['title'] = grouped_df['title'].astype("category")
grouped_df['personId'] = grouped_df['personId'].astype("category")
grouped_df['contentId'] = grouped_df['contentId'].astype("category")
grouped_df['person_id'] = grouped_df['personId'].cat.codes
grouped_df['content_id'] = grouped_df['contentId'].cat.codes

# Show ten randomly chosen rows (no seed is set, so the sample differs per run).
grouped_df.sample(10)

Unnamed: 0,personId,contentId,title,eventStrength,person_id,content_id
13240,-2726721797588771398,8890720798209849691,Top 10 Intranet Trends of 2016,1.0,675,2925
6000,-6395289021795478412,-908052164352446106,IoT a favor do relacionamento médico-paciente,1.0,278,1325
20129,-108842214936804958,3172866488852888544,Google Is Finally Redesigning Its Biggest Cash...,1.0,926,1985
39586,8698209668239173600,3058137260516373249,Now we know how profitable Microsoft's cloud b...,1.0,1833,1962
26743,2833428826475063405,-2447632164766022033,O que o GitHub tem a nos dizer sobre os estere...,1.0,1235,1075
537,-9016528795238256703,7507067965574797372,Um bilhão de arquivos mostram quem vence a dis...,1.0,17,2702
33371,5127372011815639401,-3675009040647936899,Ubuntu recebe suporte ao React Native e aplica...,3.0,1466,903
32634,4531872167062142431,3575694866055127253,Drupal 8.1.0 is now available,1.0,1414,2051
9527,-4160173091318455989,-8099145207792678586,TechCrunch launches a personalized news recomm...,1.0,519,208
24519,1811781596621121379,-2405892683303291957,"""5G está para IoT como 4G esteve para o Smartp...",1.0,1135,1084


In [37]:
# Content x person matrix: rows are content_id codes, columns are person_id
# codes, values are the summed event strengths.
sparse_content_person = sparse.csr_matrix(
    (
        grouped_df['eventStrength'].astype(float),
        (grouped_df['content_id'], grouped_df['person_id']),
    )
)
# Person x content matrix: the same values with rows and columns exchanged.
sparse_person_content = sparse.csr_matrix(
    (
        grouped_df['eventStrength'].astype(float),
        (grouped_df['person_id'], grouped_df['content_id']),
    )
)

In [39]:
# Alternating Least Squares model from the `implicit` library, configured with
# 20 factors, regularization 0.1 and 50 iterations.
# NOTE(review): no random seed is passed here; presumably the learned factors
# (and therefore the recommendations shown below) vary between runs - confirm
# against the installed implicit version.
model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)



In [40]:
# Multiply every interaction strength by a constant factor and cast the matrix
# to double precision before training.
alpha = 15
data = (sparse_content_person * alpha).astype('double')

# Fit the model on the content x person matrix.
# NOTE(review): whether fit() expects an item x user or a user x item matrix
# depends on the installed `implicit` version - confirm; with the other
# convention model.user_factors and model.item_factors would be swapped
# relative to how the cells below use them.
model.fit(data)

100%|████████████████████████████████████████████████████████████████████████████████| 50.0/50 [00:01<00:00, 30.24it/s]


In [46]:
# Query article (as a content_id code) and the number of neighbours to keep.
content_id = 450
n_similar = 10

# Factor matrices exposed by the fitted model.
person_vecs = model.user_factors
content_vecs = model.item_factors

# Euclidean length of every content vector.
content_norms = np.sqrt(np.sum(content_vecs * content_vecs, axis=1))

# Dot product of each content vector with the query vector, divided by that
# content vector's own length.
scores = np.dot(content_vecs, content_vecs[content_id]) / content_norms
# Indices of the n_similar largest scores (in no particular order).
top_idx = np.argpartition(scores, -n_similar)[-n_similar:]
# Divide by the query vector's length as well, then order from most to least similar.
similar = sorted(
    zip(top_idx, scores[top_idx] / content_norms[content_id]),
    key=lambda pair: pair[1],
    reverse=True,
)

In [48]:
# Print the title of each similar article, most similar first.
for idx, score in similar:
    matching_titles = grouped_df.loc[grouped_df.content_id == idx, 'title']
    print(matching_titles.iloc[0])

Google's fair use victory is good for open source
Up your DevOps chops with this online Kubernetes class
Google lags behind Amazon and Microsoft's cloud in one important area
Google's Cloud Dataflow stomps on Apache Spark in new benchmark tests
Building immutable entities into Google Cloud Datastore
Deep learning software knows that a rose is a rose is a rosa rubiginosa
Inside OpenAI, Elon Musk's Wild Plan to Set Artificial Intelligence Free
Automate deployments and traffic splitting with the App Engine Admin API
Artificial Intelligence's White Guy Problem
An independent organization just ranked Google as the best cloud, beating Amazon


In [50]:
def recommend(person_id, sparse_person_content, person_vecs, content_vecs, num_contents=10,
              interactions_df=None):
    """Recommend articles a person has not interacted with yet.

    Every article is scored by the dot product of the person's factor vector
    with the article's factor vector. The scores are min-max scaled to [0, 1]
    over the whole catalogue, articles the person already interacted with are
    excluded, and the best remaining articles are returned.

    Parameters
    ----------
    person_id : int
        Row index of the person in ``sparse_person_content`` and ``person_vecs``.
    sparse_person_content : scipy.sparse matrix or numpy.ndarray
        Person x content interaction matrix. A positive entry marks an article
        as already interacted with.
    person_vecs : scipy.sparse matrix or numpy.ndarray
        Person factor matrix, one row per person.
    content_vecs : scipy.sparse matrix or numpy.ndarray
        Content factor matrix, one row per article.
    num_contents : int, default 10
        Maximum number of recommendations to return. Fewer rows are returned
        when the person has fewer unseen articles than this.
    interactions_df : pandas.DataFrame, optional
        Frame with ``content_id`` and ``title`` columns used to translate
        article indices into titles. Defaults to the notebook-level
        ``grouped_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``title`` and ``score`` (scaled score), best recommendation first.
    """
    if interactions_df is None:
        # Backward-compatible default: the frame the notebook builds earlier.
        interactions_df = grouped_df

    # Boolean mask of the articles this person already interacted with.
    interaction_row = sparse_person_content[person_id, :]
    if sparse.issparse(interaction_row):
        interaction_row = interaction_row.toarray()
    already_seen = np.asarray(interaction_row).reshape(-1) > 0

    # The person's factor vector as a flat dense array (dense or sparse input).
    person_vec = person_vecs[person_id, :]
    if sparse.issparse(person_vec):
        person_vec = person_vec.toarray()
    person_vec = np.asarray(person_vec, dtype=float).reshape(-1)

    # Dot product of the person vector with every content vector.
    raw_scores = np.asarray(content_vecs.dot(person_vec), dtype=float).reshape(-1)
    if raw_scores.size == 0:
        return pd.DataFrame({'title': [], 'score': []})

    # Min-max scale to [0, 1]; a constant score vector maps to all zeros.
    lowest = raw_scores.min()
    spread = raw_scores.max() - lowest
    if spread > 0:
        scaled_scores = (raw_scores - lowest) / spread
    else:
        scaled_scores = np.zeros_like(raw_scores)

    # Rank only unseen articles. Zeroing the score of seen articles is not
    # enough: they would tie with the lowest-scoring unseen article and could
    # be returned when num_contents exceeds the number of unseen articles.
    unseen_idx = np.flatnonzero(~already_seen)
    ranked_idx = unseen_idx[np.argsort(scaled_scores[unseen_idx])[::-1]]
    content_idx = ranked_idx[:max(int(num_contents), 0)]

    # Build the content_id -> title table once instead of scanning the frame
    # for every recommended article; the first title per content_id is used.
    first_rows = interactions_df.drop_duplicates('content_id')
    title_by_content = dict(zip(first_rows['content_id'].tolist(),
                                first_rows['title'].tolist()))
    titles = [title_by_content[int(idx)] for idx in content_idx]

    recommendations = pd.DataFrame({'title': titles, 'score': scaled_scores[content_idx]})

    return recommendations

In [51]:
#  Get the trained person and content vectors. We convert them to csr matrices
person_vecs = sparse.csr_matrix(model.user_factors)
content_vecs = sparse.csr_matrix(model.item_factors)

# Create recommendations for person with id 50
person_id = 50

recommendations = recommend(person_id, sparse_person_content, person_vecs, content_vecs)

print(recommendations)

                                               title     score
0  Custo do Erro - Cinco motivos para investir em...  1.000000
1  Former Google career coach shares a visual tri...  0.970806
2  Ray Kurzweil: The world isn't getting worse - ...  0.835686
3       10 Modern Software Over-Engineering Mistakes  0.829900
4  Do You Suffer From Deployment Anxiety? - DZone...  0.798516
5  Novo workaholic trabalha, pratica esportes e t...  0.792261
6           Drupal and ambitious digital experiences  0.791679
7                       Discutindo Devops na Prática  0.767872
8  Psicóloga de Harvard diz que as pessoas julgam...  0.750034
9               The technology behind preview photos  0.743581


In [52]:
# Person 50's ten strongest recorded interactions, for comparison with the recommendations above.
grouped_df.loc[grouped_df['person_id'] == 50].sort_values(by=['eventStrength'], ascending=False)[['title', 'person_id', 'eventStrength']].head(10)

Unnamed: 0,title,person_id,eventStrength
1727,Acquia Engage 2016: Day One,50,3.0
1791,Um bilhão de arquivos mostram quem vence a dis...,50,3.0
1781,Acquia Engage Awards Finalists Announced,50,3.0
1778,Sharing innovation with your competitors - Dri...,50,3.0
1769,Don't document your code. Code your documentat...,50,3.0
1747,Who sponsors Drupal development? | Dries Buytaert,50,3.0
1768,Johnson & Johnson comprará grupo suíço por US$...,50,1.0
1767,Slack and Google announce partnership focused ...,50,1.0
1770,Rating the English Proficiency of Countries an...,50,1.0
1766,Infográfico: Algoritmos para Aprendizado de Má...,50,1.0


In [53]:
# Build recommendations for the person whose person_id code is 2.
person_id = 2

recommendations = recommend(
    person_id=person_id,
    sparse_person_content=sparse_person_content,
    person_vecs=person_vecs,
    content_vecs=content_vecs,
)

print(recommendations)

                                               title     score
0                   Livro: Retrospectivas Divertidas  0.984075
1          Google Ranking Factors: The Complete List  0.841709
2  Novo workaholic trabalha, pratica esportes e t...  0.823566
3           Drupal and ambitious digital experiences  0.779686
4  ITA está oferecendo 10 cursos gratuitos a dist...  0.761514
5  How to Improve 8 Major Problem Areas for Japan...  0.755990
6                    40 Basic Japanese conversations  0.692540
7  Uber China will reportedly merge with archriva...  0.691871
8  Psicóloga de Harvard diz que as pessoas julgam...  0.685263
9                               Japanese for dummies  0.659207


In [54]:
# All recorded interactions of person 2, strongest first, for comparison with the recommendations above.
grouped_df.loc[grouped_df['person_id'] == 2].sort_values(by=['eventStrength'], ascending=False)[['title', 'eventStrength', 'person_id']]

Unnamed: 0,title,eventStrength,person_id
51,Former Google career coach shares a visual tri...,6.0,2
48,Request lesson : How and when to use はず(=hazu)...,3.0,2
49,Aposta na inovação,3.0,2
50,"The Algorithm March, Japan's Strangely Enterta...",3.0,2
54,Como são escrita as risadas em japonês? - Suki...,3.0,2
52,A minha viagem à Maternidade #tetodomundo,1.0,2
53,Learn Hiragana: The Ultimate Guide,1.0,2


In [55]:

# Build recommendations for the person whose person_id code is 1.
person_id = 1

recommendations = recommend(
    person_id=person_id,
    sparse_person_content=sparse_person_content,
    person_vecs=person_vecs,
    content_vecs=content_vecs,
)

print(recommendations)

                                               title     score
0  Como são escrita as risadas em japonês? - Suki...  0.869312
1  'The Simpsons' celebrates 600 episodes with a ...  0.780623
2  Ganhe 6 meses de acesso ao Pluralsight, maior ...  0.773376
3  Request lesson : How and when to use はず(=hazu)...  0.692939
4                               Jenkins 2.0 is here!  0.691413
5  Carteira inteligente tem trava biométrica e co...  0.688973
6                    Novidades do Android Studio 2.2  0.667798
7           Programação Reativa Funcional com RxJava  0.667180
8  Comercial japonês cria cidade de papel com can...  0.660740
9          Speeding up ReSharper (and Visual Studio)  0.654841


In [56]:
# All recorded interactions of person 1, strongest first, for comparison with the recommendations above.
grouped_df.loc[grouped_df['person_id'] == 1].sort_values(by=['eventStrength'], ascending=False)[['title', 'eventStrength', 'person_id']]

Unnamed: 0,title,eventStrength,person_id
44,Learn Hiragana: The Ultimate Guide,3.0,1
43,Firebase Test Lab for Android,1.0,1
45,"Fresco, sim! - Android Dev BR",1.0,1
46,Japanese for dummies,1.0,1
47,Firebase and Google Cloud: better together,1.0,1
