In [2]:
%%time

import pandas as pd
import numpy as np
import re

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from multiprocessing import cpu_count
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, SoftCosineSimilarity

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


CPU times: user 917 ms, sys: 246 ms, total: 1.16 s
Wall time: 1.58 s


In [3]:
users_repositories = pd.read_csv('../data/users__repositories.csv', dtype=str)
users_repositories.head()

Unnamed: 0,User,Repo_Member,mRepo_Language,Repo_Owner,oRepo_Language
0,tarruda,libmpack/libmpack,C,"tarruda/Algoritmos,tarruda/archdb,tarruda/back...","JavaScript,JavaScript,JavaScript,JavaScript,Py..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS...","mairatma/alloy-ui,mairatma/alloyui.com,mairatm...","JavaScript,JavaScript,TypeScript,JavaScript,Ja..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None","joselitojunior1/abigobaldo-nunes-adventures,jo...","JavaScript,HTML,CSS,None,None,None,None,GCC Ma..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-DS","Python,Python","marcelcaraciolo/apontador-api-libs,marcelcarac...","PHP,Python,Python,None,Python,Python,Python,No..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript...","luanfonceca/168horas,luanfonceca/4stoq,luanfon...","CSS,Ruby,Python,JavaScript,Python,Python,None,..."


In [4]:
repos_langs = pd.DataFrame({ 
    'User': users_repositories['User'],
    'Repos': (users_repositories['Repo_Member']+','+users_repositories['Repo_Owner']).fillna(''),
    'Languages': (users_repositories['mRepo_Language']+','+users_repositories['oRepo_Language']).fillna('')})
repos_langs.head()

Unnamed: 0,User,Repos,Languages
0,tarruda,"libmpack/libmpack,tarruda/Algoritmos,tarruda/a...","C,JavaScript,JavaScript,JavaScript,JavaScript,..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None,Java..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-...","Python,Python,PHP,Python,Python,None,Python,Py..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript..."


In [5]:
repos = repos_langs['Repos']\
            .apply(lambda x: str(x).split(','))\
            .apply(lambda x: np.concatenate([r.split('/') for r in x]))
repos.head()

0    [libmpack, libmpack, tarruda, Algoritmos, tarr...
1    [deprecate, steel-avatar, deprecate, steel-cel...
2    [acmh, maecoruja, Cisneiros, projeto-anfa, dem...
3    [irgmedeiros, TCCRecommender, jg1141, Open-All...
4    [andrezap, analise_expressao_genica, andrezap,...
Name: Repos, dtype: object

In [6]:
repos[0][0:20]

array(['libmpack', 'libmpack', 'tarruda', 'Algoritmos', 'tarruda',
       'archdb', 'tarruda', 'backbone-deep-model', 'tarruda',
       'backbone-jquerymobile', 'tarruda', 'beets', 'tarruda',
       'better-sqlite3', 'tarruda', 'bootstrap-datetimepicker', 'tarruda',
       'brightray', 'tarruda', 'buffer-prefix-range'], dtype='<U32')

In [7]:
#  building langs dictionary

dictionary = Dictionary(repos)
tfidf = TfidfModel(dictionary=dictionary)

print("Number of unique words: %d" % len(dictionary))
print(dictionary)
# print(dictionary.token2id)

2018-06-25 10:59:23,784 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-25 10:59:23,834 : INFO : built Dictionary(12418 unique tokens: ['Algoritmos', 'CodeMirror', 'DefinitelyTyped', 'MaidSafe', 'MemoryModule']...) from 1992 documents (total 28472 corpus positions)


Number of unique words: 12418
Dictionary(12418 unique tokens: ['Algoritmos', 'CodeMirror', 'DefinitelyTyped', 'MaidSafe', 'MemoryModule']...)


In [8]:
new_doc = repos[4]
new_doc[1:20]

array(['analise_expressao_genica', 'andrezap', 'arvoresLP2', 'andrezap',
       'IlicitaApp', 'IgorMarques', 'SlackAndGemsCounter', 'IgorMarques',
       'StackOverflowUserChecker', 'maximiliano', 'curso-python',
       'peritoeconomista', 'mouraoadvocacia', 'peritoeconomista', 'site',
       'peritoeconomista', 'valorefetivo', 'Smartwork-Sebrae', 'web'],
      dtype='<U27')

In [9]:
# Transforming selected preference to vector (lang_idx, freq)
new_vec = dictionary.doc2bow(new_doc)
new_vec

[(348, 1),
 (369, 1),
 (370, 1),
 (371, 1),
 (372, 1),
 (373, 1),
 (374, 1),
 (375, 1),
 (376, 1),
 (377, 1),
 (378, 2),
 (379, 1),
 (380, 1),
 (381, 1),
 (382, 1),
 (383, 1),
 (384, 1),
 (385, 1),
 (386, 1),
 (387, 1),
 (388, 1),
 (389, 1),
 (390, 1),
 (391, 1),
 (392, 3),
 (393, 1),
 (394, 1),
 (395, 1),
 (396, 1),
 (397, 1),
 (398, 1),
 (399, 1),
 (400, 1),
 (401, 1),
 (402, 1),
 (403, 1),
 (404, 1),
 (405, 1),
 (406, 1),
 (407, 1),
 (408, 1),
 (409, 1),
 (410, 1),
 (411, 1),
 (412, 1),
 (413, 1),
 (414, 1),
 (415, 1),
 (416, 1),
 (417, 1),
 (418, 1),
 (419, 1),
 (420, 1),
 (421, 1),
 (422, 1),
 (423, 1),
 (424, 1),
 (425, 1),
 (426, 1),
 (427, 1),
 (428, 1),
 (429, 1),
 (430, 1),
 (431, 1),
 (432, 1),
 (433, 1),
 (434, 1),
 (435, 1),
 (436, 1),
 (437, 1),
 (438, 1),
 (439, 1),
 (440, 1),
 (441, 1),
 (442, 1),
 (443, 1),
 (444, 1),
 (445, 1),
 (446, 1),
 (447, 1),
 (448, 1),
 (449, 1),
 (450, 1),
 (451, 1),
 (452, 113),
 (453, 1),
 (454, 1),
 (455, 1),
 (456, 1),
 (457, 1),
 (458, 1

In [10]:
# Transforming languages to vector

bow_corpus  = [dictionary.doc2bow(text) for text in list(repos)]
bow_corpus[0:2]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 1),
  (30, 1),
  (31, 1),
  (32, 1),
  (33, 1),
  (34, 1),
  (35, 1),
  (36, 1),
  (37, 1),
  (38, 1),
  (39, 1),
  (40, 1),
  (41, 1),
  (42, 1),
  (43, 1),
  (44, 1),
  (45, 1),
  (46, 1),
  (47, 1),
  (48, 1),
  (49, 1),
  (50, 1),
  (51, 1),
  (52, 1),
  (53, 1),
  (54, 1),
  (55, 1),
  (56, 1),
  (57, 1),
  (58, 1),
  (59, 1),
  (60, 1),
  (61, 1),
  (62, 1),
  (63, 2),
  (64, 1),
  (65, 1),
  (66, 1),
  (67, 1),
  (68, 1),
  (69, 1),
  (70, 1),
  (71, 1),
  (72, 1),
  (73, 1),
  (74, 1),
  (75, 1),
  (76, 1),
  (77, 1),
  (78, 1),
  (79, 1),
  (80, 1),
  (81, 1),
  (82, 1),
  (83, 1),
  (84, 1),
  (85, 1),
  (86, 1),
  (87, 1),
  (88, 1),
  (89, 1),
  (90, 1),
  (91, 1)

In [12]:
index = SparseMatrixSimilarity(tfidf[bow_corpus], num_features=12418)

similarities = index[tfidf[new_vec]]
recomendations = list(enumerate(similarities))
# print(recs)

# LISTA DE TOP 10 USUARIOS RECOMENDADOS

top_recomendations = sorted(recomendations, key=lambda rating: rating[1], reverse=True)
print(top_recomendations[:10])

2018-06-25 11:00:29,827 : INFO : creating sparse index
2018-06-25 11:00:29,828 : INFO : creating sparse matrix from corpus
2018-06-25 11:00:29,830 : INFO : PROGRESS: at document #0
2018-06-25 11:00:29,938 : INFO : created <1992x12418 sparse matrix of type '<class 'numpy.float32'>'
	with 16362 stored elements in Compressed Sparse Row format>


[(4, 1.0000007), (9, 0.077480696), (1653, 0.0020082393), (519, 0.0011141422), (682, 0.00094680913), (66, 0.0007671181), (737, 0.000764372), (171, 0.0007139052), (665, 0.00063469785), (184, 0.0005777103)]


end model.