In [111]:
%%time

import pandas as pd
import numpy as np
import re

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from multiprocessing import cpu_count
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, SoftCosineSimilarity

# import logging
# logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


CPU times: user 22 µs, sys: 1 µs, total: 23 µs
Wall time: 26.2 µs


In [112]:
users_repositories = pd.read_csv('../data/users__repositories.csv', dtype=str)
users_repositories.head()

Unnamed: 0,User,Repo_Member,mRepo_Language,Repo_Owner,oRepo_Language
0,tarruda,libmpack/libmpack,C,"tarruda/Algoritmos,tarruda/archdb,tarruda/back...","JavaScript,JavaScript,JavaScript,JavaScript,Py..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS...","mairatma/alloy-ui,mairatma/alloyui.com,mairatm...","JavaScript,JavaScript,TypeScript,JavaScript,Ja..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None","joselitojunior1/abigobaldo-nunes-adventures,jo...","JavaScript,HTML,CSS,None,None,None,None,GCC Ma..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-DS","Python,Python","marcelcaraciolo/apontador-api-libs,marcelcarac...","PHP,Python,Python,None,Python,Python,Python,No..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript...","luanfonceca/168horas,luanfonceca/4stoq,luanfon...","CSS,Ruby,Python,JavaScript,Python,Python,None,..."


In [113]:
users_repositories.describe()

Unnamed: 0,User,Repo_Member,mRepo_Language,Repo_Owner,oRepo_Language
count,1992,785,785,1630,1630.0
unique,1981,764,413,1622,1207.0
top,diogo-lins,"daidson/MonitoriaInformatica2017s2,jtdsjossany...",Java,"palaciowagner/AlepeDigital,palaciowagner/react...",
freq,2,3,78,2,132.0


In [114]:
# Reindexing

repos_langs = pd.DataFrame({ 
    'User': users_repositories['User'],
    'Repos': (users_repositories['Repo_Member']+','+users_repositories['Repo_Owner']).fillna(''),
    'Languages': (users_repositories['mRepo_Language']+','+users_repositories['oRepo_Language']).fillna('')})
repos_langs.head()

Unnamed: 0,User,Repos,Languages
0,tarruda,"libmpack/libmpack,tarruda/Algoritmos,tarruda/a...","C,JavaScript,JavaScript,JavaScript,JavaScript,..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None,Java..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-...","Python,Python,PHP,Python,Python,None,Python,Py..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript..."


# Construindo o dicionario: frequencia das palavras

In [115]:
# Selecao de caracteristicas : vetor de linguagens
languages = repos_langs['Languages'].apply(lambda x: str(x).split(','))
languages.head()

0    [C, JavaScript, JavaScript, JavaScript, JavaSc...
1    [CSS, CSS, HTML, CSS, HTML, CSS, JavaScript, C...
2    [JavaScript, Java, Java, HTML, JavaScript, Non...
3    [Python, Python, PHP, Python, Python, None, Py...
4    [Java, Java, Java, Ruby, Ruby, JavaScript, Jav...
Name: Languages, dtype: object

In [116]:
#  building dictionary based on languages dataset
dictionary = Dictionary(languages)

print("Number of unique words: %d" % len(dictionary))
print(dictionary)
# print(dictionary.token2id)

2018-06-25 13:32:11,270 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-25 13:32:11,286 : INFO : built Dictionary(102 unique tokens: ['C', 'C++', 'CMake', 'CoffeeScript', 'HTML']...) from 1992 documents (total 14858 corpus positions)


Number of unique words: 102
Dictionary(102 unique tokens: ['C', 'C++', 'CMake', 'CoffeeScript', 'HTML']...)


In [117]:
# Transforming dataset values = languages to freq vectors
# vetor de frequencia de palavras
bow_corpus = [dictionary.doc2bow(text) for text in list(languages)] #frequency vector for all languages dataset

bow_corpus_df = pd.DataFrame(bow_corpus)

print(bow_corpus_df.describe())


             0        1        2        3        4        5        6   \
count      1992      695      612      509      417      331      253   
unique       82      110      141      145      120      128      106   
top     (30, 1)  (10, 1)  (10, 1)  (10, 1)  (17, 1)  (17, 1)  (17, 1)   
freq       1244       51       49       34       32       29       17   

             7        8        9        10       11       12       13  \
count       180      127       87       66       44       27       15   
unique       91       80       56       46       32       20       14   
top     (15, 1)  (17, 1)  (19, 1)  (35, 1)  (32, 1)  (41, 1)  (28, 1)   
freq          8        8        6        6        4        4        2   

             14       15       16       17       18       19  
count        10        8        6        4        2        1  
unique       10        8        6        4        2        1  
top     (62, 1)  (49, 4)  (42, 1)  (68, 2)  (92, 1)  (53, 2)  
freq          1 

## Model para estimativa de similaridade dos n items: ranking

In [118]:
#  Traing Model for predicions: tf-idf

tfidf = TfidfModel(dictionary=dictionary)

In [119]:
# Transforming novo exemplo to vector (lang_idx, freq)

new_doc = ['JavaScript','JavaScript','Python','JavaScript','JavaScript','C++','JavaScript','Lua','C'] # 
new_vec = dictionary.doc2bow(new_doc) # frequency vector
new_vec

[(0, 1), (1, 1), (7, 5), (8, 1), (12, 1)]

In [120]:
# ranking dos termos presentes em new doc
print(tfidf[new_vec]) # vector score

[(0, 0.2544226420392753), (1, 0.2679520076775512), (7, 0.7610542410669464), (8, 0.4989679145224941), (12, 0.18787842672965765)]


In [121]:
# Caso demore
# bow_corpus_df.to_csv('../data/language_vectors.csv')

## Ranking: Matriz de similaridades usando similaridade de Cosenos

In [122]:
# Tecnica: Calculo de similaridade com Cosin

index = SparseMatrixSimilarity(tfidf[bow_corpus], num_features=102)

similarities = index[tfidf[new_vec]]
recomendations = list(enumerate(similarities))
# print(recs)

2018-06-25 13:32:26,462 : INFO : creating sparse index
2018-06-25 13:32:26,463 : INFO : creating sparse matrix from corpus
2018-06-25 13:32:26,464 : INFO : PROGRESS: at document #0
2018-06-25 13:32:26,521 : INFO : created <1992x102 sparse matrix of type '<class 'numpy.float32'>'
	with 5386 stored elements in Compressed Sparse Row format>


In [123]:
# LISTA DE TOP 10 USUARIOS RECOMENDADOS cosin

top_recs = sorted(recomendations, key=lambda rating: rating[1], reverse=True)
print(top_recs[:10])

[(638, 0.87646866), (1075, 0.8125202), (0, 0.8097467), (580, 0.7825455), (197, 0.78091174), (992, 0.76595163), (133, 0.7610542), (1423, 0.7610542), (1793, 0.7610542), (160, 0.75295174)]


In [124]:
# 
# Algoritmo: Calculo de similaridade com SofCosin

w2v_model = Word2Vec(languages, workers=cpu_count(), size=102, seed=12345)
similarity_matrix = w2v_model.wv.similarity_matrix(dictionary, tfidf, nonzero_limit=100)

index = SoftCosineSimilarity( tfidf[bow_corpus], similarity_matrix)
similarities = index[tfidf[new_vec]]

recs_soft = list(enumerate(similarities))
# print(recs_soft)

top_recs_soft = sorted(recs_soft, key=lambda rating: rating[1], reverse=True)
print(top_recs_soft[:10])

2018-06-25 13:32:27,648 : INFO : collecting all words and their counts
2018-06-25 13:32:27,648 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-25 13:32:27,651 : INFO : collected 102 word types from a corpus of 14858 raw words and 1992 sentences
2018-06-25 13:32:27,652 : INFO : Loading a fresh vocabulary
2018-06-25 13:32:27,653 : INFO : min_count=5 retains 55 unique words (53% of original 102, drops 47)
2018-06-25 13:32:27,653 : INFO : min_count=5 leaves 14770 word corpus (99% of original 14858, drops 88)
2018-06-25 13:32:27,654 : INFO : deleting the raw counts dictionary of 102 items
2018-06-25 13:32:27,654 : INFO : sample=0.001 downsamples 27 most-common words
2018-06-25 13:32:27,655 : INFO : downsampling leaves estimated 2747 word corpus (18.6% of prior 14770)
2018-06-25 13:32:27,655 : INFO : estimated required memory for 55 words and 102 dimensions: 72380 bytes
2018-06-25 13:32:27,656 : INFO : resetting layer weights
2018-06-25 13:32:27,658 : INFO 

[(638, 0.9998536365774648), (457, 0.9997082420005873), (146, 0.9996820152887094), (1075, 0.9994663061300215), (197, 0.9992345196987942), (138, 0.9991041969615647), (160, 0.9988492672472518), (105, 0.9987969803907915), (665, 0.9987202075932055), (0, 0.9986966613826829)]


#End.

# Avaliação