In [20]:
%%time

import pandas as pd
import numpy as np
import re

from gensim.corpora import Dictionary
from gensim.models import TfidfModel
from gensim.models import Word2Vec
from multiprocessing import cpu_count
from gensim.similarities import MatrixSimilarity, SparseMatrixSimilarity, SoftCosineSimilarity

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


In [19]:
users_repositories = pd.read_csv('../data/users__repositories.csv', dtype=str)
users_repositories.head()

Unnamed: 0,User,Repo_Member,mRepo_Language,Repo_Owner,oRepo_Language
0,tarruda,libmpack/libmpack,C,"tarruda/Algoritmos,tarruda/archdb,tarruda/back...","JavaScript,JavaScript,JavaScript,JavaScript,Py..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS...","mairatma/alloy-ui,mairatma/alloyui.com,mairatm...","JavaScript,JavaScript,TypeScript,JavaScript,Ja..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None","joselitojunior1/abigobaldo-nunes-adventures,jo...","JavaScript,HTML,CSS,None,None,None,None,GCC Ma..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-DS","Python,Python","marcelcaraciolo/apontador-api-libs,marcelcarac...","PHP,Python,Python,None,Python,Python,Python,No..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript...","luanfonceca/168horas,luanfonceca/4stoq,luanfon...","CSS,Ruby,Python,JavaScript,Python,Python,None,..."


In [5]:
users_repositories.describe()

Unnamed: 0,User,Repo_Member,mRepo_Language,Repo_Owner,oRepo_Language
count,1992,785,785,1630,1630.0
unique,1981,764,413,1622,1207.0
top,diogo-lins,"daidson/MonitoriaInformatica2017s2,jtdsjossany...",Java,"palaciowagner/AlepeDigital,palaciowagner/react...",
freq,2,3,78,2,132.0


In [6]:
# Factorization >>> q, r = da.linalg.qr(x)  

# Construindo o dicionario

In [30]:

repos_langs = pd.DataFrame({ 
    'User': users_repositories['User'],
    'Repos': (users_repositories['Repo_Member']+','+users_repositories['Repo_Owner']).fillna(''),
    'Languages': (users_repositories['mRepo_Language']+','+users_repositories['oRepo_Language']).fillna('')})
repos_langs.head()

Unnamed: 0,User,Repos,Languages
0,tarruda,"libmpack/libmpack,tarruda/Algoritmos,tarruda/a...","C,JavaScript,JavaScript,JavaScript,JavaScript,..."
1,mairatma,"deprecate/steel-avatar,deprecate/steel-cell-de...","CSS,CSS,HTML,CSS,HTML,CSS,JavaScript,CSS,JavaS..."
2,joselitojunior1,"acmh/maecoruja,Cisneiros/projeto-anfa,demianbo...","JavaScript,Java,Java,HTML,JavaScript,None,Java..."
3,marcelcaraciolo,"irgmedeiros/TCCRecommender,jg1141/Open-Allure-...","Python,Python,PHP,Python,Python,None,Python,Py..."
4,luanfonceca,"andrezap/analise_expressao_genica,andrezap/arv...","Java,Java,Java,Ruby,Ruby,JavaScript,JavaScript..."


In [36]:
# Selecao de caracteristicas : vetor de linguagens
languages = repos_langs['Languages'].apply(lambda x: str(x).split(','))
# languages.value_counts()?
languages.head()


0    [C, JavaScript, JavaScript, JavaScript, JavaSc...
1    [CSS, CSS, HTML, CSS, HTML, CSS, JavaScript, C...
2    [JavaScript, Java, Java, HTML, JavaScript, Non...
3    [Python, Python, PHP, Python, Python, None, Py...
4    [Java, Java, Java, Ruby, Ruby, JavaScript, Jav...
Name: Languages, dtype: object

In [38]:
#  building langs dictionary

dictionary = Dictionary(languages)
tfidf = TfidfModel(dictionary=dictionary)

print("Number of unique words: %d" % len(dictionary))
print(dictionary)
# print(dictionary.token2id)

2018-06-25 11:03:42,166 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-06-25 11:03:42,183 : INFO : built Dictionary(102 unique tokens: ['C', 'C++', 'CMake', 'CoffeeScript', 'HTML']...) from 1992 documents (total 14858 corpus positions)


Number of unique words: 102
Dictionary(102 unique tokens: ['C', 'C++', 'CMake', 'CoffeeScript', 'HTML']...)


# Ranking: Languages score vector

In [39]:
# Transforming selected preference to vector (lang_idx, freq)

new_doc = ['JavaScript','JavaScript','Python','JavaScript','JavaScript','C++','JavaScript','Lua','C']
new_vec = dictionary.doc2bow(new_doc)
new_vec

[(0, 1), (1, 1), (7, 5), (8, 1), (12, 1)]

In [40]:
tfidf[new_vec] # vector score

[(0, 0.2544226420392753),
 (1, 0.2679520076775512),
 (7, 0.7610542410669464),
 (8, 0.4989679145224941),
 (12, 0.18787842672965765)]

In [41]:
# Transforming languages to vector

bow_corpus  = [dictionary.doc2bow(text) for text in list(languages)]
bow_corpus[0:2]

[[(0, 11),
  (1, 13),
  (2, 1),
  (3, 13),
  (4, 2),
  (5, 1),
  (6, 3),
  (7, 67),
  (8, 5),
  (9, 1),
  (10, 8),
  (11, 1),
  (12, 14),
  (13, 3),
  (14, 4),
  (15, 2),
  (16, 8)],
 [(4, 7), (6, 1), (7, 81), (10, 4), (15, 2), (17, 6)]]

In [42]:
bow_corpus_df = pd.DataFrame(bow_corpus)
len(bow_corpus_df)

1992

In [33]:
# Caso demore
# bow_corpus_df.to_csv('../data/language_vectors.csv')

## Matriz de similaridades de linguagens entre usuarios

In [44]:

index = SparseMatrixSimilarity(tfidf[bow_corpus], num_features=102)

sims = index[tfidf[new_vec]]
recs = list(enumerate(sims))
# print(recs)

# LISTA DE TOP 10 USUARIOS RECOMENDADOS

top_recs = sorted(recs, key=lambda rating: rating[1], reverse=True)
print(top_recs[:10])

2018-06-25 11:03:59,582 : INFO : creating sparse index
2018-06-25 11:03:59,583 : INFO : creating sparse matrix from corpus
2018-06-25 11:03:59,584 : INFO : PROGRESS: at document #0
2018-06-25 11:03:59,637 : INFO : created <1992x102 sparse matrix of type '<class 'numpy.float32'>'
	with 5386 stored elements in Compressed Sparse Row format>


[(638, 0.87646866), (1075, 0.8125202), (0, 0.8097467), (580, 0.7825455), (197, 0.78091174), (992, 0.76595163), (133, 0.7610542), (1423, 0.7610542), (1793, 0.7610542), (160, 0.75295174)]


In [45]:
# def softcossim(query, corpus):
#     # Compute Soft Cosine Measure between the query and the documents.
#     index = SoftCosineSimilarity(
#         tfidf[corpus],
#         similarity_matrix)
#     similarities = index[tfidf[query]]
#     return similarities
w2v_model = Word2Vec(languages, workers=cpu_count(), size=102, seed=12345)
similarity_matrix = w2v_model.wv.similarity_matrix(dictionary, tfidf, nonzero_limit=100)

index = SoftCosineSimilarity( tfidf[bow_corpus], similarity_matrix)
similarities = index[tfidf[new_vec]]

recs_soft = list(enumerate(similarities))
# print(recs_soft)

top_recs_soft = sorted(recs_soft, key=lambda rating: rating[1], reverse=True)
print(top_recs_soft[:10])

2018-06-25 11:04:16,448 : INFO : collecting all words and their counts
2018-06-25 11:04:16,448 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-25 11:04:16,451 : INFO : collected 102 word types from a corpus of 14858 raw words and 1992 sentences
2018-06-25 11:04:16,452 : INFO : Loading a fresh vocabulary
2018-06-25 11:04:16,452 : INFO : min_count=5 retains 55 unique words (53% of original 102, drops 47)
2018-06-25 11:04:16,454 : INFO : min_count=5 leaves 14770 word corpus (99% of original 14858, drops 88)
2018-06-25 11:04:16,455 : INFO : deleting the raw counts dictionary of 102 items
2018-06-25 11:04:16,456 : INFO : sample=0.001 downsamples 27 most-common words
2018-06-25 11:04:16,456 : INFO : downsampling leaves estimated 2747 word corpus (18.6% of prior 14770)
2018-06-25 11:04:16,457 : INFO : estimated required memory for 55 words and 102 dimensions: 72380 bytes
2018-06-25 11:04:16,458 : INFO : resetting layer weights
2018-06-25 11:04:16,461 : INFO 

[(638, 0.9998354298827233), (146, 0.9996777832884293), (457, 0.9996682130175113), (1075, 0.9994130969745207), (197, 0.9992301040747679), (138, 0.9990556509272054), (105, 0.9987591795922601), (665, 0.9986698836566502), (0, 0.9986355714353805), (160, 0.9986295022923322)]


#End.