In [1]:
import requests, json

def searchRepos(search):
    req = "https://api.github.com/search/repositories?q=" + search
    res = requests.get(req).json()
    return res['items']

def getModules(string):
    req = string.replace("github", "raw.githubusercontent") + '/master/package.json'
    res = requests.get(req).json()
    modules = list(res['devDependencies'].keys())
    return modules
    
    
def parseInfo(repoList):
    descModulePairings = []
    for i in range(len(repoList)):
        try:
            descModulePairings.append([repoList[i]['description'], getModules(repoList[i]['html_url'])])
        except:
            pass
    return descModulePairings

In [2]:
repoList = searchRepos('language:js')
descModulePairings = parseInfo(repoList)

In [3]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.stem import SnowballStemmer
stop_words = set(stopwords.words('english'))
tokenizer = RegexpTokenizer(r'\w+')
stemmer = SnowballStemmer('english')
from langdetect import detect
import copy

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hayden/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
def cleanData(pairings):
    newPairings = []
    for i in range(len(pairings)):
        if detect(pairings[i][0]) != 'en':
            pass
        pairings[i][0] = tokenizer.tokenize(pairings[i][0].lower())
        pairings[i][0] = [stemmer.stem(word) for word in pairings[i][0] if (word not in stop_words)]
        newPairings.append(pairings[i])
    return(newPairings)

In [5]:
tempPairs = copy.deepcopy(descModulePairings)
pairings = cleanData(tempPairs)

In [6]:
def restructure(pairings):
    data = {}
    for i in range(len(pairings)):
        for module in pairings[i][1]:
            if module in data.keys():
                data[module] += pairings[i][0]
            else:
                data[module] = pairings[i][0]
    for key in data:
        data[key] = list(dict.fromkeys(data[key]))
    return(data)

In [7]:
moduleDict = restructure(pairings)

In [8]:
from collections import Counter

def removeCommonWords(moduleDict):
    allWords = []
    for key in moduleDict:
        for word in moduleDict[key]:
            allWords.append(word)
    count = Counter(allWords)
    mostCommon = count.most_common(int(len(count)*.25))
    mostCommonList = [mostCommon[i][0] for i in range(len(mostCommon))]
    for key in moduleDict:
        for word in moduleDict[key]:
            if word in mostCommonList:
                moduleDict[key].remove(word)
        moduleDict[key] = " ".join(moduleDict[key])
    return moduleDict, mostCommonList

newModuleDict, mostCommonList = removeCommonWords(moduleDict)

In [9]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [10]:
df = pd.DataFrame.from_dict(moduleDict, orient='index')
df.columns = ['bagOfWords']
df.head()

Unnamed: 0,bagOfWords
@freecodecamp/eslint-config,freecodecamp org open sourc codebas curriculum...
babel-eslint,freecodecamp org open sourc codebas curriculum...
cross-env,freecodecamp org open sourc codebas curriculum...
debug,freecodecamp org open sourc codebas curriculum...
docsify-cli,freecodecamp org open sourc codebas curriculum...


In [11]:
count = CountVectorizer()
count_matrix = count.fit_transform(df['bagOfWords'])
cosine_sim = cosine_similarity(count_matrix, count_matrix)
print(cosine_sim)

[[1.         1.         1.         ... 0.23791548 0.23791548 0.23791548]
 [1.         1.         1.         ... 0.23791548 0.23791548 0.23791548]
 [1.         1.         1.         ... 0.23791548 0.23791548 0.23791548]
 ...
 [0.23791548 0.23791548 0.23791548 ... 1.         1.         1.        ]
 [0.23791548 0.23791548 0.23791548 ... 1.         1.         1.        ]
 [0.23791548 0.23791548 0.23791548 ... 1.         1.         1.        ]]


In [12]:
def cleanUp(string):
    words = tokenizer.tokenize(string.lower())
    stemmedWords = [stemmer.stem(word) for word in words if (word not in stop_words)]
    for word in stemmedWords:
        if word in mostCommonList:
            stemmedWords.remove(word)
    return " ".join(stemmedWords)


In [13]:
def recommendSimilarModules(title, amount, cosine_sim = cosine_sim):
    indices = pd.Series(df.index)
    recommended_modules = []
    i = indices[indices == title].index[0]
    score_series = pd.Series(cosine_sim[i]).sort_values(ascending = False)
    top_indices = list(score_series.iloc[1:amount].index)
    for j in top_indices:
        recommended_modules.append(list(df.index)[j])
    return recommended_modules

In [14]:
def recommendModules(description, amount, cosine_sim = cosine_sim):
    cleanData = cleanUp(description)
    count = CountVectorizer()
    dfCopy = df.copy()
    dfCopy.loc["TEMP_SUBJECT"] = cleanData
    count_matrix = count.fit_transform(dfCopy['bagOfWords'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)
    indices = pd.Series(dfCopy.index)
    recommended_modules = []
    score_series = pd.Series(cosine_sim[len(cosine_sim) - 1]).sort_values(ascending = False)
    top_indices = list(score_series.iloc[1:amount].index)
    for j in top_indices:
        recommended_modules.append(list(df.index)[j])
    return recommended_modules

In [17]:
# recommendSimilarModules('grunt', 250)
recommendModules('javascript template for message queues or frontend react app', 25)

['normalize.css',
 'apache-server-configs',
 'archiver',
 'del',
 'eslint-config-recommended',
 'gulp-autoprefixer',
 'gulp-header',
 'gulp-load-plugins',
 'main.css',
 'modernizr',
 'ssri',
 'grunt-contrib-uglify',
 'grunt-compare-size',
 'raw-body',
 'grunt-git-authors',
 'grunt-jsonlint',
 'grunt-newer',
 'gzip-js',
 'insight',
 'karma-jsdom-launcher',
 'karma-qunit',
 'native-promise-only',
 'testswarm',
 'strip-json-comments']