In [1]:
import pandas as pd
from langdetect import detect
import string
import emoji
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
import requests
import boto3
import json

import os # for touching a file to restart when new model file is generatd

import nltk
from nltk import word_tokenize 
from nltk.stem import WordNetLemmatizer 
nltk.download('punkt')
nltk.download('wordnet')

# for cortex
import cloudpickle

pd.set_option('display.max_rows',500)

[nltk_data] Downloading package punkt to /Users/dlite/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/dlite/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
########################################################
### Import Dataset
########################################################

In [3]:
raw_github_data = pd.read_csv('../data/external/2020-04-06.tsv', sep='\t', header=0)
raw_github_data.head(2)

Unnamed: 0,github_repo_url,repo_description,topics,owner_repo_name,owner_name,owner_type,organization_bio,repo_created_day,primary_language_name,license_name,...,count_of_stars,count_of_watchers,count_distinct_contributors,count_contributions,count_commits,count_commit_comments,count_created_issues,count_pull_requests_created,count_pull_requests_reviews,count_comments_on_issues_and_pull_requests
0,https://github.com/CSSEGISandData/COVID-19,"novel coronavirus (covid-19) cases, provided b...","systems-science, covid-19, johns-hopkins-unive...",CSSEGISandData/COVID-19,CSSEGISandData,User,,2020-02-04,,,...,19434,19417,2746,11609,3256,152,1669,361,119,6052
1,https://github.com/phildini/stayinghomeclub,a list of all the companies wfh or events chan...,"remote-work, covid19, covid-19, static-site",phildini/stayinghomeclub,phildini,User,,2020-03-04,Ruby,cc0-1.0,...,456,453,1091,4156,1293,3,78,1350,934,498


In [4]:
########################################################
### Clean Dataset
########################################################

In [5]:
# Filtering down to repos that are likely needing contributors based on past behavior
raw_github_data_filtered = raw_github_data[(raw_github_data['has_merged_prs'] == True) &
    (raw_github_data['has_readme'] == True) &
    (pd.isna(raw_github_data['repo_description']) == False) &
    (pd.isna(raw_github_data['primary_language_name']) == False) &
    (raw_github_data['count_distinct_contributors'] >=2)
]

In [6]:
raw_github_data_filtered.fillna(value = '',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


In [7]:
# Detect language with error handling
def detect_with_error_handle(x):
    try:
        return detect(x)
    except:
        return 'Error'
    
# Check for only latin characters
def has_only_latin_letters(text):
    char_set = string.printable + '—'
    return all((True if x in char_set else False for x in text))

# Remove punctuation
def remove_punctuation(text):
    punctuation_list = string.punctuation + '—'
    return text.translate(str.maketrans('', '', punctuation_list))

In [8]:
## Full set of text processing

# check language, limit to english, and limit repo's with latin characters. Emojis are converted in the process
raw_github_data_filtered['language'] = raw_github_data_filtered['repo_description'].apply(lambda x: 'None' if pd.isna(x) else detect_with_error_handle(str(x)))
raw_github_data_filtered = raw_github_data_filtered[raw_github_data_filtered['language'] == 'en'].copy()
raw_github_data_filtered['is_latin_only_characters'] = raw_github_data_filtered['repo_description'].apply(lambda x: has_only_latin_letters(emoji.demojize(x)))
raw_github_data_filtered = raw_github_data_filtered[raw_github_data_filtered['is_latin_only_characters'] == True].copy()

# clean up repo description, topic, and language, combine into one big bag o' words
raw_github_data_filtered['repo_description_cleaned'] = raw_github_data_filtered['repo_description'].apply(lambda x: remove_punctuation(x))
raw_github_data_filtered['topics'] = raw_github_data_filtered.apply(lambda x: remove_punctuation(str(x['topics']).replace(',','').replace('nan','')), axis=1)
raw_github_data_filtered['topics'].fillna('', inplace=True)
raw_github_data_filtered['description_plus_topics'] = raw_github_data_filtered['repo_description_cleaned']+' '+raw_github_data_filtered['topics']+' '+raw_github_data_filtered['primary_language_name']
raw_github_data_filtered.reset_index(drop=True, inplace=True)

repo_lookup = raw_github_data_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [9]:
########################################################
### Tokenize
########################################################

In [10]:
# Create class to be used by tokenizer to lemmatize... which change matches words to their roots
class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]

In [11]:
# Create a list of stop words that should be removed before tokenizing
stopwords = list(ENGLISH_STOP_WORDS) + ['covid19','coronavirus','virus','corona','covid','pandemic','sarscov2','outbreak','19','disease','2019','2019ncov','cord19','repository','repo','2020','20','covid2019','covidvirus', 'cases','case']

# Create vectorizor of n-grams using stop words and lemmatizer
word_vectorizer = CountVectorizer(ngram_range=(1,1), analyzer='word',stop_words=stopwords, tokenizer=LemmaTokenizer())

# Fit vectorizer on existing list of repos and create sparse matrix
sparse_vector_matrix = word_vectorizer.fit_transform(raw_github_data_filtered['description_plus_topics'])

  'stop_words.' % sorted(inconsistent))


In [12]:
########################################################
### Build predict function
########################################################

In [13]:
def text_recommender(input_df, word_vectorizer=word_vectorizer,  sparse_vector_matrix = sparse_vector_matrix, repo_lookup=repo_lookup):
    
    input_df['bag_of_words'] = input_df.apply(lambda x: ' '.join(x), axis = 1)
    
    # vectorize the inputted string
    #inputted_vector = word_vectorizer.transform(pd.Series(str(input_string)))
    inputted_vector = word_vectorizer.transform(input_df['bag_of_words'])
    
    # calculate cosine similarity with existing matrix
    one_dimension_cosine_sim = cosine_similarity(inputted_vector, sparse_vector_matrix)

    # creating a Series with the similarity scores in descending order
    score_series = pd.Series(one_dimension_cosine_sim[0]).sort_values(ascending = False)
    # only show matches that have some similarity
    score_series = score_series[score_series>0]

    # getting the indexes of the 10 most similar movies
    top_10_indexes = list(score_series.iloc[1:11].index)
    
    # initializing the empty list of recommended repo
    recommended_repos = repo_lookup.loc[top_10_indexes]
    return recommended_repos

In [14]:
class covid19RepoReco:
   
    ## defining objects needed for leadsModel prediction. 
    def __init__(self,
                 word_vectorizer,
                 sparse_vector_matrix,
                 repo_lookup,
                 text_recommender):
        
        ## Setting up all needed objects
        self.word_vectorizer = word_vectorizer
        self.sparse_vector_matrix = sparse_vector_matrix
        self.repo_lookup = repo_lookup
        self.text_recommender = text_recommender
    
    ## define function with processing and feeding data into prediction at the end
    def predict(self,context,model_input):
        output_df = self.text_recommender(model_input)
        return [output_df.to_dict('records')]

In [15]:
# Initialize the model
m = covid19RepoReco(word_vectorizer = word_vectorizer,
                                       sparse_vector_matrix = sparse_vector_matrix,
                                       repo_lookup = repo_lookup,
                                       text_recommender = text_recommender)

# Dump the model to disk
cloudpickle.dump(m, open("../models/model.pkl", "wb"))

# Forces uvicorn to restart (if running) as it currently only watches py files
# https://github.com/encode/uvicorn/pull/521/files
os.system("touch ../models/restart.py")

# Test the model
input = pd.DataFrame([["Python", "Data"]])
output = m.predict(None,input)
output



[[{'github_repo_url': 'https://github.com/mmarchegiani/covid-19',
   'repo_description': 'data analysis of the covid-19 outbreak',
   'topics': '',
   'owner_repo_name': 'mmarchegiani/covid-19',
   'owner_name': 'mmarchegiani',
   'owner_type': 'User',
   'organization_bio': '',
   'repo_created_day': '2020-03-07',
   'primary_language_name': 'Python',
   'license_name': '',
   'is_github_pages': False,
   'has_readme': True,
   'has_wiki': True,
   'has_merged_prs': True,
   'has_issues': True,
   'has_contributor_guide': False,
   'has_code_of_conduct': False,
   'count_of_public_forks': 2,
   'count_of_stars': 0,
   'count_of_watchers': 0,
   'count_distinct_contributors': 6,
   'count_contributions': 26,
   'count_commits': 17,
   'count_commit_comments': 0,
   'count_created_issues': 0,
   'count_pull_requests_created': 9,
   'count_pull_requests_reviews': 0,
   'count_comments_on_issues_and_pull_requests': 0,
   'language': 'en',
   'is_latin_only_characters': True,
   'repo_desc