## Load Required Libraries

In [1]:
import pandas as pd
import os
import ast
import json
import re
import numpy as np
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
import gensim
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# %matplotlib inline

import warnings
warnings.simplefilter('ignore')

## Data Preparation

### 01-Read Raw Data

In [2]:
path = './Data/'

# One-off reduction of the full projects CSV to the columns used here (kept for reference).
# Requires projects_with_repository_fields-1.6.0-2020-01-12.csv in the Data folder.
# data = pd.read_csv(path + 'projects_with_repository_fields-1.6.0-2020-01-12.csv',
#                    usecols=['ID', 'Name', 'Description', 'Repository URL', 'Language'],
#                    sep=',', keep_default_na=False, index_col=False)
# data.to_csv(path + 'selected.csv', index=False)

# keep_default_na=False leaves empty fields as '' rather than NaN, so every
# description stays a string.
data = pd.read_csv(os.path.join(path, 'selected_2.csv'), keep_default_na=False)
data.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Description,Repository URL,Language
0,181,187,XprobePlugin,Xcode extension for viewing an application's o...,https://github.com/johnno1962/XprobePlugin,JavaScript
1,388,394,fluent-logger,A structured logger for Fluentd.,https://github.com/fluent/fluent-logger-d,JavaScript
2,484,490,oculus-d-rift,A D binding to the Oculus Rift API.,https://github.com/Circular-Studios/Oculus-D-Rift,C++
3,529,535,socket.io,Socket.Io implementation for vibe.d,https://github.com/eldar/socket.io-d,JavaScript
4,579,585,vibenotes,Embeddable real-time collaborative text editor,https://github.com/rejectedsoftware/vibenotes,JavaScript


### 02-Data Preprocessing

In [3]:
# Top languages were picked based on https://githut.info/; the data may need further clean-up so that it only includes these languages.

langStr = '(JavaScript)(Python)(Java)(PHP)(C++)(Ruby)'

def pre_process(desc):
    """Clean and tokenize a collection of description strings.

    Each string is lower-cased, characters outside the allowed set are replaced
    by spaces, the text is tokenized with ``nltk.word_tokenize`` and tokens are
    kept only when they are not English stop words and have between 4 and 14
    characters (after stripping whitespace).

    Parameters
    ----------
    desc : iterable of str
        Descriptions to process (e.g. a pandas Series).

    Returns
    -------
    pandas.Series
        One list of tokens per input string, with a default integer index.
    """
    # A set gives O(1) membership checks; the stop-word list is scanned once per token otherwise.
    stop_words = set(stopwords.words("english"))

    # NOTE: langStr is spliced into a character class, so it does not whitelist the
    # language names as words. Because the text is lower-cased first, its only
    # practical effect is that '(', ')' and '+' are kept in addition to a-z.
    not_allowed = re.compile('[^a-z' + langStr + ']')

    desc_stop = []
    for text in desc:
        # Remove non english characters
        cleaned = not_allowed.sub(' ', text.lower())
        # Tokenization
        tokens = nltk.word_tokenize(cleaned)
        # Removing stop words and very short / very long tokens
        desc_stop.append([t for t in tokens
                          if (t not in stop_words) and (3 < len(t.strip()) < 15)])

    # dtype=object keeps the result well-defined for empty input too.
    return pd.Series(desc_stop, dtype=object)

In [4]:
# Run the preprocessing pipeline (pre_process) over every project description;
# desc_pp holds one token list per row of `data`.
desc = data['Description']
desc_pp = pre_process(desc)

In [5]:
# Combine the token lists with the original project metadata.
data_tokens = pd.DataFrame({'ID': list(data['ID']),
                            'Name': list(data['Name']),
                            'Desc_Tokens': desc_pp,
                            'Description': list(data['Description']),
                            'RepoUrl': list(data['Repository URL']),
                            'Lang': list(data['Language'])
                           })
# Persist the tokenized table so the next cell can reload it from disk.
data_tokens.to_csv(path + 'tokenized_2.csv', index=False)
# Preview last: only the final expression of a cell is displayed.
data_tokens.head()


In [6]:
# Reload the tokenized table; the token lists were written to CSV as their string repr.
data_tokens = pd.read_csv(path + 'tokenized_2.csv', sep=',', keep_default_na=False)
# Turn the stringified lists back into real Python lists.
data_tokens['Desc_Tokens'] = data_tokens['Desc_Tokens'].apply(ast.literal_eval)
# Keep Python projects only. .copy() makes this an independent frame, so the
# columns added to it in later cells are not writes into a filtered slice.
data_tokens = data_tokens[data_tokens['Lang'] == 'Python'].copy()
data_tokens.head()

Unnamed: 0,ID,Name,Desc_Tokens,Description,RepoUrl,Lang
983,2661,angular-django-rest-resource,"[angularjs, module, provides, resource, genera...",An AngularJS module that provides a resource-g...,https://github.com/blacklocus/angular-django-r...,Python
5357,9284,django-autocomplete-light,"[fresh, approach, autocomplete, specially, dja...",A fresh approach to autocomplete implementatio...,https://github.com/yourlabs/django-autocomplet...,Python
7810,12777,icinga2tst,"[icinga, auto, commit, push]",for Icinga2 auto commit and push,https://github.com/alberthan/Icinga2Tst.git,Python
9210,14699,jquery-pjaxr,"[pushstate, ajax, extended, replacements]",Pushstate aJAX eXtended Replacements,https://github.com/minddust/jquery-pjaxr.git,Python
10312,16109,kotti,"[user, friendly, light, weight, extensible, co...","A user-friendly, light-weight and extensible w...",https://github.com/disko/Kotti.git,Python


### 03-Example

In [7]:
# Description text plus its character count, used to pick a walkthrough example.
length = data_tokens['Description'].apply(len)
data_example = data_tokens[['Description']].assign(Desc_Length=length)
data_example.head()

Unnamed: 0,Description,Desc_Length
983,An AngularJS module that provides a resource-g...,131
5357,A fresh approach to autocomplete implementatio...,191
7810,for Icinga2 auto commit and push,32
9210,Pushstate aJAX eXtended Replacements,36
10312,"A user-friendly, light-weight and extensible w...",128


In [9]:
# Step 0 - raw data: the second description is the running example.
raw_title = 'Raw Data'
example = data_example['Description'].iat[1]
raw_result = example
raw_result

'A fresh approach to autocomplete implementations, specially for Django. Status: v3 stable, 2.x.x stable, 1.x.x deprecated. Please DO regularely ping us with your link at #yourlabs IRC channel'

In [10]:
# Step 1 - replace every character outside the allowed set with a space
# (same pattern that pre_process builds from langStr).
re_title = 'Remove non-English Words'
re_result = list(map(
    lambda text: re.sub('[^a-z(JavaScript)(Python)(Java)(PHP)(C++)(Ruby)]', ' ', text.lower()),
    pd.Series(example),
))
re_result

['a fresh approach to autocomplete implementations  specially for django  status  v  stable    x x stable    x x deprecated  please do regularely ping us with your link at  yourlabs irc channel']

In [11]:
# Step 2 - tokenization: split each cleaned string into word tokens.
tk_title = 'Tokenlization'
tk_result = list(map(nltk.word_tokenize, re_result))
print(tk_result)

[['a', 'fresh', 'approach', 'to', 'autocomplete', 'implementations', 'specially', 'for', 'django', 'status', 'v', 'stable', 'x', 'x', 'stable', 'x', 'x', 'deprecated', 'please', 'do', 'regularely', 'ping', 'us', 'with', 'your', 'link', 'at', 'yourlabs', 'irc', 'channel']]


In [12]:
# Step 3 - drop English stop words and tokens shorter than 4 or longer than 14 characters.
rs_title = 'Removing Stop Words'
stop_words = stopwords.words("english")
rs_result = [
    [word for word in sentence_tokens
     if not (word in stop_words or len(word.strip()) <= 3 or len(word.strip()) >= 15)]
    for sentence_tokens in tk_result
]
rs_result

[['fresh',
  'approach',
  'autocomplete',
  'specially',
  'django',
  'status',
  'stable',
  'stable',
  'deprecated',
  'please',
  'regularely',
  'ping',
  'link',
  'yourlabs',
  'channel']]

In [13]:
# Show every preprocessing step next to the result it produced for the example.
# A dedicated name is used so the raw DataFrame `data` from the loading cell is not
# overwritten by a dict, which would break re-running the earlier cells.
steps_summary = {'Step': [raw_title, re_title, tk_title, rs_title],
                 'Results': [raw_result, re_result, tk_result, rs_result]}
df = pd.DataFrame(steps_summary, columns=['Step', 'Results'])
pd.set_option('display.max_colwidth', 100)
df

Unnamed: 0,Step
0,Raw Data
1,Remove non-English Words
2,Tokenlization
3,Removing Stop Words


### 04-Train Word2Vec

In [None]:
def train_model(train_data, min_count=5, **word2vec_kwargs):
    """Train a gensim Word2Vec model on tokenized descriptions.

    Parameters
    ----------
    train_data : iterable of list of str
        One token list per document.
    min_count : int, default 5
        Passed to ``gensim.models.Word2Vec`` as ``min_count``. The default
        matches the value that used to be hard-coded here.
    **word2vec_kwargs
        Any further keyword arguments are forwarded unchanged to
        ``gensim.models.Word2Vec``.

    Returns
    -------
    The object returned by ``gensim.models.Word2Vec``.
    """
    model = gensim.models.Word2Vec(train_data, min_count=min_count, **word2vec_kwargs)
    return model

In [None]:
# Languages to train a model for. Only Python is enabled; the full list is kept for reference.
#dict_language = {'0': 'Python', '1': 'C++', '2': 'C#', '3': 'Java', '4': 'TypeScript', '5': 'Shell', '6': 'C',
#                 '7': 'Ruby', '8': 'PHP', '9': 'JavaScript', '10': 'CSS', '11': 'Go' }
dict_language = {'0': 'Python' }

for value in dict_language.values():
    desc_data = list(data_tokens[data_tokens['Lang'] == value]['Desc_Tokens'])

    # Train model and save it in gensim's own format (written to the working directory)
    model_name = 'word2vec_model_' + value
    trained_model = train_model(desc_data)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Also pickle the model into the data folder; this is the file the later cells load.
    # The context manager closes the file even if pickling fails.
    word2vec_pickle_path = path + 'desc_word2vec_' + value + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

In [None]:
#dict_language = {'0': 'Python', '1': 'C++', '2': 'C#', '3': 'Java', '4': 'TypeScript', '5': 'Shell', '6': 'C',
#                 '7': 'Ruby', '8': 'PHP', '9': 'JavaScript', '10': 'CSS', '11': 'Go' }
dict_language = {'0': 'Python' }

# Results are collected in plain lists and attached once at the end. Writing through
# data_tokens['col'].iloc[i] = ... is chained assignment and is not guaranteed to
# update the frame. Rows whose language has no model keep None.
desc_vectors_col = [None] * len(data_tokens)
average_pooling_col = [None] * len(data_tokens)

for value in dict_language.values():
    word2vec_pickle_path = path + 'desc_word2vec_' + value + '.bin'

    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
    # The pickled object is the full Word2Vec model; its vectors live under `.wv`.
    # Fall back to the object itself if it already is a KeyedVectors instance.
    word_vectors = getattr(model, 'wv', model)

    # Calculate the vectors for each description
    for i, (lang, desc_tokens) in enumerate(zip(data_tokens['Lang'], data_tokens['Desc_Tokens'])):
        if lang != value:
            continue
        desc_vectors = []
        for token in desc_tokens:
            try:
                desc_vectors.append(word_vectors[token])
            except KeyError:
                # Token is not in the model vocabulary (e.g. below min_count): skip it
                continue
        # Vectors for each token
        desc_vectors_col[i] = desc_vectors
        # Average pooling of all token vectors (empty list when no token was known)
        average_pooling_col[i] = list(pd.DataFrame(desc_vectors).mean())

data_tokens = data_tokens.assign(
    Desc_Vectors=pd.Series(desc_vectors_col, index=data_tokens.index, dtype=object),
    Average_Pooling=pd.Series(average_pooling_col, index=data_tokens.index, dtype=object),
)

In [None]:
# Store the tokens as one space-separated string per row. Values that are already
# strings are left alone so re-running this cell does not split them into characters.
data_tokens['Desc_Tokens'] = [tokens if isinstance(tokens, str) else " ".join(tokens)
                              for tokens in data_tokens['Desc_Tokens']]
# Desc_Length is the character length of the joined token string.
length = data_tokens['Desc_Tokens'].apply(len)
data_tokens = data_tokens.assign(Desc_Length=length)
data_tokens.head()

In [None]:
# Export the table as a JSON list of records.
# The frame is serialized by pandas, parsed back into Python objects, and then
# written out with the standard json module.
data_json = json.loads(data_tokens.to_json(orient='records'))

with open(path + 'Desc_Word2Vec.json', 'w') as outfile:
    outfile.write(json.dumps(data_json))

## Word2Vec

In [None]:
# Reload the exported records into data_tokens. If the file is missing or malformed
# the in-memory data_tokens is kept, and the reason is reported instead of being
# silently swallowed.
desc_json_path = path + 'Desc_Word2Vec.json'

try:
    with open(desc_json_path) as file:
        reader = json.load(file)

    data_tokens = pd.DataFrame({
        'Lang': [row['Lang'] for row in reader],
        'Description': [row['Description'] for row in reader],
        # Tokens were exported as one space-separated string per row
        'Desc_Tokens': [row['Desc_Tokens'].split() for row in reader],
        'ID': [row['ID'] for row in reader],
        'Name': [row['Name'] for row in reader],
        'RepoUrl': [row['RepoUrl'] for row in reader],
        'Desc_Length': [row['Desc_Length'] for row in reader],
        'Desc_Vectors': [row['Desc_Vectors'] for row in reader],
        'Average_Pooling': [row['Average_Pooling'] for row in reader],
    })
except (OSError, ValueError, KeyError, AttributeError) as err:
    # OSError: file missing/unreadable; ValueError: invalid JSON;
    # KeyError/AttributeError: a record does not have the expected fields.
    print('Could not reload %s (%r); keeping the in-memory data_tokens.' % (desc_json_path, err))

data_tokens.head()

In [None]:
# Greeting function
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day","hey","i need help", "greetings")
GREETING_RESPONSES = ["Good day, How may i of help?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]

def greeting(sentence):
    """Return a random greeting reply if ``sentence`` is a greeting, else ``None``.

    A sentence counts as a greeting when, ignoring case and punctuation around
    words, either the whole sentence equals an entry of ``GREETING_INPUTS``
    (this is what makes the multi-word entries such as "i need help" usable) or
    any single word equals an entry.

    Parameters
    ----------
    sentence : str
        The raw user input.

    Returns
    -------
    str or None
        One of ``GREETING_RESPONSES``, or ``None`` when no greeting was found.
    """
    # Strip punctuation around each word so inputs like "Hi!" still match.
    words = [word.strip(string.punctuation).lower() for word in sentence.split()]
    words = [word for word in words if word]
    if not words:
        return None

    if " ".join(words) in GREETING_INPUTS or any(word in GREETING_INPUTS for word in words):
        return random.choice(GREETING_RESPONSES)
    return None

In [None]:
def matchRepoToInput(data_by_language, model, sentence=None, min_score=0.9):
    """Find the repository whose description is most similar to the user's sentence.

    The sentence is tokenized with ``pre_process``, every token known to the
    model is looked up, the token vectors are averaged, and the result is
    compared by cosine similarity with each row's ``Average_Pooling`` vector.

    Parameters
    ----------
    data_by_language : pandas.DataFrame
        Must provide the columns ``Average_Pooling``, ``Description``, ``Name``
        and ``RepoUrl``. Rows are addressed by position.
    model : object
        Either an object exposing word vectors under ``.wv`` or a mapping from
        token to vector that raises ``KeyError`` for unknown tokens.
    sentence : str, optional
        The user's query. When omitted, the module-level variable ``sentence``
        is used, which keeps existing two-argument calls working.
    min_score : float, default 0.9
        Minimum cosine similarity for a row to be accepted as an answer.

    Returns
    -------
    tuple
        ``(reply, score)``. When nothing matches, the reply is an apology and
        the score is the sentinel ``999``.
    """
    not_understood = "Apology, I do not understand. Can you rephrase?"

    if sentence is None:
        # Backward compatibility with callers that set a global `sentence`.
        sentence = globals().get('sentence', '')

    # Preprocessing of user input: pre_process returns one token list per input string.
    tokens = list(pre_process(pd.Series([sentence])).iloc[0])

    # Look tokens up one by one so a single unknown word does not discard the whole query.
    word_vectors = getattr(model, 'wv', model)
    question_vectors = []
    for token in tokens:
        try:
            question_vectors.append(word_vectors[token])
        except KeyError:
            continue

    # If not in the topic trained
    if not question_vectors:
        return not_understood, 999

    # Average pooling of the query's token vectors
    question_ap = list(np.mean(np.asarray(question_vectors, dtype=float), axis=0))

    # Calculate cosine similarity; rows without a usable vector score 0.
    cosines = []
    for pooled in data_by_language['Average_Pooling']:
        score = 0.0
        if isinstance(pooled, (list, tuple, np.ndarray)) and len(pooled) == len(question_ap):
            try:
                score = float(cosine_similarity([question_ap], [pooled])[0][0])
            except (TypeError, ValueError):
                # Non-numeric or NaN entries in the stored vector
                score = 0.0
            if score != score:
                # NaN similarity (e.g. degenerate vectors) counts as no match
                score = 0.0
        cosines.append(score)

    if not cosines:
        return not_understood, 999

    # Best-scoring row; reject it when it does not reach the threshold.
    r_index = int(np.argmax(cosines))
    r_score = float(cosines[r_index])
    if r_score < min_score:
        return not_understood, 999

    row = data_by_language.iloc[r_index]
    # Separate the fields so the description, name and URL stay readable.
    reply = ' | '.join(str(row[column]) for column in ('Description', 'Name', 'RepoUrl'))

    return reply, r_score

In [None]:
# Interactive chat: first pick a language, then answer queries until the user types "bye".
flag_language = True
flag_query = True
dict_language = {'0': 'Python', '1': 'C++', '2': 'C#', '3': 'Java', '4': 'TypeScript', '5': 'Shell', '6': 'C', 
                 '7': 'Ruby', '8': 'PHP', '9': 'JavaScript', '10': 'CSS', '11': 'Go', '12': 'Swift' }

SEPARATOR = "......................................................................................"
BOT_PREFIX = '\x1b[1;37;40m' + 'Bot' + '\x1b[0m' + ': '

print(SEPARATOR)
print(BOT_PREFIX + 'Hi, ask me something.')
print(BOT_PREFIX + 'If you want to exit, you can type < bye >.')

while flag_language:
    print(SEPARATOR)
    print(BOT_PREFIX + 'Please select which language you want to enquire, you can type:')
    print(BOT_PREFIX + '< 0 > for python     < 1 > for c++      < 2 > for c#')
    print(BOT_PREFIX + '< 3 > for java       < 4 > for typescript      < 5 > for shell')
    print(BOT_PREFIX + '< 6 > for c       < 7 > for ruby   < 8 > for php')
    print(BOT_PREFIX + '< 9 > for javascript       < 10 > for css   < 11 > for go')
    print(BOT_PREFIX + '< 12 > for swift')
    print(SEPARATOR)
    # `sentence` stays a module-level name: matchRepoToInput reads it when called with two arguments.
    sentence = input('\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':')
    print(SEPARATOR)

    choice = sentence.strip().lower()
    if choice == 'bye':
        flag_language = False
        flag_query = False
    elif choice in dict_language:
        language = dict_language[choice]

        # Read word2vec model. Not every language in the menu necessarily has a
        # trained model on disk, so a missing file re-prompts instead of crashing.
        word2vec_pickle_path = path + 'desc_word2vec_' + language + '.bin'
        try:
            model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
        except (OSError, pickle.UnpicklingError):
            print(BOT_PREFIX + 'Sorry, no trained model is available for ' + language + ' yet.')
            continue

        # Rows for the chosen language, re-indexed from 0 so positions line up with the scores.
        data_by_language = data_tokens.loc[
            data_tokens['Lang'] == language,
            ['Description', 'Desc_Tokens', 'ID', 'Name', 'RepoUrl', 'Lang', 'Desc_Vectors', 'Average_Pooling'],
        ].reset_index(drop=True)

        flag_language = False
        flag_query = True
    # Any other input falls through and the menu is shown again.

if flag_query:
    # Only announce the query phase when the user did not quit during language selection.
    print(SEPARATOR)
    print(BOT_PREFIX + "Let's start! Please input your question now.")

while flag_query:
    print(SEPARATOR)
    sentence = input('\x1b[0;30;47m' + 'Me  ' + '\x1b[0m' + ':')
    print(SEPARATOR)

    if sentence.strip().lower() == 'bye':
        flag_query = False
        continue

    # Evaluate the greeting once: it picks a random response on every call.
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        print(BOT_PREFIX + greeting_reply)
    else:
        reply, score = matchRepoToInput(data_by_language, model)
        print(BOT_PREFIX + reply)

        #For Tracing, uncomment to print the score
        #print("")
        #print("SCORE: " + str(score))

print(BOT_PREFIX + 'Bye! Hope that i am of help.')