## Load Required Libraries

In [None]:
import pandas as pd
import os
import ast
import json
import re
import numpy as np
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
import gensim
from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# %matplotlib inline

import warnings
warnings.simplefilter('ignore')

## Data Preparation

### 01-Read Raw Data

In [None]:
# Base directory for the data files this notebook reads and writes.
path = './Data/'

# One-off reduction of the full dataset, kept for reference but disabled.
# It needs projects_with_repository_fields-1.6.0-2020-01-12.csv in the Data
# folder, keeps five columns and five languages, and writes selected.csv:
#data = pd.read_csv(path + 'projects_with_repository_fields-1.6.0-2020-01-12.csv',
                #usecols=['ID', 'Name', 'Description', 'Repository URL', 'Language'], sep=',', keep_default_na=False, index_col=False)
#data = data[data['Language'].isin(['JavaScript','Python', 'Java', 'PHP', 'C++'])]
#data.to_csv(path + 'selected.csv', index=False)


# Alternative input (the reduced file produced by the snippet above):
#data = pd.read_csv(path + 'selected.csv', sep=',', keep_default_na=False)
# keep_default_na=False stops pandas from converting empty / "NA"-like fields
# to NaN, so such fields stay strings.
data = pd.read_csv(path + 'libdata0.csv', sep=',', keep_default_na=False)
# Bare expressions: in a notebook only the last one (the head) is displayed.
len(data)
data.head()

### 02-Data Preprocessing

In [None]:
# The languages below were picked from the top entries on https://githut.info/.
# The data may need further clean-up so that it only contains these languages.

langStr = '(JavaScript)(Python)(Java)(PHP)(C++)'

def pre_process(desc):
    """Clean, tokenize and filter a collection of description strings.

    Each text is lower-cased, characters outside the allowed set are replaced
    by spaces, the result is tokenized with NLTK, and tokens are dropped when
    they are English stop words or when their stripped length is not strictly
    between 3 and 15 characters.

    Args:
        desc: Iterable of strings (for example a pandas Series of
            descriptions).

    Returns:
        pandas.Series holding one list of tokens per input text, in input
        order.
    """
    # A set gives O(1) membership tests; the list returned by NLTK would be
    # scanned linearly for every single token.
    stop_words = set(stopwords.words("english"))

    # NOTE(review): langStr is spliced into a character class, so it does not
    # match the language names as words. It only adds the individual
    # characters '(', ')', '+' and a few capital letters to the allowed set,
    # and the capitals can never occur because the text is lower-cased first.
    # Effectively a-z, '(', ')' and '+' are kept. Behaviour is left unchanged.
    disallowed = re.compile('[^a-z' + langStr + ']')

    # Remove everything that is not an allowed character
    cleaned = [disallowed.sub(' ', text.lower()) for text in desc]
    # Tokenization
    desc_tokens = [nltk.word_tokenize(text) for text in cleaned]
    # Remove stop words and tokens that are too short or too long
    desc_stop = [[t for t in tokens if (t not in stop_words) and (3 < len(t.strip()) < 15)]
                      for tokens in desc_tokens]

    return pd.Series(desc_stop)

In [None]:
# Initial preprocessing of the training data: take the raw text from the
# 'description' column and turn it into one list of tokens per row.
desc = data['description']
desc_pp = pre_process(desc)

In [None]:
# Combine the raw columns with the token lists under the column names used by
# the following cells, then persist the table as tokenized.csv.
data_tokens = pd.DataFrame({
    'ID': data['id'].tolist(),
    'Name': data['name'].tolist(),
    'Desc_Tokens': desc_pp,
    'Description': data['description'].tolist(),
    'RepoUrl': data['repository_url'].tolist(),
    'Lang': data['language'].tolist(),
})
data_tokens.head()
data_tokens.to_csv(path + 'tokenized.csv', index=False)


In [None]:
# Read the tokenized table back. The CSV stores every token list as its
# string representation, so each cell is parsed back into a real list.
data_tokens = pd.read_csv(path + 'tokenized.csv', sep=',', keep_default_na=False)
data_tokens['Desc_Tokens'] = data_tokens['Desc_Tokens'].map(ast.literal_eval)
data_tokens.head()
#print(len(data_tokens))

### 03-Train Word2Vec

In [None]:
def train_model(train_data, *, min_count=3, vector_size=80, window=2, sg=1, hs=1):
    """Train a Word2Vec model on tokenized sentences and return it.

    The keyword defaults reproduce the values that used to be hard-coded, so
    ``train_model(train_data)`` behaves as before.

    Args:
        train_data: Iterable of token lists, one list per description.
        min_count: Forwarded to ``gensim.models.Word2Vec`` as ``min_count``.
        vector_size: Dimensionality of the word vectors. Passed as ``size``
            on gensim releases that still use that keyword name.
        window: Forwarded to ``gensim.models.Word2Vec`` as ``window``.
        sg: Forwarded to ``gensim.models.Word2Vec`` as ``sg``.
        hs: Forwarded to ``gensim.models.Word2Vec`` as ``hs``.

    Returns:
        The trained ``gensim.models.Word2Vec`` instance.
    """
    import inspect

    options = {'min_count': min_count, 'window': window, 'sg': sg, 'hs': hs}

    # gensim 4.0 renamed the `size` keyword to `vector_size`; use whichever
    # name the installed version accepts instead of failing with a TypeError.
    accepted = inspect.signature(gensim.models.Word2Vec.__init__).parameters
    dimension_keyword = 'vector_size' if 'vector_size' in accepted else 'size'
    options[dimension_keyword] = vector_size

    return gensim.models.Word2Vec(train_data, **options)

In [None]:
#dict_language = {'0': 'Python', '1': 'C++', '2': 'C#', '3': 'Java', '4': 'TypeScript', '5': 'Shell', '6': 'C', 
 #                '7': 'Ruby', '8': 'PHP', '9': 'JavaScript', '10': 'CSS', '11': 'Go' }
dict_language = {'0': 'Python', '1': 'JavaScript', '2': 'Java', '3': 'PHP', '4': 'C++'}

# Train one Word2Vec model per language, using only that language's rows.
for lang_name in dict_language.values():
    desc_data = list(data_tokens[data_tokens['Lang'] == lang_name]['Desc_Tokens'])

    # Train the model and save it in gensim's own format (written relative to
    # the current working directory, not to `path`).
    model_name = 'word2vec_model_' + lang_name
    trained_model = train_model(desc_data)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Also pickle the model under `path`; this is the file the later cells
    # load. The context manager closes the handle even if pickling fails.
    word2vec_pickle_path = path + 'desc_word2vec_' + lang_name + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

In [None]:
#dict_language = {'0': 'Python', '1': 'C++', '2': 'C#', '3': 'Java', '4': 'TypeScript', '5': 'Shell', '6': 'C', 
 #                '7': 'Ruby', '8': 'PHP', '9': 'JavaScript', '10': 'CSS', '11': 'Go' }
dict_language = {'0': 'Python', '1': 'JavaScript', '2': 'Java', '3': 'PHP', '4': 'C++'}

# One pooled vector per row. Rows whose language has no model stay None.
# The values are collected in a plain list and assigned once at the end:
# writing through `data_tokens[col].iloc[i] = ...` is chained assignment and
# may update a temporary copy instead of the frame.
pooled_vectors = [None] * len(data_tokens)

for lang_name in dict_language.values():
    word2vec_pickle_path = path + 'desc_word2vec_' + lang_name + '.bin'

    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
    # A full Word2Vec model exposes its vectors as `.wv`; fall back to the
    # loaded object itself when it already is a bare vector store.
    word_vectors = getattr(model, 'wv', model)

    # Calculate the vectors for each description written in this language
    for i, (row_lang, row_tokens) in enumerate(zip(data_tokens['Lang'],
                                                   data_tokens['Desc_Tokens'])):
        if row_lang != lang_name:
            continue

        desc_vectors = []
        for token in row_tokens:
            try:
                desc_vectors.append(word_vectors[token])
            except KeyError:
                # Token is not in this model's vocabulary: skip it.
                continue

        # Average pooling over all token vectors. The result is an empty list
        # when none of the tokens is in the vocabulary.
        pooled_vectors[i] = list(pd.DataFrame(desc_vectors).mean())

data_tokens['Average_Pooling'] = pd.Series(pooled_vectors, index=data_tokens.index, dtype=object)

In [None]:
# Collapse every token list into one space-separated string and store the
# character count of that string in a new Desc_Length column.
data_tokens['Desc_Tokens'] = data_tokens['Desc_Tokens'].map(" ".join)
length = data_tokens['Desc_Tokens'].map(len)
data_tokens = data_tokens.assign(Desc_Length=length)
data_tokens.head()

In [None]:
#data_tokens.to_csv(path + 'tokenized_withvectors.csv', index=False)
# Export the data as JSON. The frame is first serialized by pandas and parsed
# back into plain Python records (one dict per row), which json.dump can write.
data_json = json.loads(data_tokens.to_json(orient='records'))

with open(path + 'Desc_Word2Vec.json', 'w') as outfile:
    json.dump(data_json, outfile)

## Word2Vec

In [None]:
desc_json_path = path + 'Desc_Word2Vec.json'
#desc_json_path = path + 'lib4_Desc_Word2Vec.json'

# Columns to restore, in the order they appear in the rebuilt frame.
json_columns = ('Lang', 'Description', 'Desc_Tokens', 'ID', 'Name',
                'RepoUrl', 'Desc_Length', 'Average_Pooling')

try:
    with open(desc_json_path) as file:
        reader = json.load(file)

    # Every record must provide every column; a missing key raises KeyError.
    restored = {name: [row[name] for row in reader] for name in json_columns}
    # The export stores the tokens as one space-separated string per row.
    restored['Desc_Tokens'] = [text.split() for text in restored['Desc_Tokens']]
    data_tokens = pd.DataFrame(restored)
except (OSError, ValueError, KeyError, TypeError) as err:
    # Loading is best effort: keep working with the data_tokens that is
    # already in memory, but report the problem instead of hiding it.
    # (json.JSONDecodeError is a subclass of ValueError.)
    print('Could not load %s (%s: %s); keeping the data_tokens already in memory.'
          % (desc_json_path, type(err).__name__, err))

data_tokens.head()

In [None]:
# Greeting function
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day","hey","i need help", "greetings")
GREETING_RESPONSES = ["Good day, How may i of help?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]
           
def greeting(sentence):
    """Return a random greeting reply if ``sentence`` contains a greeting.

    A sentence counts as a greeting when either the whole sentence or any
    single word of it is listed in ``GREETING_INPUTS``. The comparison is
    case-insensitive and ignores surrounding punctuation. The whole-sentence
    check is what makes the multi-word entries (for example "i need help")
    reachable; a word-by-word comparison alone can never match them.

    Args:
        sentence: The text typed by the user.

    Returns:
        One entry of ``GREETING_RESPONSES`` chosen at random, or ``None`` when
        the sentence is not a greeting.
    """
    lowered = sentence.lower()
    whole = lowered.strip(string.punctuation + string.whitespace)
    words = [word.strip(string.punctuation) for word in lowered.split()]

    if whole in GREETING_INPUTS or any(word in GREETING_INPUTS for word in words):
        return random.choice(GREETING_RESPONSES)
    return None

In [None]:
def matchRepoToInput(data_by_language, model, user_input=None, min_score=0.9):
    """Find the repository whose description best matches the user's text.

    The text is pre-processed into tokens, and each token found in the model
    vocabulary is turned into a vector. The vectors are averaged, and that
    average is compared by cosine similarity with every row's
    ``Average_Pooling`` vector. Tokens missing from the vocabulary are
    skipped individually, in the same way the stored description vectors
    were built, so a single unknown word no longer discards the whole query.

    Args:
        data_by_language: DataFrame with the columns ``Average_Pooling``,
            ``Description``, ``Name`` and ``RepoUrl``.
        model: Loaded Word2Vec model (or its keyed vectors) for the language.
        user_input: Text to match. When omitted, the module-level variable
            ``sentence`` set by the chat loop is used, which is how existing
            callers invoke this function.
        min_score: Minimum cosine similarity a row needs to be suggested.

    Returns:
        Tuple ``(reply, score)``. ``reply`` is the suggestion text and
        ``score`` its cosine similarity as a float. When nothing can be
        matched, the reply is an apology and the score is the sentinel 999.
    """
    not_understood = "Apology, I do not understand. Can you rephrase?"

    if user_input is None:
        # Backward compatibility: callers that pass only two arguments rely
        # on the global `sentence` holding the latest user input.
        user_input = sentence

    # pre_process returns one token list per input text; exactly one text is
    # passed in, so its token list is the first (and only) element.
    tokens = pre_process(pd.Series([user_input])).iloc[0]

    # A full Word2Vec model exposes its vectors as `.wv`; fall back to the
    # object itself when it already is a bare vector store.
    word_vectors = getattr(model, 'wv', model)

    question_vectors = []
    for token in tokens:
        try:
            question_vectors.append(word_vectors[token])
        except KeyError:
            # Token is not in the vocabulary: ignore it.
            continue

    # Nothing usable in the input (empty text or only unknown words).
    if not question_vectors:
        return not_understood, 999

    # Average pooling of the token vectors.
    question_ap = list(pd.DataFrame(question_vectors).mean())

    # Track the best row at or above the threshold. Rows without a vector, or
    # with a vector of a different length, cannot be compared and are skipped.
    best_index = None
    best_score = None
    for position, pooled in enumerate(data_by_language['Average_Pooling']):
        if pooled is None or len(pooled) != len(question_ap):
            continue
        score = float(cosine_similarity([question_ap], [pooled])[0][0])
        if score >= min_score and (best_score is None or score > best_score):
            best_index, best_score = position, score

    if best_index is None:
        return not_understood, 999

    reply = str("My suggestion: " + data_by_language['Description'].iloc[best_index] + ". Repo name: " + data_by_language['Name'].iloc[best_index] + ". Repo URL: " + data_by_language['RepoUrl'].iloc[best_index])

    return reply, best_score

In [None]:
flag_language = True
flag_query = True
dict_language = {'0': 'Python', '1': 'JavaScript', '2': 'Java', '3': 'PHP', '4': 'C++'}

# Console decorations shared by every message the bot prints.
BOT_PREFIX = '\x1b[1;37;40m' + 'Bot' + '\x1b[0m' + ': '
DIVIDER = "......................................................................................"

print(DIVIDER)
print(BOT_PREFIX + 'Hi, ask me something.')
print(BOT_PREFIX + 'If you want to exit, you can type < bye >.')

# Step 1: ask for a language until a valid key or < bye > is entered.
while flag_language:
    print(DIVIDER)
    print(BOT_PREFIX + 'Please select which language you want to enquire, ' +
      'you can type:')
    print(BOT_PREFIX + '< 0 > for python     < 1 > for js      < 2 > for java')
    print(BOT_PREFIX + '< 3 > for php       < 4 > for c++')
    print(DIVIDER)
    sentence = input('\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':')
    print(DIVIDER)

    # Ignore surrounding whitespace and letter case in the answer.
    choice = sentence.strip().lower()

    if choice == 'bye':
        flag_language = False
        flag_query = False
    elif choice in dict_language:
        language = dict_language[choice]

        # Keep only the chosen language's rows, renumbered from 0 so that
        # positions found by the matcher map directly onto this frame.
        selected_columns = ['Description', 'Desc_Tokens', 'ID', 'Name',
                            'RepoUrl', 'Lang', 'Average_Pooling']
        data_by_language = data_tokens.loc[data_tokens['Lang'] == language,
                                           selected_columns].reset_index(drop=True)

        # Read word2vec model
        word2vec_pickle_path = path + 'desc_word2vec_' + language + '.bin'
        model = gensim.models.KeyedVectors.load(word2vec_pickle_path)

        flag_language = False
        flag_query = True
    # Any other input: the menu is shown again.

# Only invite a question when the user did not leave during step 1.
if flag_query:
    print(DIVIDER)
    print(BOT_PREFIX + "Let's start! Please input your question now.")

# Step 2: answer questions until the user types < bye >.
while flag_query:
    print(DIVIDER)
    sentence = input('\x1b[0;30;47m' + 'Me  ' + '\x1b[0m' + ':')
    print(DIVIDER)

    if sentence.strip().lower() == 'bye':
        flag_query = False
        continue

    # Call greeting() once: it picks its reply at random, so a second call
    # could return a different text than the one that was tested.
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        print(BOT_PREFIX + greeting_reply)
    else:
        # matchRepoToInput reads the user's text from the module-level
        # variable `sentence` assigned above.
        reply, score = matchRepoToInput(data_by_language, model)
        print(BOT_PREFIX + reply)

        #For Tracing, comment to remove from print 
        #print("")
        #print("SCORE: " + str(score))

print(BOT_PREFIX + 'Bye! Hope that i am of help.')