## Load Required Libraries

In [14]:
import pandas as pd
import os
import json
import re
import numpy as np
import random
import string
import pickle
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk import pos_tag
# import gensim
# from gensim import corpora, models, similarities
from sklearn.metrics.pairwise import cosine_similarity

# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
# %matplotlib inline

import warnings
warnings.simplefilter('ignore')

## Data Preparation

### 01-Read Raw Data

In [36]:
# Folder that holds the input CSV files and receives the derived outputs.
path = './Data/'

# One-off step, kept for reference: shrink the full projects dump to the five
# columns used below and save it as selected.csv. Make sure both selected.csv and
# projects_with_repository_fields-1.6.0-2020-01-12.csv are in the Data folder.
# data = pd.read_csv(path + 'projects_with_repository_fields-1.6.0-2020-01-12.csv',
#                    usecols=['ID', 'Name', 'Description', 'Repository URL', 'Language'],
#                    sep=',', keep_default_na=False, index_col=False)

# data.to_csv(path + 'selected.csv', index=False)


# keep_default_na=False: empty fields are read as empty strings instead of NaN.
data = pd.read_csv(path + 'selected.csv', sep=',', keep_default_na=False)
# len(data)
data.head()

Unnamed: 0,ID,Name,Description,Repository URL,Language
0,1,21st digital Templates,"A starting point for stripped down, structured...",https://github.com/21stdigital/Xcode-Templates,Objective-C
1,2,ACCodeSnippetRepository,Synchronize code snippets with a git repository.,https://github.com/acoomans/ACCodeSnippetRepos...,Objective-C
2,3,AdjustFontSize,Adjust font size with ⌃ + / ⌃ -,https://github.com/zats/AdjustFontSize-Xcode-P...,Objective-C
3,4,AeroGear Template,"Setup for your AeroGear projects, based on Coc...",https://github.com/aerogear/aerogear-ios-xcode...,Objective-C
4,6,AMMethod2Implement,A simple Xcode plugin to generate implement co...,https://github.com/MellongLau/AMMethod2Implement,Objective-C


### 02-Data Preprocessing

In [41]:
# Top languages, picked from https://githut.info/. The data may later need to be
# cleaned up further so that only these languages are included.
# NOTE(review): pre_process() splices this string into a regex character class
# ('[^a-z' + langStr + ']'). Inside [...] the parentheses are literal characters,
# not groups, so this whitelists single characters rather than language names.

langStr = '(JavaScript)(Python)(Java)(PHP)(C#)(C++)(TypeScript)(Shell)(C)(Ruby)(CSS)(Go)(Swift)'

def pre_process(desc):
    """Clean and tokenize a collection of description strings.

    Each string is lower-cased, every character outside the allowed set is
    replaced by a space, the result is tokenized with NLTK, and English stop
    words plus tokens whose stripped length is not between 4 and 14
    characters are dropped.

    Parameters
    ----------
    desc : iterable of str
        Raw descriptions (for example a pandas Series).

    Returns
    -------
    pandas.Series
        One list of kept tokens per input string, in input order.
    """
    # A frozenset gives constant-time membership tests; the plain list that
    # NLTK returns would be scanned linearly for every single token.
    stop_words = frozenset(stopwords.words("english"))

    # Compile once instead of passing the pattern string to re.sub per row.
    # langStr sits inside a character class, so it contributes individual
    # characters; on lower-cased text the allowed set is a-z ( ) # +.
    disallowed = re.compile('[^a-z' + langStr + ']')

    # Remove characters outside the allowed set
    cleaned = [disallowed.sub(' ', text.lower()) for text in desc]
    # Tokenization
    tokenized = [nltk.word_tokenize(text) for text in cleaned]
    # Remove stop words and tokens that are too short or too long
    kept = [[tok for tok in tokens
             if tok not in stop_words and 3 < len(tok.strip()) < 15]
            for tokens in tokenized]

    return pd.Series(kept)

In [None]:
# Initial preprocessing of the data: run pre_process over the Description
# column; the result holds one token list per description.
desc = data['Description']
desc_pp = pre_process(desc)

In [None]:
# Put the raw columns next to their token lists, under shorter column names.
data_tokens = pd.DataFrame({
    'ID': list(data['ID']),
    'Name': list(data['Name']),
    'Desc_Tokens': desc_pp,
    'Description': list(data['Description']),
    'RepoUrl': list(data['Repository URL']),
    'Lang': list(data['Language']),
})

# NOTE: in the CSV each token list is stored as its string representation.
data_tokens.to_csv(path + 'tokenized.csv', index=False)

# Preview last: only the final expression of a cell is rendered, so the
# head() call that used to sit before to_csv() never showed anything.
data_tokens.head()

### 03-Example

In [None]:
# Description text side by side with its character count.
length = data_tokens['Description'].map(len)
data_example = data_tokens[['Description']].assign(Question_Length=length)
data_example.head()

In [None]:
# Raw data: the description at index label 1 is the running example for the
# step-by-step walkthrough in the following cells.
example = data_example['Description'][1]
raw_title = 'Raw Data'
raw_result = example
raw_result

In [None]:
# Remove non-English words: lower-case the text and replace every character
# outside the bracketed set with a space. Inside [...] the parentheses are
# literal, so the allowed set is a-z plus the characters ( ) + #.
re_title = 'Remove non-English Words'
re_result = [re.sub('[^a-z(c++)(c#)]', ' ', x.lower()) for x in pd.Series(example)]
re_result

In [None]:
# Tokenization: split each cleaned string into tokens with NLTK.
tk_title = 'Tokenlization'
tk_result = [nltk.word_tokenize(t) for t in re_result]
print(tk_result)

In [None]:
# Removing stop words: drop English stop words and keep only tokens whose
# stripped length is between 4 and 14 characters (3 < len < 15).
stop_words = stopwords.words("english")
rs_title = 'Removing Stop Words'
rs_result = [[t for t in tokens if (t not in stop_words) and (3 < len(t.strip()) < 15)] for tokens in tk_result]
rs_result

In [None]:
# Summary of the worked example: one row per preprocessing step.
# The dict gets its own name so the `data` DataFrame loaded at the top of the
# notebook is not overwritten (re-running the preprocessing cells after this
# one would otherwise fail on data['Description']).
example_steps = {
    'Step': [raw_title, re_title, tk_title, rs_title],
    'Results': [raw_result, re_result, tk_result, rs_result],
}
df = pd.DataFrame(example_steps)
cols = ['Step', 'Results']
# DataFrame.ix was removed in pandas 1.0; .loc does the same label-based
# column selection.
df = df.loc[:, cols]
pd.set_option('display.max_colwidth', 100)
df

### 04-Train Word2Vec

In [None]:
def train_model(train_data, min_count=2):
    """Train a Word2Vec model on tokenized text and return it.

    Parameters
    ----------
    train_data : iterable of list of str
        Tokenized sentences, one token list per sentence.
    min_count : int, default 2
        Forwarded to Word2Vec; tokens that occur fewer times are ignored.

    Returns
    -------
    gensim.models.Word2Vec
        The trained model.
    """
    # Imported here because the notebook-level `import gensim` is commented
    # out; without this the function fails with NameError.
    import gensim

    return gensim.models.Word2Vec(train_data, min_count=min_count)

In [None]:
# Imported in this cell because the notebook-level `import gensim` is
# commented out; the KeyedVectors.load call below needs it.
import gensim

# Menu key -> language label; one Word2Vec model is trained per label.
dict_language = {'0': 'python', '1': 'c++', '2': 'c#', '3': 'java', '4': 'ios', '5': 'android', '6': 'html',
                 '7': 'jquery', '8': 'php', '9': 'javascript'}

# NOTE(review): this cell reads the columns 'Class' and 'Question_Tokens', but
# the data_tokens frame built in the preprocessing section has 'Lang' and
# 'Desc_Tokens' instead — confirm which schema is intended before running.
data_tokens['Question_Vectors'] = None
data_tokens['Average_Pooling'] = None

for value in dict_language.values():
    questions_data = list(data_tokens[data_tokens['Class'] == value]['Question_Tokens'])

    # Train model
    model_name = 'word2vec_model_' + value
    trained_model = train_model(questions_data)
    trained_model.save(model_name)
    print('Saved %s model successfully' % model_name)

    # Save Word2Vec model; the context manager closes the file even if
    # pickling raises.
    word2vec_pickle_path = path + 'stackoverflow_word2vec_' + value + '.bin'
    with open(word2vec_pickle_path, 'wb') as f:
        pickle.dump(trained_model, f)

    model = gensim.models.KeyedVectors.load(word2vec_pickle_path)
    # A full Word2Vec model keeps its lookup table under .wv; fall back to the
    # object itself when it already is a keyed-vectors table.
    vectors = getattr(model, 'wv', model)

    # Calculate the vectors for each question of the current language
    for i in range(len(data_tokens)):
        if data_tokens['Class'][i] == value:
            question_tokens = data_tokens['Question_Tokens'][i]
            question_vectors = []
            for token in question_tokens:
                try:
                    question_vectors.append(vectors[token])
                except KeyError:
                    # Token is not in the model's vocabulary: skip it.
                    continue
            # .at writes into the frame itself; the chained form
            # data_tokens[col][i] = ... may write to a temporary copy.
            # Vectors for each token
            data_tokens.at[i, 'Question_Vectors'] = question_vectors
            # Average pooling of all token vectors
            data_tokens.at[i, 'Average_Pooling'] = list(pd.DataFrame(question_vectors).mean())

In [None]:
# Collapse each token list into one space-separated string. Values that are
# already strings are left untouched, so re-running this cell does not split
# them into single characters.
data_tokens['Question_Tokens'] = [
    tokens if isinstance(tokens, str) else " ".join(tokens)
    for tokens in data_tokens['Question_Tokens']
]
length = data_tokens['Question_Tokens'].apply(len)
data_tokens = data_tokens.assign(Question_Length=length)
data_tokens.head()

In [None]:
# Export the data as JSON.
# to_json(orient='records') serialises the frame to a JSON string with one
# object per row; json.loads turns that string into Python objects, which
# json.dump then writes to the output file.
data_json = json.loads(data_tokens.to_json(orient='records'))

with open(path + 'StackOverflow_Word2Vec.json', 'w') as outfile:
    json.dump(data_json, outfile)

## Word2Vec

In [None]:
# Reload the exported records and rebuild data_tokens from them.
stackoverflow_path = path + 'StackOverflow_Word2Vec.json'

try:
    with open(stackoverflow_path) as file:
        records = json.load(file)

    data_tokens = pd.DataFrame({
        'Class': [row['Class'] for row in records],
        'Question': [row['Question'] for row in records],
        # Tokens are stored as one space-separated string per row.
        'Question_Tokens': [row['Question_Tokens'].split() for row in records],
        'Answer': [row['Answer'] for row in records],
        'Question_Length': [row['Question_Length'] for row in records],
        'Question_Vectors': [row['Question_Vectors'] for row in records],
        'Average_Pooling': [row['Average_Pooling'] for row in records],
    })
except (OSError, ValueError, KeyError, TypeError, AttributeError) as err:
    # Still best effort (the data_tokens already in memory is kept), but the
    # failure is reported instead of being swallowed silently.
    print('Could not load %s (%s: %s); keeping the existing data_tokens.'
          % (stackoverflow_path, type(err).__name__, err))

data_tokens.head()

In [None]:
# Greeting function
GREETING_INPUTS = ("hello", "hi", "greetings", "hello i need help", "good day","hey","i need help", "greetings")
GREETING_RESPONSES = ["Good day, How may i of help?", "Hello, How can i help?", "hello", "I am glad! You are talking to me."]

def greeting(sentence):
    """Return a random greeting reply if `sentence` is a greeting, else None.

    A sentence counts as a greeting when, after lower-casing and removing
    punctuation from the ends of each word, either the whole sentence or any
    single word of it appears in GREETING_INPUTS.

    Parameters
    ----------
    sentence : str
        The user's input.

    Returns
    -------
    str or None
        One entry of GREETING_RESPONSES, or None when nothing matched.
    """
    # Strip punctuation only at word edges so "hello!" matches "hello" while
    # words such as "hi-fi" are left intact.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    words = [word for word in words if word]
    if not words:
        return None

    # The whole-sentence check is what lets multi-word entries such as
    # "good day" match; the per-word check alone can never reach them.
    if " ".join(words) in GREETING_INPUTS or any(word in GREETING_INPUTS for word in words):
        return random.choice(GREETING_RESPONSES)
    return None

In [None]:
def Talk_To_Javris(data_language, model, sentence=None, threshold=0.9):
    """Find the stored entry most similar to the user's sentence.

    The sentence is pre-processed into tokens, every token known to the model
    is mapped to its vector, the vectors are averaged, and that average is
    compared by cosine similarity with each row's 'Average_Pooling' vector.

    Parameters
    ----------
    data_language : pandas.DataFrame
        Must contain an 'Average_Pooling' column. The reply is read from the
        frame's first column.
        NOTE(review): the chat cell builds this frame with 'Question' as the
        first column, so the reply is the matched question text rather than
        its answer — confirm that this is intended.
    model : object
        Trained Word2Vec model, or its keyed-vectors table.
    sentence : str, optional
        Text to answer. When omitted, the notebook-global `sentence` is used,
        which is what the existing two-argument calls rely on.
    threshold : float, default 0.9
        Minimum cosine similarity for a row to count as a match.

    Returns
    -------
    tuple
        (reply, score) for the best matching row, or the apology message and
        the score 999 when the sentence yields no known tokens or no row
        reaches the threshold.
    """
    not_understood = "Apology, I do not understand. Can you rephrase?"

    if sentence is None:
        # Backward compatibility with callers that set the global instead.
        sentence = globals().get('sentence', '')

    # pre_process returns one token list per input string; exactly one string
    # is passed in, so take the first (only) list.
    tokens = pre_process(pd.Series([sentence])).iloc[0]

    # A full Word2Vec model keeps its lookup table under .wv; fall back to the
    # object itself when it already is a keyed-vectors table.
    vectors = getattr(model, 'wv', model)

    # Look tokens up one at a time so a single unknown token does not discard
    # the whole query.
    token_vectors = []
    for token in tokens:
        try:
            token_vectors.append(vectors[token])
        except KeyError:
            continue

    if not token_vectors:
        return not_understood, 999

    # Average pooling over the token vectors
    question_ap = np.mean(token_vectors, axis=0)

    # Cosine similarity against every stored row; rows without a usable
    # vector score 0.
    cosines = []
    for pooled in data_language['Average_Pooling']:
        score = 0.0
        try:
            if pooled is not None and len(pooled) == len(question_ap):
                score = float(cosine_similarity([question_ap], [pooled])[0][0])
        except (TypeError, ValueError):
            score = 0.0
        cosines.append(score)

    if not cosines:
        return not_understood, 999

    best_position = int(np.argmax(cosines))
    best_score = float(cosines[best_position])

    # Nothing similar enough: answer with the apology instead of raising
    # IndexError on an empty candidate list.
    if best_score < threshold:
        return not_understood, 999

    # Positional lookup: best_position counts rows, not index labels.
    reply = str(data_language.iloc[best_position, 0])
    return reply, best_score

In [None]:
# Imported in this cell because the notebook-level `import gensim` is
# commented out; loading the per-language model below needs it.
import gensim

flag_language = True
flag_query = True
dict_language = {'0': 'python', '1': 'c++', '2': 'c#', '3': 'java', '4': 'ios', '5': 'android', '6': 'html',
                 '7': 'jquery', '8': 'php', '9': 'javascript'}

# Console fragments reused below (ANSI colour codes around the speaker name).
separator = "......................................................................................"
jarvis_tag = '\x1b[1;37;40m' + 'Jarvis' + '\x1b[0m' + ': '
jarvis_caps_tag = '\x1b[1;37;40m' + 'JARVIS' + '\x1b[0m' + ': '
user_prompt = '\x1b[0;30;47m' + 'USER  ' + '\x1b[0m' + ':'

print(separator)
print(jarvis_tag + 'My name is Jarvis, a Programming Language Apprentice Bot.')
print(jarvis_tag + 'I will try my best to answer your query.')
print(jarvis_tag + 'If you want to exit, you can type < bye >.')

# Phase 1: ask until a valid language key or 'bye' is entered.
while flag_language:
    print(separator)
    print(jarvis_tag + 'Please select which language you want to enquire, ' +
          'you can type:')
    print(jarvis_tag + '< 0 > for python     < 1 > for c++      < 2 > for c#')
    print(jarvis_tag + '< 3 > for java       < 4 > for ios      < 5 > for android')
    print(jarvis_tag + '< 6 > for html       < 7 > for jquery   < 8 > for php')
    print(jarvis_tag + '< 9 > for javascript')
    print(separator)
    sentence = input(user_prompt)
    print(separator)

    choice = sentence.lower()
    if choice == 'bye':
        flag_language = False
        flag_query = False
    elif choice in dict_language:
        language = dict_language[choice]
        # Rebuilding from lists gives the subset a fresh 0..n-1 index.
        data_language = data_tokens[data_tokens['Class'] == language]
        data_language = pd.DataFrame({'Question': list(data_language['Question']),
                                      'Question_Tokens': list(data_language['Question_Tokens']),
                                      'Answer': list(data_language['Answer']),
                                      'Class': list(data_language['Class']),
                                      'Question_Vectors': list(data_language['Question_Vectors']),
                                      'Average_Pooling': list(data_language['Average_Pooling'])
                                     })

        # Read word2vec model
        word2vec_pickle_path = path + 'stackoverflow_word2vec_' + language + '.bin'
        model = gensim.models.KeyedVectors.load(word2vec_pickle_path)

        flag_language = False
        flag_query = True
    # Any other input falls through and the menu is shown again.

# Only invite questions when a language was chosen, not after an early 'bye'.
if flag_query:
    print(separator)
    # Double quotes keep the apostrophe; 'Let''s' is two adjacent literals
    # and printed "Lets".
    print(jarvis_tag + "Let's start! Please input your question now.")

# Phase 2: answer questions until 'bye'. Talk_To_Javris reads the global
# `sentence`, so the input must stay bound to that name.
while flag_query:
    print(separator)
    sentence = input(user_prompt)
    print(separator)

    if sentence.lower() == 'bye':
        flag_query = False
        continue

    # Call greeting() once: it picks a random reply, so calling it again for
    # printing is a separate random draw from the one just checked.
    greeting_reply = greeting(sentence.lower())
    if greeting_reply is not None:
        print(jarvis_caps_tag + greeting_reply)
    else:
        reply, score = Talk_To_Javris(data_language, model)
        print(jarvis_caps_tag + reply)

        #For Tracing, comment to remove from print
        #print("")
        #print("SCORE: " + str(score))

print(jarvis_caps_tag + 'Bye! Hope that i am of help.')