# Biblioteki i directory

In [2]:
# Libraries and dir
import os
from sqlalchemy import create_engine
import pandas as pd
import gensim
import gensim.downloader as api
from gensim.models.word2vec import Word2Vec
from gensim.models import Word2Vec
import json
from nltk.tokenize import sent_tokenize, word_tokenize 
import warnings 
import numpy as np
import spacy
import pickle

# Silence every warning for the rest of the session.
warnings.filterwarnings(action='ignore')

# All relative paths used in this notebook ('./data/...', './model/...') are resolved
# against the project root. Set the ADVANCED_SOURCING_DIR environment variable to
# point at a different checkout; without it the original developer path is used.
os.chdir(os.environ.get(
    "ADVANCED_SOURCING_DIR",
    r"C:\Users\jaroslaw.bontruk\Documents\Repos\ITM.Internal.AdvancedSourcing",
))

# Dane z bazy danych

In [None]:
# Get data from database
# The SQLAlchemy URL is read from the STACKOVERFLOW_DB_URL environment variable so
# real credentials never have to be written into the notebook. The fallback is the
# placeholder URL that was hard-coded here before.
db_connection_str = os.environ.get(
    'STACKOVERFLOW_DB_URL',
    'mysql+pymysql://###/stackoverflow',
)
db_connection = create_engine(db_connection_str)

# Load the whole `tags` table and preview its first rows.
tags = pd.read_sql('SELECT * FROM tags', con=db_connection)
tags.head()

# Modele pre-trenowane

In [None]:
# Pre-trained Google News vectors, read from a binary word2vec-format file.
model_google_300 = gensim.models.KeyedVectors.load_word2vec_format(
    fname='./model/GoogleNews-vectors-negative300.bin',
    binary=True,
)
# A single vector can be inspected with: model_google_300.word_vec("language")

In [None]:
# Fetch the 'text8' corpus through gensim's downloader and train a Word2Vec model
# on it with the library's default hyper-parameters.
corpus_text8 = api.load('text8')
model_text8 = Word2Vec(sentences=corpus_text8)

In [None]:
# Example results from the two pre-trained models.
# A word that is missing from a vocabulary raises KeyError. Every lookup is guarded
# so that one missing word (e.g. 'c#') is reported instead of aborting all the
# queries that follow it in this cell.
for left, right in (
    ('frog', 'lizard'),
    ('javascript', 'java'),
    ('javascript', '.net'),
    ('javascript', 'python'),
):
    try:
        print(model_google_300.similarity(left, right))
    except KeyError as err:
        print("similarity({!r}, {!r}) skipped: {}".format(left, right, err))

# model_google_300 already is a KeyedVectors object; for the trained Word2Vec model
# the vectors live on `.wv` (the model-level query methods are deprecated shims).
for vectors in (model_google_300, model_text8.wv):
    for word in ('javascript', 'java', 'python', 'php', 'c#'):
        try:
            print(vectors.most_similar(word))
        except KeyError as err:
            print("most_similar({!r}) skipped: {}".format(word, err))

In [None]:
# List of pre-trained models
# Fetch the information record exposed by gensim's downloader API and pretty-print
# it as JSON with a 4-space indent.
info = api.info()
print(json.dumps(info, indent=4))

# Trening na Post'ach bez czyszczenia Spacy

In [3]:
# Get PostsBody
# Read the post corpus from CSV and keep the 'body' column as a plain Python list.
so_corpus = pd.read_csv('./data/corpus.csv')
so_corpus_list = so_corpus['body'].tolist()
# Sanity checks: confirm the container type and show one arbitrary post.
print(type(so_corpus_list))
print(so_corpus_list[652235])

<class 'list'>
I have multiple Xcode projects inside a workspace. Each project consists of static library target and test target. All test targets perform when I select single project.Question: It is possible to create a single project, that runs all the tests in other projects?


In [None]:
# Prepare the training input from the raw (uncleaned) posts:
# one list of lower-cased NLTK word tokens per post.
so_data = [
    [token.lower() for token in word_tokenize(post)]
    for post in so_corpus_list
]

In [None]:
# Train two Word2Vec models on the tokenized raw posts. Both use the same
# hyper-parameters and differ only in the `sg` switch.
# CBOW model (no `sg` argument passed)
model_so1 = Word2Vec(
    sentences=so_data,
    size=100,
    window=5,
    min_count=10,
    workers=4,
)
# Skip-gram model (sg=1)
model_so2 = Word2Vec(
    sentences=so_data,
    size=100,
    window=5,
    min_count=10,
    workers=4,
    sg=1,
)

In [None]:
# Save models
# The with-blocks close each file handle as soon as its dump finishes; the bare
# open(...) calls used before left the handles open for the rest of the session.
with open('model_so1.model', 'wb') as model_file:
    pickle.dump(model_so1, model_file)
with open('model_so2.model', 'wb') as model_file:
    pickle.dump(model_so2, model_file)

In [None]:
# Example results MY CBOW MODEL
# Queries go through `model_so1.wv`, the trained word vectors; the model-level
# similarity/most_similar methods are deprecated shims around it.
# A word missing from the vocabulary raises KeyError, so each lookup is guarded and
# a missing word (e.g. 'c#') is reported instead of aborting the rest of the cell.
for other in ('java', '.net', 'python'):
    try:
        print(model_so1.wv.similarity('javascript', other))
    except KeyError as err:
        print("similarity('javascript', {!r}) skipped: {}".format(other, err))

for word in ('javascript', 'java', 'python', 'php', 'c#'):
    try:
        print(model_so1.wv.most_similar(word))
    except KeyError as err:
        print("most_similar({!r}) skipped: {}".format(word, err))

In [None]:
# Example results MY SKIP GRAM MODEL
# Queries go through `model_so2.wv`, the trained word vectors; the model-level
# similarity/most_similar methods are deprecated shims around it.
# A word missing from the vocabulary raises KeyError, so each lookup is guarded and
# a missing word (e.g. 'c#') is reported instead of aborting the rest of the cell.
for other in ('java', '.net', 'python'):
    try:
        print(model_so2.wv.similarity('javascript', other))
    except KeyError as err:
        print("similarity('javascript', {!r}) skipped: {}".format(other, err))

for word in ('javascript', 'java', 'python', 'php', 'c#'):
    try:
        print(model_so2.wv.most_similar(word))
    except KeyError as err:
        print("most_similar({!r}) skipped: {}".format(word, err))

# Trening na Post'ach z czyszczeniem Spacy

In [None]:
# Get PostsBody with string conversion (for no errors with Spacy)
# NOTE(review): astype(str) turns missing bodies (NaN) into the literal text 'nan',
# which is then processed like any other post — confirm this is acceptable.
so_corpus = pd.read_csv('./data/corpus.csv')
so_corpus['body'] = so_corpus['body'].astype(str)
print(so_corpus["body"].dtype)

In [None]:
# Clean the corpus
# Load the spaCy pipeline once at module level; spacy_function reads this global.
nlp = spacy.load("en_core_web_sm")

def spacy_function(text):
    """Reduce a text to its lower-cased content-word lemmas.

    Runs the module-level spaCy pipeline ``nlp`` on ``text`` and keeps only the
    tokens whose coarse part-of-speech tag is ADJ, ADV, INTJ, NOUN, PROPN or VERB.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        Lower-cased lemmas of the kept tokens, in document order, joined by single
        spaces. The result is an empty string when no token qualifies. That
        includes input that yields no tokens at all, which used to raise KeyError
        because the intermediate DataFrame had no 'lemma_' column.
    """
    kept_pos = {"ADJ", "ADV", "INTJ", "NOUN", "PROPN", "VERB"}
    # Filter the tokens directly: the per-document pandas DataFrame built before
    # was used only for this filter and collected attributes that were never read.
    return " ".join(
        token.lemma_.lower() for token in nlp(text) if token.pos_ in kept_pos
    )

# Store the cleaned text of every post body in a new 'New' column.
so_corpus["New"] = so_corpus["body"].apply(spacy_function)

In [None]:
# Convert cleaned corpus
# Keep the cleaned 'New' column as a plain Python list and show one arbitrary entry.
so_corpus_list_cleaned = so_corpus['New'].tolist()
print(so_corpus_list_cleaned[623456])

In [None]:
# Prepare the training input from the cleaned posts:
# one list of lower-cased NLTK word tokens per cleaned post.
so_data_cleaned = [
    [token.lower() for token in word_tokenize(post)]
    for post in so_corpus_list_cleaned
]

In [None]:
# Train two Word2Vec models on the tokenized cleaned posts. Both use the same
# hyper-parameters and differ only in the `sg` switch.
# CBOW model (no `sg` argument passed)
model_cleaned_so1 = Word2Vec(
    sentences=so_data_cleaned,
    size=100,
    window=5,
    min_count=10,
    workers=4,
)
# Skip-gram model (sg=1)
model_cleaned_so2 = Word2Vec(
    sentences=so_data_cleaned,
    size=100,
    window=5,
    min_count=10,
    workers=4,
    sg=1,
)

In [None]:
# Save models
# The with-blocks close each file handle as soon as its dump finishes; the bare
# open(...) calls used before left the handles open for the rest of the session.
with open('model_cleaned_so1.model', 'wb') as model_file:
    pickle.dump(model_cleaned_so1, model_file)
with open('model_cleaned_so2.model', 'wb') as model_file:
    pickle.dump(model_cleaned_so2, model_file)

In [None]:
# Example results MY CBOW MODEL after data cleaning
# Queries go through `model_cleaned_so1.wv`, the trained word vectors; the
# model-level similarity/most_similar methods are deprecated shims around it.
# A word missing from the vocabulary raises KeyError, so each lookup is guarded and
# a missing word (e.g. 'c#') is reported instead of aborting the rest of the cell.
for other in ('java', '.net', 'python'):
    try:
        print(model_cleaned_so1.wv.similarity('javascript', other))
    except KeyError as err:
        print("similarity('javascript', {!r}) skipped: {}".format(other, err))

for word in ('javascript', 'java', 'python', 'php', 'c#'):
    try:
        print(model_cleaned_so1.wv.most_similar(word))
    except KeyError as err:
        print("most_similar({!r}) skipped: {}".format(word, err))

In [None]:
# Example results MY SKIP GRAM MODEL data cleaning
# Queries go through `model_cleaned_so2.wv`, the trained word vectors; the
# model-level similarity/most_similar methods are deprecated shims around it.
# A word missing from the vocabulary raises KeyError, so each lookup is guarded and
# a missing word (e.g. 'c#') is reported instead of aborting the rest of the cell.
for left, right in (
    ('sql', 'mysql'),
    ('javascript', 'java'),
    ('javascript', '.net'),
    ('javascript', 'python'),
):
    try:
        print(model_cleaned_so2.wv.similarity(left, right))
    except KeyError as err:
        print("similarity({!r}, {!r}) skipped: {}".format(left, right, err))

for word in ('javascript', 'java', 'python', 'php', 'c#'):
    try:
        print(model_cleaned_so2.wv.most_similar(word))
    except KeyError as err:
        print("most_similar({!r}) skipped: {}".format(word, err))

In [None]:
# Nearest neighbours of the single-letter language names in the cleaned skip-gram
# model. Lookups use `.wv` (the model-level method is a deprecated shim) and are
# guarded so a word missing from the vocabulary is reported instead of raising.
for word in ('r', 'c'):
    try:
        print(model_cleaned_so2.wv.most_similar(word))
    except KeyError as err:
        print("most_similar({!r}) skipped: {}".format(word, err))

# Trening na łańcuchach tagów

In [None]:
# Get Tags Chains
tc_corpus = pd.read_csv('./data/tags_chains.csv')
# Force every entry of the 'tags' column to str and confirm the resulting dtype.
tc_corpus['tags'] = tc_corpus['tags'].astype(str)
print(tc_corpus['tags'].dtype)
# Convert corpus to list
tc_corpus_list = tc_corpus['tags'].tolist()
# Show a sample of the tag chains just loaded. This line used to print from
# so_corpus_list_cleaned (the cleaned post bodies), which is unrelated to this
# section and only exists if the spaCy cleaning cells were run first. The index is
# clamped so a shorter tag-chain list still yields a sample.
print(tc_corpus_list[min(123456, len(tc_corpus_list) - 1)])

In [None]:
# Prepare the training input from the tag chains:
# one list of lower-cased NLTK word tokens per chain.
tc_data = [
    [token.lower() for token in word_tokenize(chain)]
    for chain in tc_corpus_list
]

In [None]:
# Preview the first tokenized tag chains. The tokenized chains are stored in the
# plain list `tc_data` (no `tc_data_cleaned` is ever created, and a list has no
# .head()), so slice it instead.
tc_data[:5]

In [None]:
# Train my own model on tokenized data
# Create CBOW model
model_tc1 = gensim.models.Word2Vec(tc_data, min_count = 10, size = 100, workers = 4, window = 5)
# Create Skip Gram model
model_tc2 = gensim.models.Word2Vec(tc_data, min_count = 10, size = 100, workers = 4, window = 5, sg = 1)
# Save models
# The with-blocks close each file handle as soon as its dump finishes; the bare
# open(...) calls used before left the handles open for the rest of the session.
with open('model_tc1.model', 'wb') as model_file:
    pickle.dump(model_tc1, model_file)
with open('model_tc2.model', 'wb') as model_file:
    pickle.dump(model_tc2, model_file)

# Testowanie modeli

In [None]:
# Language groups that each tag is compared against.
groups = [
    'java', 'c', 'python', 'c++',
    '.net', 'javascript', 'php', 'swift',
    'sql', 'ruby', 'delphi', 'go',
    'd', 'r', 'perl', 'matlab',
]
# Group names kept separately from `groups` under the name `not_found`.
not_found = [
    'c#',
    'visual_basic',
    'object_pascal',
    'objective-c',
    'assembly_language',
]
# Tags whose similarity to every group is scored.
tags = [
    'html', 'python', 'r', 'css', 'regex', 'mysql',
    'angular', 'django', 'node', 'node.js', 'nodejs',
]

In [None]:
# Score every (group, tag) pair with the cleaned skip-gram model.
# Iterate over the lists themselves: the former range(1, len(...)) loops started at
# index 1 and so silently skipped the first tag ('html') and first group ('java').
# Similarities are read from `.wv`; the model-level method is a deprecated shim.
rows = [
    (group, tag, model_cleaned_so2.wv.similarity(tag, group))
    for tag in tags
    for group in groups
]
# Build the frame once from all rows instead of appending row by row via .loc.
mapa = pd.DataFrame(rows, columns=['group', 'tag', 'similarity_score'])

In [None]:
# Rank the pairs from most to least similar, then write them to a comma-separated file.
mapa = mapa.sort_values(by='similarity_score', ascending=False)
mapa.to_csv('mapa4.csv', sep=',')