<h1>Globals</h1>

In [9]:
from __future__ import print_function
from datetime import datetime
from git import Repo, Git
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time
import os.path
import re, string, ntpath, keyword, json, codecs
import threading
import shutil, errno


skip_projects = [ "androidannotations",
                  "bigbluebutton",
                  "cassandra",
                  "elasticsearch",
                  "hibernate-orm",
                  "liferay-portal",
                  "netty",
                  "platform_frameworks_base",
                  "spring-framework",
                  "wildfly"]
tag_names = ["2016-06","2016-01","2015-06","2015-01","2014-06","2014-01","2013-06","2013-01",
"2012-06","2012-01","2011-06","2011-01","2010-06","2010-01","2009-06","2009-01",
"2008-06","2008-01","2007-06","2007-01","2006-06","2006-01","2005-06","2005-01",
"2004-06","2004-01","2003-06","2003-01"]

project_type_map = {"java" : ".java",
                    "python" : ".py",
                    "ruby" : ".rb",
                    "php" : ".php",
                    "javascript" : ".js"}

def load_config(config_file):
    """
    Load projects configuration file.
    """
    with open(config_file) as data_file:    
        config_data = json.load(data_file)
    return config_data

def copy_folder(src, dst):
    try:
        if os.path.exists(dst):
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    except OSError as exc: # python >2.5
        if exc.errno == errno.ENOTDIR:
            shutil.copy(src, dst)
        else: raise

base_dir = "/home/hshahin/workspaces/Spring2016_SE_Project"
data_dir = os.path.join(base_dir , "data")
config_file = "projects_config.json"
config_data = load_config(os.path.join(base_dir , config_file))

<h1>Creating tags functions</h1>

In [10]:
def get_date_time(epoch):
    '''
    convert epoch to date_time
    '''
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

def tag_exists(path, tag_name):
    repo = Repo(path)
    
    return True if tag_name in repo.tags else False

def get_epoch(year, month='01'):
    """
    calculate the epoch of first day of a year-month
    """
    pattern = '%Y.%m.%d %H:%M:%S'
    return int(time.mktime(time.strptime(str(year) + '.' + str(month) + '.01 00:00:00', pattern)))


def create_tags(path):
    '''
    takes repo path and creates tags for first commit in Jan and Jun. for every year
    # get the list of commits
    # get the latest commit date
    # current_year is the year from that date
    # loop through the list of commit to find the commit having a date equal or just after 1/1/current_year
    # once found create a tage with the current_year name on it AND
    # subtract 1 from the year and continue.

    '''
    repo = Repo(path)

    # get the list of commits
    commits = list(repo.iter_commits())

    # get the latest commit date, current_year is the year from that date
    current_year = datetime.fromtimestamp(commits[0].committed_date).year


    for idx, commit in enumerate(commits):
        # time.sleep(2)
        # print(commits[idx].hexsha)

        current_year_01 = str(current_year)+'-01'
        current_year_06 = str(current_year)+'-06'

        try:
            if get_epoch(current_year, '01') > commit.committed_date and \
                    int(time.time()) > get_epoch(current_year, '01')  and \
                    idx !=0:
                if str(current_year_01) not in repo.tags and idx != 0:
                    print(commits[idx-1].hexsha+' '+get_date_time(commits[idx-1].committed_date)+' '+current_year_01)
                    past = repo.create_tag(current_year_01, ref=commits[idx-1],
                                      message="This is a tag to mark the first commit in year %s" % current_year_01)
                current_year = datetime.fromtimestamp(commit.committed_date).year

            if get_epoch(current_year, '06') > commit.committed_date and \
                    int(time.time()) > get_epoch(current_year, '06') and \
                idx != 0:
                if str(current_year_06) not in repo.tags:
                    print(commits[idx-1].hexsha+' '+get_date_time(commits[idx-1].committed_date)+' '+current_year_06)
                    past = repo.create_tag(current_year_06, ref=commits[idx-1],
                                      message="This is a tag to mark the first commit in year %s" % current_year_06)
        except AttributeError:
            pass

def checkout_tag(path, tag_name):
    '''
    checks out a tag if it exists
    '''
    repo = Repo(path)
    git = Git(path)
    if tag_name in repo.tags:
        git.checkout(tag_name)

def delete_tags(path):
    '''
    remove all tags in a given repo
    '''

    repo = Repo(path)

    for tag in repo.tags:
        repo.delete_tag(tag)

<h1>Create tags every 6 months for each repo</h1>

In [11]:
# for project_name, project_type in config_data.items():
#     print("Processing project: " + project_name )
#     t0 = time.time()
#     delete_tags(os.path.join(data_dir, project_name))
#     create_tags(os.path.join(data_dir, project_name))
#     print("Project: " + project_name + " taged in %0.3fs." % (time.time() - t0))

<h1>Preprocesing functions</h1>

In [12]:
# Python keywords
python_keywords = keyword.kwlist

# Java keywords from https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
java_keywords = ["abstract","continue","for","new","switch","assert","default","goto","package","synchronized",
                 "boolean","do","if","private","this","break","double","implements","protected","throw",
                 "byte","else","import","public","throws","case","enum","instanceof","return","transient","catch",
                 "extends","int","short","try","char","final","interface","static","void","class","finally","long",
                 "strictfp","volatile","const","float","native","super","while"]

# Ruby keywords from http://docs.ruby-lang.org/en/2.2.0/keywords_rdoc.html
ruby_keywords = ["__ENCODING__","__LINE__","__FILE__","BEGIN","END","alias","and","begin","break",
                 "case","class","def","defined?","do","else","elsif","end","ensure","false","for","if",
                 "in","module","next","nil","not","or","redo","rescue","retry","return","self","super",
                 "then","true","undef","unless","until","when","while","yield"]

# PHP keywords form http://php.net/manual/en/reserved.keywords.php
php_keywords = ["__halt_compiler","abstract","and","array","as","break","callable","case","catch","class","clone",
                "const","continue","declare","default","die","do","echo","else","elseif","empty","enddeclare",
                "endfor","endforeach","endif","endswitch","endwhile","eval","exit","extends","final","finally",
                "for","foreach","function","global","goto","if","implements","include","include_once","instanceof",
                "insteadof","interface","isset","list","namespace","new","or","print","private","protected",
                "public","require","require_once","return","static","switch","throw","trait","try","unset","use",
                "var","while","xor","yield"]

all_keywords = python_keywords + java_keywords + ruby_keywords + php_keywords


def path_leaf(path):
    head, tail = ntpath.split(path)
    return head, tail

# split camel case tokens
_underscorer1 = re.compile(r'(.)([A-Z][a-z]+)')
_underscorer2 = re.compile('([a-z0-9])([A-Z])')


def camel_to_spaces(s):
    """
    convert camel case into spaces seperated
    """
    subbed = _underscorer1.sub(r'\1 \2', s)
    return _underscorer2.sub(r'\1 \2', subbed).lower()

def snake_to_spaces(snake_cased_str):
    """
    convert snake case into spaces seperated
    """
    separator = "_"
    components = snake_cased_str.split(separator)
    if components[0] == "":
      components = components[1:]
    if components[-1] == "":
      components = components[:-1]
    if len(components) > 1:
      spaced_str = components[0].lower()
      for x in components[1:]:
        spaced_str += " " + x.lower()
    else:
      spaced_str = components[0]
    return spaced_str


def file_preprocessing(input_file, output_file):
    """
    - replace punctuations with spaces
    - stemming
    - camel to spaces and snake to spaces
    - remove language spesific keywords
    - write the entire project snapshot into one file under project root folder
    """
    # print("processing file " + input_file)
    # replace the punctuations with space
    replace_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    # stemming
    stemmer = PorterStemmer()

    with open(input_file, 'r', encoding='utf-8', errors='replace') as inFile, open(output_file,'w') as outFile:
        for line in inFile:
            # replace punctuations
            # convert camel case into space separated
            # convert snake case into space separated             
            # remove language keywords
            line_witout_puncs = ' '.join([snake_to_spaces(camel_to_spaces(word)) 
                                          for word in line.translate(replace_punctuation).split()
                                          if len(word) >=4 and word not in stopwords.words('english') 
                                          and word not in all_keywords])

            # stemming
            singles = []
            for plural in line_witout_puncs.split():
                try:
                    singles.append(stemmer.stem(plural))
                except UnicodeDecodeError:
                    print(plural) 

            line_stemmed = ' '.join(singles)
            print(line_stemmed, file=outFile)


def return_file_type(project_path, file_type):
    project_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(project_path)
             for name in files
             if name.endswith((file_type))]
    return project_files[:5]

def project_preprocessing(project_path, file_type, tag_name):
    # print ("processing project "+ project_path)
    # process project source code files and save each file as .*.proc 
    project_files = return_file_type (project_path, file_type)
    for source_file in project_files:
        head, tail = path_leaf(source_file)
        proc_file = os.path.join(head , tail + '.proc')
        file_preprocessing(source_file, proc_file)
        
    # concatenate all processed project files into one file under root directory
    project_proc_files = return_file_type (project_path, file_type + '.proc')
    with open(os.path.join(project_path , tag_name + "-concatenated.out"), 'w') as outfile:
        for fname in project_proc_files:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

<h1>checkout tags in separate folders</h1>

In [19]:
# create folder project_tags
# for each tag if tag exists
# copy the project into project_tag/tag_name
# checkout ptoject to tag_name
# delete .git folder

def checkout_projects():
    for project_name, project_type in config_data.items():
        project_path = os.path.join(data_dir, project_name)
        project_tags_path = project_path + '-tags'
        
        if not os.path.exists(project_tags_path):
            os.makedirs(project_tags_path)
            
        repo = Repo(project_path)
        for tag_name in tag_names:
            if tag_exists(project_path, tag_name):
                print(project_name+' '+tag_name)
                current_tag_path = os.path.join(project_tags_path, tag_name)
                copy_folder(project_path, current_tag_path)
                checkout_tag(current_tag_path, tag_name)
                os.chdir(current_tag_path)
                shutil.rmtree(os.path.join(current_tag_path, '.git'))


checkout_projects()

django 2016-01
django 2015-06
django 2015-01
django 2014-06
django 2014-01


GitCommandError: 'git checkout 2014-01' returned with exit code 1
stderr: 'error: Your local changes to the following files would be overwritten by checkout:
	docs/_theme/djangodocs-epub/static/docicons-behindscenes.png
	docs/_theme/djangodocs-epub/static/docicons-note.png
	docs/_theme/djangodocs-epub/static/docicons-philosophy.png
	docs/_theme/djangodocs-epub/static/docicons-warning.png
Please, commit your changes or stash them before you can switch branches.
Aborting'

<h1>Run preprocessing</h1>

In [5]:
def run_preprocessing(project_path, project_name, tag_name):
#     print("run_preprocessing: " + tag_name)
#     print("run_preprocessing: " + project_path)
    if tag_exists(project_path, tag_name):
        # try: 
        # check out year-month tag
        # checkout_tag(project_path, tag_name)
        #         print("checkout project: " + project_name + "\t\t tag " + tag_name )
        t0 = time.time()
        # processing project if tag exists
        project_preprocessing(project_path, project_type_map[str(project_type)], tag_name)
        print("processing project: " + project_name + "\t\t tag " 
              + tag_name + " done in %0.3fs." % (time.time() - t0))
        #         except:
        #             print("Error happened in project: " + project_name + "\t\t tag " + tag_name )
        #             pass

current_tag = '2016-01'
for project_name, project_type in config_data.items():
    project_path = os.path.join(data_dir, project_name)

    preprocessing_threads = threading.Thread(target=run_preprocessing, 
                                             args=(project_path, project_name, current_tag))
    preprocessing_threads.start()

run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/padrino-framework
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/liferay-portal
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/CodeIgniter
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/django
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/pakyow
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/bigbluebutton
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/spring-framework
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/laravel
run_preprocessing: 2016-01
run_preprocessing: /home/hshahin/workspaces/Spring2016_SE_Project/data/ninja
run_preprocess

<h1>Topic Modeling</h1>

In [72]:
from __future__ import print_function
import os
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pandas as pd

n_features = 10000000
n_topics = 20
# n_top_words = 100


config_file = "projects_config_webFrameworks.json"
# config_file = 'projects_config_topGithub.json'
selected_projects = load_config(os.path.join(base_dir , config_file))


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

def get_top_words(model, feature_names, n_top_words):
    df = pd.DataFrame(columns=['word'+str(i) for i in range(n_top_words)])
    for topic_idx, topic in enumerate(model.components_):
        df.loc['topic#'+str(topic_idx)] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        df.loc['freq#'+str(topic_idx)] = [topic[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    
    return df

def get_top_doc_topic(projects_topics, project_names, n_top_words):
    df = pd.DataFrame(columns=['word'+str(i) for i in range(n_top_words)])
    for topic_idx, topic in enumerate(model.components_):
        df.loc['topic#'+str(topic_idx)] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        df.loc['freq#'+str(topic_idx)] = [topic[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    
    return df

def print_full(x):
    pd.set_option('display.max_colwidth', 1000)
    print(x)
    pd.reset_option('display.max_rows')

# Read projects into strings
# processed_path = []
projects_data = [''] * len(selected_projects)
for i, project_name in enumerate(selected_projects.keys()):
    processed_path = os.path.join(data_dir, project_name, "concatenated.out")
    with open(processed_path, 'r') as myfile:
        projects_data[i] = myfile.read().replace('\n', ' ')

print('num of projects: ', len(projects_data))

   
# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.8, min_df=0.2, max_features=n_features, stop_words='english')

t0 = time()
tf = tf_vectorizer.fit_transform(projects_data)
print("done in %0.3fs." % (time() - t0))

print('tf shape:', tf.shape)

print("Fitting LDA models with tf features")
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=100,
                                learning_method='online', learning_offset=10.,
                                random_state=0, n_jobs=4)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))


num of projects:  22
Extracting tf features for LDA...
done in 1.629s.
tf shape: (22, 1598)
Fitting LDA models with tf features
done in 10.604s.


## 1- Topic-word

In [71]:
print("\nTopics-words in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
get_top_words(lda, tf_feature_names, 20)



Topics-words in LDA model:


Unnamed: 0,word0,word1,word2,word3,word4,word5,word6,word7,word8,word9,word10,word11,word12,word13,word14,word15,word16,word17,word18,word19
topic#0,column,command,schema,prototyp,children,middlewar,migrat,strict,compos,constructor,relationship,export,task,sync,alter,adapt,flag,offset,obj,drop
freq#0,789.712,287.725,271.509,176.858,166.602,151.531,149.375,144.263,140.274,139.569,139.072,111.291,107.198,106.801,104.254,96.938,92.9824,91.0365,90.5351,88.856
topic#1,assert,licens,choic,softwar,input,impl,mock,articl,true,apach,distribut,label,fals,standalon,languag,permiss,column,widget,constant,java
freq#1,0.115371,0.0917318,0.0665962,0.065389,0.0644337,0.0640868,0.0630121,0.0623615,0.0622206,0.0615037,0.0610752,0.0610359,0.0608229,0.0603542,0.0600196,0.0595328,0.05923,0.0591795,0.0590991,0.0584603
topic#2,servic,softwar,node,assert,oper,licens,copyright,deploy,plugin,ident,public,record,label,builder,contributor,var,input,languag,warranti,free
freq#2,0.0725049,0.0666412,0.0633478,0.0620124,0.0609478,0.0596735,0.0595848,0.0585365,0.0578881,0.0574071,0.0572902,0.0571437,0.0570751,0.0566868,0.0566846,0.0564359,0.0561919,0.0560407,0.0560086,0.0554114
topic#3,packag,servic,node,releas,licens,deploy,plugin,project,softwar,oper,builder,publish,scope,input,assert,dir,free,platform,asset,site
freq#3,0.0618733,0.0615614,0.0597704,0.0570341,0.0566994,0.0561992,0.0556783,0.0553324,0.0550835,0.0549528,0.0547819,0.0545687,0.0540394,0.053904,0.0539021,0.0537346,0.0537247,0.0533824,0.053366,0.0533294
topic#4,licens,assert,mock,impl,apach,distribut,standalon,languag,articl,java,builder,flash,verifi,constant,mail,true,copyright,googl,matcher,condit
freq#4,2012.5,1305.91,527.302,485.644,440.681,386.738,373.415,349.639,337.767,331.951,294.843,288.891,268.973,261.728,242.445,238.857,229.498,214.808,210.878,200.048


In [16]:
# processed_path = []
# for project_name, project_type in config_data.items():
#     processed_path.append(os.path.join(data_dir, project_name, "concateneated.out"))

In [7]:
# print(processed_path)

# 2- Project-topic

In [74]:
projects_topics = lda.transform(tf)

In [82]:
# get_top_words(projects_topics, tf_feature_names, 20)
projects_topics[0]

array([  5.00000012e-02,   5.00000000e-02,   5.00000000e-02,
         5.00000000e-02,   5.00000012e-02,   5.00000012e-02,
         5.00000013e-02,   5.00000013e-02,   5.00000012e-02,
         5.00000012e-02,   5.00000000e-02,   5.00000000e-02,
         5.00000013e-02,   5.00000011e-02,   5.00000012e-02,
         4.29040500e+04,   5.00000014e-02,   5.00000012e-02,
         5.00000012e-02,   5.00000012e-02])