<h1>Preprocesing</h1>

In [3]:
from __future__ import print_function
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from time import time
import os.path
import re, string, ntpath, keyword, json, codecs

skip_projects = [ "androidannotations",
                  "bigbluebutton",
                  "cassandra",
                  "elasticsearch",
                  "hibernate-orm",
                  "liferay-portal",
                  "netty",
                  "platform_frameworks_base",
                  "spring-framework",
                  "wildfly"]

project_type_map = {"java" : ".java",
                    "python" : ".py",
                    "ruby" : ".rb",
                    "php" : ".php",
                    "javascript" : ".js"}

def load_config(config_file):
    """
    Load projects configuration file.
    """
    with open(config_file) as data_file:    
        config_data = json.load(data_file)
    return config_data

base_dir = "/home/hshahin/workspaces/Spring2016_SE_Project"
data_dir = os.path.join(base_dir , "data")
config_file = "projects_config.json"
config_data = load_config(os.path.join(base_dir , config_file))

project_path = '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations'

# Python keywords
python_keywords = keyword.kwlist

# Java keywords from https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
java_keywords = ["abstract","continue","for","new","switch","assert","default","goto","package","synchronized",
                 "boolean","do","if","private","this","break","double","implements","protected","throw",
                 "byte","else","import","public","throws","case","enum","instanceof","return","transient","catch",
                 "extends","int","short","try","char","final","interface","static","void","class","finally","long",
                 "strictfp","volatile","const","float","native","super","while"]

# Ruby keywords from http://docs.ruby-lang.org/en/2.2.0/keywords_rdoc.html
ruby_keywords = ["__ENCODING__","__LINE__","__FILE__","BEGIN","END","alias","and","begin","break",
                 "case","class","def","defined?","do","else","elsif","end","ensure","false","for","if",
                 "in","module","next","nil","not","or","redo","rescue","retry","return","self","super",
                 "then","true","undef","unless","until","when","while","yield"]

# PHP keywords form http://php.net/manual/en/reserved.keywords.php
php_keywords = ["__halt_compiler","abstract","and","array","as","break","callable","case","catch","class","clone",
                "const","continue","declare","default","die","do","echo","else","elseif","empty","enddeclare",
                "endfor","endforeach","endif","endswitch","endwhile","eval","exit","extends","final","finally",
                "for","foreach","function","global","goto","if","implements","include","include_once","instanceof",
                "insteadof","interface","isset","list","namespace","new","or","print","private","protected",
                "public","require","require_once","return","static","switch","throw","trait","try","unset","use",
                "var","while","xor","yield"]

all_keywords = python_keywords + java_keywords + ruby_keywords + php_keywords


def path_leaf(path):
    head, tail = ntpath.split(path)
    return head, tail

# split camel case tokens
_underscorer1 = re.compile(r'(.)([A-Z][a-z]+)')
_underscorer2 = re.compile('([a-z0-9])([A-Z])')


def camel_to_spaces(s):
    """
    convert camel case into spaces seperated
    """
    subbed = _underscorer1.sub(r'\1 \2', s)
    return _underscorer2.sub(r'\1 \2', subbed).lower()

def snake_to_spaces(snake_cased_str):
    """
    convert snake case into spaces seperated
    """
    separator = "_"
    components = snake_cased_str.split(separator)
    if components[0] == "":
      components = components[1:]
    if components[-1] == "":
      components = components[:-1]
    if len(components) > 1:
      spaced_str = components[0].lower()
      for x in components[1:]:
        spaced_str += " " + x.lower()
    else:
      spaced_str = components[0]
    return spaced_str


def file_preprocessing(input_file, output_file):
    """
    - replace punctuations with spaces
    - stemming
    - camel to spaces and snake to spaces
    - remove language spesific keywords
    - write the entire project snapshot into one file under project root folder
    """
    # print("processing file " + input_file)
    # replace the punctuations with space
    replace_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    # stemming
    stemmer = PorterStemmer()

    with open(input_file, 'r') as inFile, open(output_file,'w') as outFile:
        for line in inFile:
            # replace punctuations
            # convert camel case into space separated
            # convert snake case into space separated             
            # remove language keywords
            line_witout_puncs = ' '.join([snake_to_spaces(camel_to_spaces(word)) for word in line.translate(replace_punctuation).split()
                  if len(word) >=4 and word not in stopwords.words('english') and word not in all_keywords])

            # stemming
            singles = []
            for plural in line_witout_puncs.split():
                try:
                    singles.append(stemmer.stem(plural))
                except UnicodeDecodeError:
                    print(plural) 

            line_stemmed = ' '.join(singles)
            print(line_stemmed, file=outFile)


def return_file_type(project_path, file_type):
    project_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(project_path)
             for name in files
             if name.endswith((file_type))]
    return project_files

def project_preprocessing(project_path, file_type):
    # print ("processing project "+ project_path)
    # process project source code files and save each file as .*.proc 
    project_files = return_file_type (project_path, file_type)
    for source_file in project_files:
        head, tail = path_leaf(source_file)
        proc_file = os.path.join(head , tail + '.proc')
        file_preprocessing(source_file, proc_file)
        
    # concatenate all processed project files into one file under root directory
    project_proc_files = return_file_type (project_path, file_type + '.proc')
    with open(os.path.join(project_path , "concatenated.out"), 'w') as outfile:
        for fname in project_proc_files:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

In [12]:
for project_name, project_type in config_data.items():
    t0 = time()
    project_preprocessing(os.path.join(data_dir, project_name), project_type_map[str(project_type)])
    print("processing project: " + project_name + " done in %0.3fs." % (time() - t0))

processing project: spring-framework done in 490.810s.
processing project: elasticsearch done in 447.135s.
processing project: androidannotations done in 32.293s.
processing project: bigbluebutton done in 60.167s.
processing project: wildfly done in 425.471s.
processing project: netty done in 150.807s.
processing project: liferay-portal done in 2196.920s.
processing project: platform_frameworks_base done in 976.352s.
processing project: cassandra done in 240.609s.
processing project: hibernate-orm done in 371.642s.


<h1>Topic Modeling</h1>

In [4]:
from __future__ import print_function
import os
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_features = 1000
n_topics = 10
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

processed_path = []
for project_name, project_type in config_data.items():
    processed_path.append(os.path.join(data_dir, project_name, "concateneated.out"))

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=0.05, max_features=n_features, stop_words='english')

t0 = time()
tf = tf_vectorizer.fit_transform(processed_path)
print("done in %0.3fs." % (time() - t0))

print("Fitting LDA models with tf features")
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf features for LDA...
done in 0.003s.
Fitting LDA models with tf features
done in 0.021s.

Topics in LDA model:
Topic #0:
androidannotations cassandra portal liferay orm hibernate bigbluebutton platform_frameworks_base netty framework spring elasticsearch wildfly
Topic #1:
bigbluebutton elasticsearch framework netty orm androidannotations wildfly spring hibernate platform_frameworks_base portal cassandra liferay
Topic #2:
wildfly spring hibernate elasticsearch framework bigbluebutton netty liferay orm cassandra androidannotations portal platform_frameworks_base
Topic #3:
platform_frameworks_base wildfly liferay portal netty bigbluebutton androidannotations framework elasticsearch spring cassandra hibernate orm
Topic #4:
orm hibernate wildfly platform_frameworks_base spring cassandra bigbluebutton elasticsearch androidannotations portal liferay netty framework
Topic #5:
liferay framework androidannotations portal orm spring bigbluebutton elasticsearch hibernate platform_fram

In [5]:
def print_full(x):
    pd.set_option('display.max_cols')
    print(x)
    pd.reset_option('display.max_rows')

In [6]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
df = pd.DataFrame(lda.components_)
df.columns = tf_feature_names
df
# print_full(df)


Unnamed: 0,androidannotations,bigbluebutton,cassandra,elasticsearch,framework,hibernate,liferay,netty,orm,platform_frameworks_base,portal,spring,wildfly
0,1.160414,0.780085,1.119434,0.682189,0.743249,0.780858,1.029484,0.759556,0.783337,0.77515,1.086674,0.735939,0.581064
1,0.799101,1.202158,0.650373,0.951796,0.862462,0.68827,0.616478,0.843549,0.841355,0.677368,0.652624,0.714402,0.719431
2,0.639969,0.735402,0.687752,0.779103,0.748662,0.782179,0.705713,0.724742,0.693375,0.632709,0.638909,0.784749,0.859564
3,0.760091,0.780247,0.702219,0.72837,0.754778,0.66953,0.869986,0.784969,0.645014,1.113746,0.789782,0.721883,1.013
4,0.776832,0.803112,0.809359,0.780224,0.662507,1.097508,0.721147,0.697893,1.168896,0.818353,0.731458,0.810408,0.845535
5,0.819597,0.778321,0.673944,0.772531,0.851114,0.719631,0.89312,0.696347,0.790476,0.705611,0.800823,0.779772,0.674371
6,0.762848,0.797694,0.686569,0.83469,1.020041,0.882066,0.743592,0.703694,0.670713,0.719448,0.715216,1.173972,0.665509
7,0.81362,0.680634,0.644463,1.009442,0.77404,0.67833,0.801214,0.694047,0.701886,0.725362,0.655592,0.79695,0.640738
8,0.673608,0.754491,0.661026,0.770228,0.789278,0.738344,0.919016,0.85192,0.724283,0.733501,0.833331,0.799219,0.748946
9,0.698543,0.771175,0.743619,0.818673,0.773928,0.684413,0.721399,1.201284,0.747644,0.717636,0.786148,0.762099,0.767659


In [7]:
processed_path = []
for project_name, project_type in config_data.items():
    processed_path.append(os.path.join(data_dir, project_name, "concateneated.out"))

In [9]:
print(processed_path)

['/home/hshahin/workspaces/Spring2016_SE_Project/data/platform_frameworks_base/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/bigbluebutton/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/cassandra/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/spring-framework/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/netty/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/liferay-portal/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/elasticsearch/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/wildfly/concateneated.out', '/home/hshahin/workspaces/Spring2016_SE_Project/data/hibernate-orm/concateneated.out']
