<h1>Preprocesing</h1>

In [12]:
from __future__ import print_function
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os, re, string, ntpath, keyword

project_path = '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations'

# Python keywords
python_keywords = keyword.kwlist

# Java keywords from https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
java_keywords = ["abstract","continue","for","new","switch","assert","default","goto","package","synchronized",
                 "boolean","do","if","private","this","break","double","implements","protected","throw",
                 "byte","else","import","public","throws","case","enum","instanceof","return","transient","catch",
                 "extends","int","short","try","char","final","interface","static","void","class","finally","long",
                 "strictfp","volatile","const","float","native","super","while"]

# Ruby keywords from http://docs.ruby-lang.org/en/2.2.0/keywords_rdoc.html
ruby_keywords = ["__ENCODING__","__LINE__","__FILE__","BEGIN","END","alias","and","begin","break",
                 "case","class","def","defined?","do","else","elsif","end","ensure","false","for","if",
                 "in","module","next","nil","not","or","redo","rescue","retry","return","self","super",
                 "then","true","undef","unless","until","when","while","yield"]

# PHP keywords form http://php.net/manual/en/reserved.keywords.php
php_keywords = ["__halt_compiler","abstract","and","array","as","break","callable","case","catch","class","clone",
                "const","continue","declare","default","die","do","echo","else","elseif","empty","enddeclare",
                "endfor","endforeach","endif","endswitch","endwhile","eval","exit","extends","final","finally",
                "for","foreach","function","global","goto","if","implements","include","include_once","instanceof",
                "insteadof","interface","isset","list","namespace","new","or","print","private","protected",
                "public","require","require_once","return","static","switch","throw","trait","try","unset","use",
                "var","while","xor","yield"]

all_keywords = python_keywords + java_keywords + ruby_keywords + php_keywords


def path_leaf(path):
    head, tail = ntpath.split(path)
    return head, tail

# split camel case tokens
_underscorer1 = re.compile(r'(.)([A-Z][a-z]+)')
_underscorer2 = re.compile('([a-z0-9])([A-Z])')


def camel_to_spaces(s):
    """
    convert camel case into spaces seperated
    """
    subbed = _underscorer1.sub(r'\1 \2', s)
    return _underscorer2.sub(r'\1 \2', subbed).lower()

def snake_to_spaces(snake_cased_str):
    """
    convert snake case into spaces seperated
    """
    separator = "_"
    components = snake_cased_str.split(separator)
    if components[0] == "":
      components = components[1:]
    if components[-1] == "":
      components = components[:-1]
    if len(components) > 1:
      spaced_str = components[0].lower()
      for x in components[1:]:
        spaced_str += " " + x.lower()
    else:
      spaced_str = components[0]
    return spaced_str


def file_preprocessing(input_file, output_file):
    """
    - replace punctuations with spaces
    - stemming
    - camel to spaces and snake to spaces
    - remove language spesific keywords
    - write the entire project snapshot into one file under project root folder
    """

    # replace the punctuations with space
    replace_punctuation = string.maketrans(string.punctuation, ' '*len(string.punctuation))
    # stemming
    stemmer = PorterStemmer()

    with open(input_file,'r') as inFile, open(output_file,'w') as outFile:
        for line in inFile:
            # replace punctuations and convert camel case into space seperated
            line_witout_puncs = ' '.join([snake_to_spaces(camel_to_spaces(word)) for word in line.translate(replace_punctuation).split()
                  if len(word) >=4 and word not in stopwords.words('english') and word not in all_keywords])

            # stemming
            singles = []
            for plural in line_witout_puncs.split():
                singles.append(stemmer.stem(plural))
            line_stemmed = ' '.join(singles)

            # Remove language keywords

            print(line_stemmed, file=outFile)


def return_file_type(project_path, file_type):
    project_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(project_path)
             for name in files
             if name.endswith((file_type))]
    return project_files

def project_preprocessing(project_path):
    # process project source code files and save each file as .java.proc 
    project_files = return_file_type (project_path, ".java")
    for source_file in project_files:
        head, tail = path_leaf(source_file)
        proc_file = os.path.join(head , tail + '.proc')
        file_preprocessing(source_file, proc_file)
        
    # concatenate all processed project files into one file under root directory
    project_proc_files = return_file_type (project_path, ".java.proc")
    with open(os.path.join(project_path , "concatenated.out"), 'w') as outfile:
        for fname in project_proc_files:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)
                    
project_preprocessing(project_path)

<h1>Topic Modeling</h1>

In [2]:
from __future__ import print_function
import os
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups

n_features = 1000
n_topics = 10
n_top_words = 20

def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

project_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(project_path)
             for name in files
             if name.endswith((".java.proc"))]

# Use tf (raw term count) features for LDA.
print("Extracting tf features for LDA...")
tf_vectorizer = CountVectorizer(max_df=0.95, min_df=2, max_features=n_features, stop_words='english')

t0 = time()
tf = tf_vectorizer.fit_transform(project_files)
print("done in %0.3fs." % (time() - t0))

print("Fitting LDA models with tf features")
lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                learning_method='online', learning_offset=50.,
                                random_state=0)
t0 = time()
lda.fit(tf)
print("done in %0.3fs." % (time() - t0))

print("\nTopics in LDA model:")
tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

Extracting tf features for LDA...
done in 0.049s.
Fitting LDA models with tf features
done in 1.194s.

Topics in LDA model:
Topic #0:
process activityintentbuilder rest builder spring main ormlite api internal core annotations handler bundlehelper view old res activitycompat support copyannotations rclass
Topic #1:
roboguice main otto helper testutils test ormlite myfragment holder enhancedclass core emptydependency bundlehelper event handler process internal activitywithextra examples sample
Topic #2:
eprovider myprovider ormlite roboguice myservice someinterface myfragment myreceiver inheritance trace internal appender view android event receiver plugin viewbyid intentbuilder fragment
Topic #3:
ormlite rest spring main handler myfragment android eintentservice activityinmanifest copyannotations trace afterviews inheritance eview afterinject app v4 view hierarchyviewer resources
Topic #4:
test core main ebean preference efragment menu receiver prefs instancestate keyevents eviewgroup 

In [13]:
def print_full(x):
    pd.set_option('display.max_cols')
    print(x)
    pd.reset_option('display.max_rows')

In [14]:
import pandas as pd
pd.set_option('display.max_colwidth', 1000)
df = pd.DataFrame(lda.components_)
df.columns = tf_feature_names
df
# print_full(df)


Unnamed: 0,activitycompat,activityinmanifest,activityintentbuilder,activitywithextra,afterextras,afterinject,afterviews,android,androidmanifestfindertest,annotations,...,someinterface,spring,support,test,testutils,trace,v4,v7,view,viewbyid
0,0.261846,0.240007,1.806215,0.219839,0.232441,0.240178,0.245613,0.235771,0.24069,0.389625,...,0.25124,0.84847,0.260991,0.232554,0.224273,0.217504,0.227508,0.226639,0.26467,0.216424
1,0.246896,0.219525,0.212075,0.441164,0.23876,0.219055,0.244354,0.222282,0.223902,0.247007,...,0.232189,0.224097,0.21492,2.125846,5.510001,0.231588,0.231956,0.250599,0.228295,0.235366
2,0.233039,0.232366,0.238085,0.221048,0.217371,0.229362,0.223704,0.248467,0.245813,0.218946,...,0.272913,0.202409,0.237732,0.241606,0.223616,0.25455,0.221167,0.245747,0.249886,0.247218
3,0.197753,0.257693,0.242355,0.233262,0.237294,0.250996,0.254188,0.277891,0.232871,0.219311,...,0.241014,0.844034,0.220673,0.241514,0.232608,0.255336,0.249908,0.219575,0.249873,0.232894
4,0.246986,0.229185,0.237331,0.240749,1.961787,4.315326,3.426373,0.337503,1.377393,0.238947,...,1.917317,5.900014,0.221018,568.992706,0.24597,1.981733,0.248038,0.216457,0.238304,3.451929
5,0.91081,0.224215,0.260467,0.221586,0.234302,0.233793,0.23413,4.961688,0.225311,0.245779,...,0.255839,0.227979,10.34254,6.842257,0.228563,0.248396,1.83882,2.657713,1.049785,0.246162
6,0.237346,0.255659,0.227974,0.221426,0.239411,0.231826,0.220413,0.240504,0.227999,0.245073,...,0.231399,0.219386,0.225146,0.264827,0.231984,0.217529,0.234646,0.232039,0.22997,0.237237
7,1.101276,0.430504,0.703011,0.245635,0.333568,0.287445,0.298047,0.276576,0.70927,0.7732,...,0.259398,0.2221,0.662897,8.572027,0.257022,0.417862,0.257094,0.253059,0.658729,0.245722
8,0.235435,1.642186,0.241568,1.462358,0.25928,0.225743,0.236353,0.206607,0.23878,0.231941,...,0.241547,0.451575,0.216785,0.40286,0.219842,0.242213,0.244876,0.262569,0.240587,0.239067
9,0.243874,0.219461,0.252387,0.261781,1.026292,0.435709,0.418942,0.234951,0.245842,113.083778,...,0.247471,268.579239,0.238867,65.081077,0.240459,0.940396,0.212025,0.242015,3.641126,0.308676


<h2>Get all .java files</h2>

In [5]:
import os

javafiles = [os.path.join(root, name)
             for root, dirs, files in os.walk(project_path)
             for name in files
             if name.endswith((".java"))]

javafiles[:5]

['/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations/examples/RoboGuiceExample/src/main/java/org/androidannotations/roboguiceexample/RoboGuiceExampleApplication.java',
 '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations/examples/RoboGuiceExample/src/main/java/org/androidannotations/roboguiceexample/GreetingService.java',
 '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations/examples/RoboGuiceExample/src/main/java/org/androidannotations/roboguiceexample/GreetingServiceToastImpl.java',
 '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations/examples/RoboGuiceExample/src/main/java/org/androidannotations/roboguiceexample/AstroModule.java',
 '/home/hshahin/workspaces/Spring2016_SE_Project/data/androidannotations/examples/RoboGuiceExample/src/main/java/org/androidannotations/roboguiceexample/SimpleActivity.java']