<h1>Globals</h1>

In [36]:
from __future__ import print_function
from datetime import datetime
from git import Repo, Git
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import time
import os.path
import re, string, ntpath, keyword, json, codecs
import threading
import shutil, errno

skip_projects = [ "meteor"]

tag_names = ["2016-06","2016-01","2015-06","2015-01","2014-06","2014-01","2013-06","2013-01",
"2012-06","2012-01","2011-06","2011-01","2010-06","2010-01","2009-06","2009-01",
"2008-06","2008-01","2007-06","2007-01","2006-06","2006-01","2005-06","2005-01",
"2004-06","2004-01","2003-06","2003-01"]

project_type_map = {
	"androidannotations-tags": ".java",
	"bigbluebutton-tags": ".java",
	"cassandra-tags": ".java",
	"elasticsearch-tags": ".java",
	"hibernate-orm-tags": ".java",
	"liferay-portal-tags": ".java",
	"netty-tags": ".java",
	"platform_frameworks_base-tags": ".java",
	"spring-framework-tags": ".java",
	"wildfly-tags": ".java",
	"laravel-tags": ".php",
	"symfony-tags": ".php",
	"cakephp-tags": ".php",
	"CodeIgniter-tags": ".php",
	"rails-tags": ".rb",
	"sinatra-tags": ".rb",
	"padrino-framework-tags": ".rb",
	"hanami-tags": ".rb",
	"pakyow-tags": ".rb",
	"flask-tags": ".py",
	"django-tags": ".py",
	"web2py-tags": ".py",
	"frappe-tags": ".py",
	"ninja-tags": ".java",
	"meteor-tags": "javascript",
	"express-tags": "javascript",
	"sails-tags": "javascript",
	"mean-tags": "javascript",
	"derby-tags": "javascript",
	"nodal-tags": "javascript"
}

def load_config(config_file):
    """
    Load projects configuration file.
    """
    with open(config_file) as data_file:    
        config_data = json.load(data_file)
    return config_data

def copy_folder(src, dst):
    try:
        if os.path.exists(dst):
            shutil.rmtree(dst)
        shutil.copytree(src, dst)
    except OSError as exc: # python >2.5
        if exc.errno == errno.ENOTDIR:
            shutil.copy(src, dst)
        else: raise
            
def get_immediate_subdirectories(a_dir):
    return [name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))]

base_dir = "/home/hshahin/workspaces/Spring2016_SE_Project"
data_dir = os.path.join(base_dir , "data")
config_file = "projects_config.json"
config_data = load_config(os.path.join(base_dir , config_file))

In [37]:
# path = '/home/hshahin/workspaces/Spring2016_SE_Project/data'
# get_immediate_subdirectories(path)

<h1>Creating tags functions</h1>

In [38]:
def get_date_time(epoch):
    '''
    convert epoch to date_time
    '''
    return time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(epoch))

def tag_exists(path, tag_name):
    repo = Repo(path)
    
    return True if tag_name in repo.tags else False

def get_epoch(year, month='01'):
    """
    calculate the epoch of first day of a year-month
    """
    pattern = '%Y.%m.%d %H:%M:%S'
    return int(time.mktime(time.strptime(str(year) + '.' + str(month) + '.01 00:00:00', pattern)))


def create_tags(path):
    '''
    takes repo path and creates tags for first commit in Jan and Jun. for every year
    # get the list of commits
    # get the latest commit date
    # current_year is the year from that date
    # loop through the list of commit to find the commit having a date equal or just after 1/1/current_year
    # once found create a tage with the current_year name on it AND
    # subtract 1 from the year and continue.

    '''
    repo = Repo(path)

    # get the list of commits
    commits = list(repo.iter_commits())

    # get the latest commit date, current_year is the year from that date
    current_year = datetime.fromtimestamp(commits[0].committed_date).year


    for idx, commit in enumerate(commits):
        # time.sleep(2)
        # print(commits[idx].hexsha)

        current_year_01 = str(current_year)+'-01'
        current_year_06 = str(current_year)+'-06'

        try:
            if get_epoch(current_year, '01') > commit.committed_date and \
                    int(time.time()) > get_epoch(current_year, '01')  and \
                    idx !=0:
                if str(current_year_01) not in repo.tags and idx != 0:
                    print(commits[idx-1].hexsha+' '+get_date_time(commits[idx-1].committed_date)+' '+current_year_01)
                    past = repo.create_tag(current_year_01, ref=commits[idx-1],
                                      message="This is a tag to mark the first commit in year %s" % current_year_01)
                current_year = datetime.fromtimestamp(commit.committed_date).year

            if get_epoch(current_year, '06') > commit.committed_date and \
                    int(time.time()) > get_epoch(current_year, '06') and \
                idx != 0:
                if str(current_year_06) not in repo.tags:
                    print(commits[idx-1].hexsha+' '+get_date_time(commits[idx-1].committed_date)+' '+current_year_06)
                    past = repo.create_tag(current_year_06, ref=commits[idx-1],
                                      message="This is a tag to mark the first commit in year %s" % current_year_06)
        except AttributeError:
            pass

def checkout_tag(path, tag_name):
    '''
    checks out a tag if it exists
    '''
    repo = Repo(path)
    git = Git(path)
    if tag_name in repo.tags:
        git.checkout(tag_name)

def delete_tags(path):
    '''
    remove all tags in a given repo
    '''

    repo = Repo(path)

    for tag in repo.tags:
        repo.delete_tag(tag)

<h1>Create tags every 6 months for each repo</h1>

In [39]:
# for project_name, project_type in config_data.items():
#     print("Processing project: " + project_name )
#     t0 = time.time()
#     delete_tags(os.path.join(data_dir, project_name))
#     create_tags(os.path.join(data_dir, project_name))
#     print("Project: " + project_name + " taged in %0.3fs." % (time.time() - t0))

<h1>Preprocesing functions</h1>

In [40]:
# Python keywords
python_keywords = keyword.kwlist

# Java keywords from https://docs.oracle.com/javase/tutorial/java/nutsandbolts/_keywords.html
java_keywords = ["abstract","continue","for","new","switch","assert","default","goto","package","synchronized",
                 "boolean","do","if","private","this","break","double","implements","protected","throw",
                 "byte","else","import","public","throws","case","enum","instanceof","return","transient","catch",
                 "extends","int","short","try","char","final","interface","static","void","class","finally","long",
                 "strictfp","volatile","const","float","native","super","while"]

# Ruby keywords from http://docs.ruby-lang.org/en/2.2.0/keywords_rdoc.html
ruby_keywords = ["__ENCODING__","__LINE__","__FILE__","BEGIN","END","alias","and","begin","break",
                 "case","class","def","defined?","do","else","elsif","end","ensure","false","for","if",
                 "in","module","next","nil","not","or","redo","rescue","retry","return","self","super",
                 "then","true","undef","unless","until","when","while","yield"]

# PHP keywords form http://php.net/manual/en/reserved.keywords.php
php_keywords = ["__halt_compiler","abstract","and","array","as","break","callable","case","catch","class","clone",
                "const","continue","declare","default","die","do","echo","else","elseif","empty","enddeclare",
                "endfor","endforeach","endif","endswitch","endwhile","eval","exit","extends","final","finally",
                "for","foreach","function","global","goto","if","implements","include","include_once","instanceof",
                "insteadof","interface","isset","list","namespace","new","or","print","private","protected",
                "public","require","require_once","return","static","switch","throw","trait","try","unset","use",
                "var","while","xor","yield"]

all_keywords = python_keywords + java_keywords + ruby_keywords + php_keywords


def path_leaf(path):
    head, tail = ntpath.split(path)
    return head, tail

# split camel case tokens
_underscorer1 = re.compile(r'(.)([A-Z][a-z]+)')
_underscorer2 = re.compile('([a-z0-9])([A-Z])')


def camel_to_spaces(s):
    """
    convert camel case into spaces seperated
    """
    subbed = _underscorer1.sub(r'\1 \2', s)
    return _underscorer2.sub(r'\1 \2', subbed).lower()

def snake_to_spaces(snake_cased_str):
    """
    convert snake case into spaces seperated
    """
    separator = "_"
    components = snake_cased_str.split(separator)
    if components[0] == "":
        components = components[1:]
    if components[-1] == "":
        components = components[:-1]
    if len(components) > 1:
        spaced_str = components[0].lower()
        for x in components[1:]:
            spaced_str += " " + x.lower()
    else:
        spaced_str = components[0]
    return spaced_str


def file_preprocessing(input_file, output_file):
    """
    - replace punctuations with spaces
    - stemming
    - camel to spaces and snake to spaces
    - remove language spesific keywords
    - write the entire project snapshot into one file under project root folder
    """
    # print("processing file " + input_file)
    # replace the punctuations with space
    replace_punctuation = str.maketrans(string.punctuation, ' '*len(string.punctuation))
    # stemming
    stemmer = PorterStemmer()

    with open(input_file, 'r', encoding='utf-8', errors='replace') as inFile, open(output_file,'w') as outFile:
        for line in inFile:
            # replace punctuations
            # convert camel case into space separated
            # convert snake case into space separated             
            # remove language keywords
            line_witout_puncs = ' '.join([snake_to_spaces(camel_to_spaces(word)) 
                                          for word in line.translate(replace_punctuation).split()
                                          if len(word) >=4 and word not in stopwords.words('english') 
                                          and word not in all_keywords])

            # stemming
            singles = []
            for plural in line_witout_puncs.split():
                try:
                    singles.append(stemmer.stem(plural))
                except UnicodeDecodeError:
                    print(plural) 

            line_stemmed = ' '.join(singles)
            print(line_stemmed, file=outFile)


def return_file_type(project_path, file_type):
    project_files = [os.path.join(root, name)
             for root, dirs, files in os.walk(project_path)
             for name in files
             if name.endswith((file_type))]
    return project_files

def project_preprocessing(project_path, file_type, tag_name):
    # print ("processing project "+ project_path)
    # process project source code files and save each file as .*.proc 
    project_files = return_file_type (project_path, file_type)
    for source_file in project_files:
        head, tail = path_leaf(source_file)
        proc_file = os.path.join(head , tail + '.proc')
        file_preprocessing(source_file, proc_file)
        
    # concatenate all processed project files into one file under root directory
    project_proc_files = return_file_type (project_path, file_type + '.proc')
    with open(os.path.join(project_path , "final-processed.out"), 'w') as outfile:
        for fname in project_proc_files:
            with open(fname) as infile:
                for line in infile:
                    outfile.write(line)

<h1>checkout tags in separate folders</h1>

In [41]:
# # create folder project_tags
# # for each tag if tag exists
# # copy the project into project_tag/tag_name
# # checkout ptoject to tag_name
# # delete .git folder
# def checkout_projects():
#     for project_name, project_type in config_data.items():
#         project_path = os.path.join(data_dir, project_name)
#         project_tags_path = project_path + '-tags'
        
#         if project_name not in skip_projects:
#             if not os.path.exists(project_tags_path):
#                 os.makedirs(project_tags_path)

#             repo = Repo(project_path)
#             for tag_name in tag_names:
#                 if tag_exists(project_path, tag_name):
#                     print("Copying "+project_name+' '+tag_name)
#                     current_tag_path = os.path.join(project_tags_path, tag_name)
#                     copy_folder(project_path, current_tag_path)

#             for tag_name in tag_names:
#                 if tag_exists(project_path, tag_name):
#                     print("Checkout "+project_name+' '+tag_name)
#                     current_tag_path = os.path.join(project_tags_path, tag_name)
#                     checkout_tag(current_tag_path, tag_name)

#             for tag_name in tag_names:
#                 if tag_exists(project_path, tag_name):
#                     print("deleting .git "+project_name+' '+tag_name)
#                     current_tag_path = os.path.join(project_tags_path, tag_name)
#                     os.chdir(current_tag_path)
#                     shutil.rmtree(os.path.join(current_tag_path, '.git'))

# checkout_projects()

<h1>Run preprocessing</h1>

In [42]:
# from multiprocessing import Pool

# def run_preprocessing(project_tags_dir):
#     project_tags_path = os.path.join(data_dir , project_tags_dir)
#     print('---------------'+project_tags_path)

#     for project_tag in get_immediate_subdirectories(project_tags_path):
#         project_tag_path = os.path.join(project_tags_path , project_tag)
#         t0 = time.time()
#         project_preprocessing(project_tag_path, project_type_map[project_tags_dir], project_tag)
#         print("processing project: " + project_tags_path + "\t tag " 
#               + project_tag + " done in %0.3fs." % (time.time() - t0))
#     print('****This thread is done:', os.getpid())
     

# project_tags_paths = get_immediate_subdirectories(data_dir)
# pool = Pool(16)
# pool.map(run_preprocessing, project_tags_paths)
   
# # for project_tags_path in get_immediate_subdirectories(data_dir):
# #     preprocessing_threads = threading.Thread(target=run_preprocessing, args=(project_tags_path, ))
# #     preprocessing_threads.start()
    
# print('Main process Done...............')

<h1>Topic Modeling</h1>

In [175]:
from __future__ import print_function
import os
from time import time
from os import listdir
from os.path import isdir

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from sklearn.datasets import fetch_20newsgroups
import pandas as pd
import numpy as np
import pickle

import lda

# pd.set_option('display.mpl_style', 'default') 
pd.set_option('display.width', 5000) 
pd.set_option('display.max_columns', 60) 
import matplotlib.pyplot as plt


def print_top_words(model, feature_names, n_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print("Topic #%d:" % topic_idx)
        print(" ".join([feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]))
    print()

def get_top_words(model, feature_names, n_top_words):
    df = pd.DataFrame(columns=['word'+str(i) for i in range(n_top_words)])
    df_freq = pd.DataFrame(columns=['word'+str(i) for i in range(n_top_words)])
    for topic_idx, topic in enumerate(model.components_):
        df.loc['topic#'+str(topic_idx)] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
        df_freq.loc['topic#'+str(topic_idx)] = [topic[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    
    return df, df_freq


# def get_top_doc_topic(projects_topics, project_names, n_top_words):
#     df = pd.DataFrame(columns=['word'+str(i) for i in range(n_top_words)])
#     for topic_idx, topic in enumerate(model.components_):
#         df.loc['topic#'+str(topic_idx)] = [feature_names[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
#         df.loc['freq#'+str(topic_idx)] = [topic[i] for i in topic.argsort()[:-n_top_words - 1:-1]]
    
#     return df

def print_full(x):
    pd.set_option('display.max_colwidth', 1000)
    print(x)
    pd.reset_option('display.max_rows')

# -------------------------------------------------------------------

def run_lda_sklearn(X):
        lda = LatentDirichletAllocation(n_topics=n_topics, 
                                    max_iter=400,
                                    learning_method='online', 
                                    learning_offset=50.,
                                    random_state=0, 
                                    n_jobs=10)
               
        projects_topics = lda.fit_transform(X)
                        
        return projects_topics, lda

# -------------------------------------------------------------------

def run_lda_other(X):
    lda = lda.LDA(n_topics=n_topics, n_iter=500, random_state=1)        
    lda.fit_transform(X)
   
    return lda.doc_topic_, lda

# -------------------------------------------------------------------


n_features = 10000
n_topics = 10
# n_top_words = 100

max_df = 0.7
min_df = 0.2
lang = 'all_'


for max_df in [0.7]:#[0.7, 0.6, 0.5]:
    for min_df in [.4]:#[0.1, 0.2, 0.3]:
        suffix = lang+str(n_topics)+'_'+str(max_df)+'_'+str(min_df)
        print('------suffix:', suffix)

#         config_file = "projects_config_webFrameworks.json"
#         config_file = 'projects_config_topGithub.json'
        config_file = "projects_config.json"
        selected_projects = load_config(os.path.join(base_dir , config_file))


        # Read projects into strings
        projects_data = []
        project_names = []
        for i, project_name in enumerate(selected_projects.keys()):
#             if selected_projects[project_name] != 'python':
#                 continue
                
            # For each snapshot of the project
            snapshots = []
            project_path = os.path.join(data_dir, project_name+'-tags')
            try:
                snapshots = [os.path.join(project_path, p) 
                             for p in listdir(project_path) if isdir(os.path.join(project_path, p))]
            except FileNotFoundError:
                 print('------Project Not found: '+project_name)   
            for snapshot in snapshots:
                project_names.append('_'.join(snapshot.split('/')[-2:]))
                # print(project_names[-1])
                processed_path = os.path.join(snapshot, "final-processed.out")
                with open(processed_path, 'r') as myfile:
                    projects_data.append(myfile.read().replace('\n', ' '))


        print('num of projects: ', len(projects_data))

        # Use tf (raw term count) features for LDA.
        print("Extracting tf features for LDA...")
        count_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df, max_features=n_features, stop_words='english')
        tfidf_vectorizer = TfidfTransformer()
    
        t0 = time()
        counts = count_vectorizer.fit_transform(projects_data)
#         tfidf = tfidf_vectorizer.fit_transform(counts)
        X = counts   # <---------
        tf_feature_names = count_vectorizer.get_feature_names()
        # tf_feature_names = tfidf_vectorizer.get_feature_names()
        print("done in %0.3fs." % (time() - t0))
        print('X shape:', X.shape)

        
        print("Fitting LDA models with tf features")
               
        t0 = time()       
#         projects_topics, lda = run_lda_other(X) # counts 
        projects_topics, lda = run_lda_sklearn(X) # counts 
        topic_word, topic_word_freq = get_top_words(lda, tf_feature_names, 50)
        print("done in %0.3fs." % (time() - t0))
                
        # -------------------------------------------------------------------
        # Save lda into a pickle file        
#         pickle.dump(lda, open(os.path.join(base_dir, 'results/lda_'+suffix+'.p'), 'wb'))
     
        topic_word.to_csv(os.path.join(base_dir, 'results/topic_word_'+suffix+'.csv'))
        topic_word_freq.to_csv(os.path.join(base_dir, 'results/topic_word_freq_'+suffix+'.csv'))

        projects_topics = pd.DataFrame(projects_topics, columns=['topic'+str(i) for i in range(n_topics)]) 
        projects_topics['project'] = project_names
        projects_topics['project'] = projects_topics['project'].apply(lambda x: x.split('_')[0].split('-')[0])
        projects_topics['date'] = project_names
        projects_topics['date'] = projects_topics['date'].apply(lambda x: x.split('_')[1])
        projects_topics.index = project_names
        projects_topics.to_csv(os.path.join(base_dir, 'results/project-topic_'+suffix+'.csv'))

# lda = pickle.load(open("lda_5_1.p", "rb"))


------suffix: all_10_0.7_0.4
------Project Not found: padrino-framework
------Project Not found: liferay-portal
------Project Not found: platform_frameworks_base
------Project Not found: hibernate-orm
num of projects:  309
Extracting tf features for LDA...
done in 257.493s.
X shape: (309, 1407)
Fitting LDA models with tf features
done in 728.594s.


In [172]:
# lda.components_


'/home/hshahin/workspaces/Spring2016_SE_Project'

## 1- Topic-word

In [176]:
# print("\nTopics-words in LDA model:")

topic_word

Unnamed: 0,word0,word1,word2,word3,word4,word5,word6,word7,word8,word9,word10,word11,word12,word13,word14,word15,word16,word17,word18,word19,word20,word21,word22,word23,word24,word25,word26,word27,word28,word29,word30,word31,word32,word33,word34,word35,word36,word37,word38,word39,word40,word41,word42,word43,word44,word45,word46,word47,word48,word49
topic#0,datetim,articl,python,middlewar,book,func,backend,foreign,decim,tupl,opt,verbos,inlin,person,migrat,signal,lazi,aggreg,2006,month,citi,dictionari,feed,categori,conf,keyword,upload,recent,transform,timezon,column,layer,primari,digit,ticket,annot,factori,distanc,router,defer,fixtur,stdout,editor,perm,pagin,oracl,builtin,publish,bit,num
topic#1,bean,factori,annot,web,basi,unless,complianc,agre,govern,impli,warranti,listen,sql,2002,persist,invoc,synchron,editor,abstract,statement,illeg,processor,ha,writer,schedul,descriptor,aspect,awar,executor,deleg,new,matcher,metadata,destin,convers,uri,row,qualifi,rollback,accessor,export,async,concurr,thrown,callabl,column,person,suppress,transform,creator
topic#2,mapper,aggreg,transport,hit,stat,factori,snapshot,analyz,listen,new,alloc,int,score,impli,basi,unless,agre,warranti,govern,complianc,contributor,regard,agreement,matcher,long,num,max,suggest,pool,master,repositori,illeg,plugin,analysi,metadata,pipelin,primari,ha,channel,min,highlight,ref,distanc,bulk,abstract,segment,processor,concurr,writer,milli
topic#3,deploy,contributor,warranti,foundat,factori,bean,home,archiv,impli,fit,merchant,publish,redistribut,hope,floor,fifth,persist,annot,web,pool,writer,listen,registr,connector,timer,role,transform,abstract,queue,impl,ha,marshal,processor,endpoint,invoc,2011,illeg,metadata,spec,executor,batch,bundl,channel,phase,concurr,new,namespac,prop,descriptor,packag
topic#4,listen,room,meet,button,warranti,audio,big,blue,payload,codec,foundat,publish,contact,impli,ha,dialog,fit,merchant,factori,hope,redistribut,video,profil,conn,vector,descriptor,channel,new,screen,num,conf,2010,agent,gain,consum,sender,poll,fragment,int,intent,resp,bit,transport,annot,subscrib,pre,broadcast,scale,pipe,max
topic#5,column,person,topic,migrat,zone,month,fixtur,david,driver,bool,klass,categori,2005,primari,compani,peopl,rubi,row,conn,statement,segment,upload,proc,asset,plugin,accessor,flash,repli,team,guid,font,datetim,foreign,minut,2008,newlin,dirnam,precis,sql,sanit,2006,profil,commit,plural,day,alter,mysql,human,singular,var
topic#6,bundl,street,transform,kernel,factori,listen,metadata,formatt,fixtur,profil,ha,children,violat,foundat,role,finder,gmail,matcher,tran,bool,namespac,strategi,foo,selector,foobar,mask,question,verbos,var,abstract,scalar,timezon,prop,button,annot,bar,stub,max,sibl,collector,column,grant,accessor,offic,std,tester,uri,closur,upload,alias
topic#7,articl,plugin,2007,plural,foundat,fixtur,router,doctyp,column,shell,publish,appl,ha,2006,categori,2005,primari,book,var,redistribut,capit,pagin,contact,lib,foreign,db,singular,conn,datetim,network,month,theme,short,reg,tab,2012,bool,latin,2008,startup,exp,statement,children,driver,role,ajax,flash,diff,asset,day
topic#8,column,row,metadata,compact,slice,struct,endpoint,cell,mutat,factori,new,long,descriptor,inet,abstract,statement,uuid,unless,impli,agreement,agre,warranti,contributor,regard,basi,complianc,govern,foundat,ha,int,strategi,max,def,primari,writer,transport,hint,concurr,commit,interv,tupl,ks,alloc,disk,digest,super,predic,deseri,segment,executor
topic#9,channel,codec,new,factori,max,ssl,listen,pipelin,alloc,basi,warranti,impli,unless,agre,int,govern,complianc,chunk,executor,queue,bootstrap,writer,web,concurr,long,inet,pool,illeg,abstract,short,writabl,payload,ha,pend,bit,flow,shutdown,leak,2012,unsaf,zlib,sock,certif,composit,ref,num,sync,cert,transfer,unsign


In [177]:
topic_word_freq


Unnamed: 0,word0,word1,word2,word3,word4,word5,word6,word7,word8,word9,word10,word11,word12,word13,word14,word15,word16,word17,word18,word19,word20,word21,word22,word23,word24,word25,word26,word27,word28,word29,word30,word31,word32,word33,word34,word35,word36,word37,word38,word39,word40,word41,word42,word43,word44,word45,word46,word47,word48,word49
topic#0,36080.261612,31617.629884,22231.891064,20857.97082,20671.401381,15814.656419,14762.360049,12682.592104,11975.534778,11688.983739,11495.561211,11103.071948,10684.036653,9516.074908,9340.795052,8326.250924,6954.169292,6494.653073,6297.350565,6186.865259,5879.516377,5868.443408,5799.715001,5666.137025,5422.979876,5253.586016,5107.572222,4922.887748,4766.716005,4711.247517,4641.62261,4585.631269,4561.171311,4533.992421,4529.13826,4421.656243,4343.756818,4276.117478,3921.718331,3840.269016,3733.099845,3718.576684,3685.320181,3645.37247,3612.949746,3578.174806,3525.61476,3454.772238,3402.973345,3313.831438
topic#1,804172.312806,343862.741749,212875.833881,74433.98692,67534.643222,67384.95198,67316.576441,67246.575151,67217.953487,64690.341044,62611.678958,60270.046751,58577.195352,54474.676155,48826.395798,47586.855559,44891.649443,43201.712234,42635.884001,42193.031968,40186.816415,37890.708771,34914.108574,34641.345836,33293.088371,33007.009372,31244.135578,28816.630951,28810.098143,27716.100944,26814.344138,26315.405604,26282.720054,25969.89866,23754.044196,21664.191838,21611.692083,21246.528198,20910.434237,20060.066563,19564.810546,19352.826682,19352.546594,19314.52798,18976.428997,18533.762951,18528.75437,17804.208001,17390.710963,16657.452906
topic#2,179856.584391,171612.634424,114616.08476,114441.62482,110268.08486,93120.616138,90839.25086,83489.734762,81940.06143,78563.127085,77224.739523,76664.92344,67857.381659,67773.094088,67406.133603,67216.807022,67101.851174,67003.88586,66979.187634,66973.18834,64883.18704,64268.872185,63980.988381,60545.612559,60319.670041,54334.945233,52184.673808,50376.554796,49031.217701,46876.120682,46025.034673,42855.552518,41247.315885,40747.551792,39824.353088,39334.786816,39272.213483,36513.947835,35665.261489,33905.94517,33737.467165,31996.931162,31542.716823,30648.265116,30212.003949,29372.415295,28740.126663,27840.654395,26175.509348,26171.485017
topic#3,357419.305111,178391.270089,178132.576505,173530.99857,164673.719727,155340.249352,118111.522242,94022.470994,91280.89604,88638.619427,88627.343853,88047.187369,87500.214427,87286.904125,87082.575135,86805.222998,77445.930519,76631.338274,70650.288921,56102.339213,55976.598907,54299.130609,51151.167737,50147.583501,50015.529733,48793.251201,48232.584063,39657.863384,38659.63985,37816.375594,37434.735426,36899.445727,35281.529168,34734.730747,34568.32688,33708.600133,33526.795715,32656.904258,32402.346818,31411.901391,30915.790869,29534.329761,29485.814846,29340.510001,29029.002882,28629.6604,27953.436012,27702.10261,27487.895771,26752.32581
topic#4,90606.933966,54273.491823,52554.545198,49258.850929,47937.242679,46750.024001,45282.220165,44585.109605,42083.311412,35866.884669,29395.377996,27662.009864,26479.390931,25953.457629,23804.512417,22071.339371,20523.834177,20459.453731,20456.344639,19992.111183,19915.73992,19672.444929,18748.186237,17631.257619,16849.791889,16830.632886,15372.994877,15234.132917,14721.920324,14241.293757,14161.470933,14136.476475,13459.987434,12944.383297,12055.330108,11740.287869,11738.026189,11650.339767,11455.13897,11437.736991,10994.269531,10774.50995,10083.776393,9959.157056,9844.656814,9498.042273,9230.850775,8943.24576,8879.91611,8445.180867
topic#5,66437.646467,43423.616616,43132.934642,26417.844693,22638.441409,21756.542218,21672.926822,19501.03453,19313.21081,19212.996647,17663.3465,17407.285205,17206.323186,14829.346857,14813.699225,14730.831985,14125.985814,13289.095694,13228.012917,12226.642636,11537.47101,11354.858306,10507.307358,10295.067117,9559.793957,9463.071019,9431.756113,9393.878927,9368.331919,9324.909817,9321.698789,9220.666875,8780.713493,8277.007393,7971.297014,7888.413713,7878.02362,7834.575001,7679.968404,7648.259443,7637.708819,7637.586278,7556.603376,7553.114464,7481.736,7456.288442,7209.663547,7101.509205,6934.743692,6901.143263
topic#6,54650.044937,47896.138803,44923.811515,42501.379135,40138.109818,35315.842525,29356.006123,28494.793918,23122.033097,19837.7765,19677.333801,19257.154554,18561.383397,16426.663385,16249.877271,14572.391997,14341.991203,13894.937348,13335.422349,13085.218022,12833.460057,12574.667873,10959.956218,9716.214577,9226.206922,8996.676039,8952.410408,8775.804209,8756.222855,8673.890702,8513.611536,8295.777335,8166.179535,8033.915885,7900.600217,7667.331182,7534.137742,7453.565437,7379.41205,7281.332071,7238.516634,7183.464317,7118.664971,6932.241087,6922.282164,6919.186169,6835.670999,6300.815733,6077.911524,5928.68885
topic#7,46240.506087,40260.655693,24205.198786,23072.217368,16855.793283,16745.716256,16612.288927,15785.761223,12420.941394,12018.704418,11771.721747,11553.71822,10876.091947,10413.956686,9887.695009,9790.375156,9318.396371,9199.133349,8732.468044,8728.740289,8207.640921,8070.15822,7522.670331,6994.859425,6959.109263,6793.055664,6193.271563,5330.842446,4923.732995,4646.530879,4632.618382,4624.444271,4573.030972,4416.530874,4255.889107,4213.223808,4175.440273,4082.749036,3931.802435,3901.736052,3804.211643,3775.003093,3667.003796,3561.100976,3465.774331,3435.262763,3425.09757,3400.328765,3086.638787,3082.187927
topic#8,286445.898346,125393.971709,65849.918455,53456.61554,52505.806684,44621.768672,44197.03885,43362.958909,37876.026313,36793.042801,34226.585118,31419.59835,29276.042989,27178.897724,26985.461564,25868.805593,25608.389775,25159.72725,24920.032282,24667.097497,24630.136532,24609.26471,24607.545075,24606.560956,24604.276141,24572.362851,24567.178174,24563.186206,22745.548087,22043.6721,21980.424548,21957.127769,21772.808206,21530.358534,19918.625917,18483.490815,18371.313917,17898.406486,17734.091555,16889.775227,15976.874337,14940.68103,14770.561683,14480.495329,14258.654015,13994.53041,13656.341268,13399.748634,12967.660994,12708.096186
topic#9,364065.612955,48729.594133,40371.218378,39838.190884,32950.081986,32168.340396,31979.597006,30788.975433,29953.183671,29713.340567,29326.848362,28992.729255,28170.956468,28119.603169,28100.159528,28095.531835,28085.997726,27210.865111,25771.819599,25525.226367,24570.035456,21758.486849,19797.638502,18584.300651,17789.668679,17687.639653,16846.378626,15354.062777,13411.376938,12909.311859,11833.10206,11719.591523,11513.387821,10943.686869,10859.117364,10670.52837,10401.407179,9732.887581,9724.507002,9480.733239,9025.439973,8936.082643,8883.930724,8658.116996,8337.15334,8191.207519,8150.434826,8115.118566,7997.201181,7923.496331


# 2- Project-topic

In [168]:
projects_topics.head(15)

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,project,date
frappe-tags_2014-06,0.1,0.10002,631.871751,0.100018,0.1,0.1,0.10002,2650.328142,0.10002,0.10003,frappe,2014-06
frappe-tags_2012-01,0.1,0.100016,0.100022,0.100017,0.1,0.1,0.100018,1694.099888,0.10002,0.100019,frappe,2012-01
frappe-tags_2016-01,0.1,0.10001,6839.099901,0.100013,0.1,0.1,0.100011,0.100042,0.100011,0.100012,frappe,2016-01
frappe-tags_2013-06,0.1,0.100002,0.100017,0.100006,0.1,0.1,0.100004,3253.099962,0.100003,0.100006,frappe,2013-06
frappe-tags_2014-01,0.1,0.100004,0.100036,0.100007,0.1,0.1,0.100006,2889.099936,0.100005,0.100006,frappe,2014-01
frappe-tags_2012-06,0.1,0.1,0.100017,0.100003,0.1,0.1,0.100002,1707.099971,0.100002,0.100005,frappe,2012-06
frappe-tags_2015-01,0.1,0.100007,5599.721569,0.100008,0.1,0.1,0.100009,303.478389,0.100009,0.100008,frappe,2015-01
frappe-tags_2015-06,0.1,0.100006,6334.099927,0.100008,0.1,0.1,0.100008,0.100035,0.100007,0.100009,frappe,2015-06
frappe-tags_2013-01,0.1,0.100001,0.100015,0.100003,0.1,0.1,0.100002,2989.099974,0.100002,0.100004,frappe,2013-01
web2py-tags_2014-06,0.1,0.100023,0.100024,35839.099852,0.1,0.1,0.100027,0.100028,0.100027,0.100019,web2py,2014-06


In [169]:
df = projects_topics.copy()

for i in df.index:
    df.ix[i, df.columns[0:10]] = df.ix[i, df.columns[0:10]]/ df.ix[i, df.columns[0:10]].max()
# df
# print(i)
# print(df.ix[0, df.columns[0:10]])
# print(df.ix[0, df.columns[0:10]].max())
df

Unnamed: 0,topic0,topic1,topic2,topic3,topic4,topic5,topic6,topic7,topic8,topic9,project,date
frappe-tags_2014-06,3.8e-05,3.8e-05,0.238413,3.8e-05,3.8e-05,3.8e-05,3.8e-05,1.0,3.8e-05,3.8e-05,frappe,2014-06
frappe-tags_2012-01,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,1.0,5.9e-05,5.9e-05,frappe,2012-01
frappe-tags_2016-01,1.5e-05,1.5e-05,1.0,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,1.5e-05,frappe,2016-01
frappe-tags_2013-06,3.1e-05,3.1e-05,3.1e-05,3.1e-05,3.1e-05,3.1e-05,3.1e-05,1.0,3.1e-05,3.1e-05,frappe,2013-06
frappe-tags_2014-01,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,3.5e-05,1.0,3.5e-05,3.5e-05,frappe,2014-01
frappe-tags_2012-06,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,5.9e-05,1.0,5.9e-05,5.9e-05,frappe,2012-06
frappe-tags_2015-01,1.8e-05,1.8e-05,1.0,1.8e-05,1.8e-05,1.8e-05,1.8e-05,0.054195,1.8e-05,1.8e-05,frappe,2015-01
frappe-tags_2015-06,1.6e-05,1.6e-05,1.0,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,1.6e-05,frappe,2015-06
frappe-tags_2013-01,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,3.3e-05,1.0,3.3e-05,3.3e-05,frappe,2013-01
web2py-tags_2014-06,3e-06,3e-06,3e-06,1.0,3e-06,3e-06,3e-06,3e-06,3e-06,3e-06,web2py,2014-06


# LDA

In [144]:
import numpy as np
import lda
import lda.datasets

X = lda.datasets.load_reuters()
vocab = lda.datasets.load_reuters_vocab()
titles = lda.datasets.load_reuters_titles()
X.shape

model = lda.LDA(n_topics=20, n_iter=500, random_state=1)
model.fit(X)
topic_word = model.topic_word_  # model.components_ also works
n_top_words = 8
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-n_top_words:-1]
    print('Topic {}: {}'.format(i, ' '.join(topic_words)))

Topic 0: government british minister west group letters party
Topic 1: church first during people political country ceremony
Topic 2: elvis king wright fans presley concert life
Topic 3: yeltsin russian russia president kremlin michael romania
Topic 4: pope vatican paul surgery pontiff john hospital
Topic 5: family police miami versace cunanan funeral home
Topic 6: south simpson born york white north african
Topic 7: order church mother successor since election religious
Topic 8: charles prince diana royal queen king parker
Topic 9: film france french against actor paris bardot
Topic 10: germany german war nazi christian letter book
Topic 11: east prize peace timor quebec belo indonesia
Topic 12: n't told life people church show very
Topic 13: years world time year last say three
Topic 14: mother teresa heart charity calcutta missionaries sister
Topic 15: city salonika exhibition buddhist byzantine vietnam swiss
Topic 16: music first people tour including off opera
Topic 17: church cat

In [148]:
doc_topic = model.doc_topic_
for i in range(10):
    print("{} (top topic: {})".format(titles[i], doc_topic[i].argmax()))
X

0 UK: Prince Charles spearheads British royal revolution. LONDON 1996-08-20 (top topic: 8)
1 GERMANY: Historic Dresden church rising from WW2 ashes. DRESDEN, Germany 1996-08-21 (top topic: 1)
2 INDIA: Mother Teresa's condition said still unstable. CALCUTTA 1996-08-23 (top topic: 14)
3 UK: Palace warns British weekly over Charles pictures. LONDON 1996-08-25 (top topic: 8)
4 INDIA: Mother Teresa, slightly stronger, blesses nuns. CALCUTTA 1996-08-25 (top topic: 14)
5 INDIA: Mother Teresa's condition unchanged, thousands pray. CALCUTTA 1996-08-25 (top topic: 14)
6 INDIA: Mother Teresa shows signs of strength, blesses nuns. CALCUTTA 1996-08-26 (top topic: 14)
7 INDIA: Mother Teresa's condition improves, many pray. CALCUTTA, India 1996-08-25 (top topic: 14)
8 INDIA: Mother Teresa improves, nuns pray for "miracle". CALCUTTA 1996-08-26 (top topic: 14)
9 UK: Charles under fire over prospect of Queen Camilla. LONDON 1996-08-26 (top topic: 8)


array([[1, 0, 1, ..., 0, 0, 0],
       [7, 0, 2, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ..., 
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0],
       [1, 0, 1, ..., 0, 0, 0]], dtype=int32)