In [None]:
import os, time, sys
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import psutil
import warnings
import pickle, datetime
import collections
import re, hashlib
warnings.filterwarnings('ignore')
from google.cloud import storage
import matplotlib.pyplot as plt
import concurrent.futures
from itertools import repeat
import numpy as np
import xlsxwriter
from sklearn.model_selection import KFold
import gc
import inflect
import json
import gcsfs
%matplotlib inline

In [None]:
from PyDictionary import PyDictionary

def isPersonNoun(x, dictionary):
    """Grade how strongly the noun definitions of ``x`` describe a person.

    Args:
        x: Word to look up.
        dictionary: Object exposing ``meaning(word)``; the result is used as a
            mapping from part of speech to a list of definition strings.

    Returns:
        -1 when there is no lookup result or no ``'Noun'`` entry;
        1 when a definition starts with one of the person phrases;
        2 when a definition starts with the word "a" and its 2nd or 3rd word
          contains "person";
        3 when "person" appears anywhere in a definition and no level was set
          by an earlier definition;
        0 otherwise.
    """
    meanings = dictionary.meaning(x)
    if not meanings or 'Noun' not in meanings:
        return -1

    person_prefixes = ('a human being', 'a person', 'someone', 'somebody',
                       'person', 'an individual', 'a worker')
    level = 0
    for raw_definition in meanings['Noun']:
        definition = raw_definition.lower()

        # A definition that opens with a person phrase is decisive.
        if definition.startswith(person_prefixes):
            return 1

        tokens = definition.split(' ')
        if tokens[0] == 'a' and any('person' in tok for tok in tokens[1:3]):
            level = 2

        # The weakest signal only applies while nothing has been detected yet.
        if level == 0 and 'person' in definition:
            level = 3

    return level
                
    
def isPersonNounClass(x, person_nouns, non_person_nouns, cache, dictionary):
    """Classify a class name by the person-likeness of the words it contains.

    The name is split with ``splitUpper``; words shorter than three characters
    are ignored.

    Args:
        x: Class name.
        person_nouns: Collection of words treated as definite person nouns.
        non_person_nouns: Collection of words that skip the dictionary lookup.
        cache: Dict of word -> level; updated in place with every new lookup.
        dictionary: Passed through to ``isPersonNoun``.

    Returns:
        ``[level, noun]`` where ``noun`` is the word that produced the level
        (empty string when no word did). ``level`` is 1, 2, 3, 0 or -1.
    """
    candidates = [word for word in splitUpper(x) if len(word) >= 3]

    # First pass: any known person noun wins immediately.
    for word in candidates:
        if word in person_nouns:
            return [1, word]

    best_level = -1
    best_noun = ''
    for word in candidates:
        if word in non_person_nouns:
            best_level = max(0, best_level)
            continue

        if word not in cache:
            cache[word] = isPersonNoun(word, dictionary)
        word_level = cache[word]

        if word_level == 1:
            return [1, word]
        if word_level == 2:
            best_level, best_noun = 2, word
        elif word_level == 3 and best_level != 2:
            best_level, best_noun = 3, word

    return [best_level, best_noun]



def isPersonFields(data, person_pii_kw_list):
    """Find field names whose word set exactly equals a known PII keyword set.

    Args:
        data: Iterable of field-name tokens.
        person_pii_kw_list: List of keyword sets to compare against.

    Returns:
        ``[level, names]`` where ``level`` is 1 if at least one field matched
        (0 otherwise) and ``names`` is the matching fields joined with ', '.
    """
    matched_fields = []
    for field_name in data:
        field_words = set(splitUpper(field_name))
        # Empty word sets are skipped; otherwise membership is by set equality.
        if len(field_words) and field_words in person_pii_kw_list:
            matched_fields.append(field_name)

    return [1 if matched_fields else 0, ', '.join(matched_fields)]
            
        
def loadPersonPiiDetectionModel(person_pii_filename, top_nouns_filename, cache_filename):
    """Load the keyword sets, noun lists and lookup cache used for person-PII detection.

    Args:
        person_pii_filename: CSV with columns ``PII``, ``Weights``, ``Coupled``
            (comma-separated words) and ``Standalone``.
        top_nouns_filename: CSV with columns ``Word``, ``PersonDetectionLevel``
            and ``Dual``.
        cache_filename: CSV with columns ``Word`` and ``Level``. A missing or
            unreadable file yields an empty cache.

    Returns:
        Tuple ``(person_pii_kw_list, person_nouns, non_person_nouns, cache)``:
        a list of keyword sets, two sets of words, and a dict word -> level.
    """
    df_person_pii_kw = pd.read_csv(person_pii_filename)
    df_person_pii_kw.fillna('', inplace=True)
    person_pii_kw = {}
    for _, row in df_person_pii_kw.iterrows():
        coupled = [x.lower().strip('"').strip() for x in row['Coupled'].split(',')]
        if coupled[0] == '':
            coupled = []
        # Rows with no weight and no coupled words carry no information.
        # (Checking `coupled[0]` here would raise IndexError on the empty list.)
        if row['Weights'] == 0 and not coupled:
            continue
        person_pii_kw[row['PII']] = {
            'Weights': row['Weights'],
            'Coupled': coupled,
            'Standalone': row['Standalone'],
        }

    person_pii_kw_list = []
    for keyword, spec in person_pii_kw.items():
        if spec['Standalone'] > 1:
            person_pii_kw_list.append(set([keyword]))
        for partner in spec['Coupled']:
            person_pii_kw_list.append(set([keyword, partner.lower()]))

    top_nouns = pd.read_csv(top_nouns_filename)
    is_person = top_nouns['PersonDetectionLevel'] == 1
    is_dual = top_nouns['Dual'] == 1
    person_nouns = set(top_nouns[is_person & (top_nouns['Dual'] == 0)]['Word'])
    non_person_nouns = set(top_nouns[~is_person | is_dual]['Word'])

    # The cache is best-effort: start empty and only replace it once the file
    # has been fully parsed, so a partial failure never leaks a DataFrame.
    cache = {}
    try:
        cached = pd.read_csv(cache_filename)[['Word', 'Level']]
        cache = cached.set_index('Word')['Level'].to_dict()
    except FileNotFoundError:
        pass  # no cache file yet
    except Exception as exc:
        print('Warning - could not load cache file', cache_filename, ':', exc)

    return person_pii_kw_list, person_nouns, non_person_nouns, cache

def personPiiDetection(df, person_pii_filename, top_nouns_filename, cache_filename):
    """Annotate data-model candidate rows with person-noun and person-field detection.

    Candidate rows are those with ``PredictDataModel == 1`` or, when the column
    is present, ``DataModelLabel > 0``. When there are no candidates ``df`` is
    returned without the four detection columns.

    Args:
        df: DataFrame with ``PredictDataModel``, ``ClassName`` and
            ``DataFieldNames`` columns; modified in place.
        person_pii_filename: Passed to ``loadPersonPiiDetectionModel``.
        top_nouns_filename: Passed to ``loadPersonPiiDetectionModel``.
        cache_filename: CSV the dictionary-lookup cache is read from and,
            after detection, written back to.

    Returns:
        The same ``df`` object.
    """
    candidate_mask = df['PredictDataModel'] == 1
    # Unlabelled runs have no label column; fall back to predictions only.
    if 'DataModelLabel' in df.columns:
        candidate_mask = candidate_mask | (df['DataModelLabel'] > 0)
    dm_person_candidates = df[candidate_mask].index
    if not len(dm_person_candidates):
        return df

    dictionary = PyDictionary()
    for c in ['ClassPersonDetectionLevel', 'ClassPersonDetectionNoun', 'FieldsPersonDetectionLevel', 'FieldsPersonDetectionNames']:
        df[c] = ''
    person_pii_kw_list, person_nouns, non_person_nouns, cache = loadPersonPiiDetectionModel(person_pii_filename, top_nouns_filename, cache_filename)

    # Each helper returns [level, text]; zip(*...) splits the per-row pairs into two columns.
    df.loc[dm_person_candidates, 'ClassPersonDetectionLevel'],df.loc[dm_person_candidates, 'ClassPersonDetectionNoun'] = zip(*df.loc[dm_person_candidates, 'ClassName'].apply(lambda x: isPersonNounClass(x, person_nouns, non_person_nouns, cache, dictionary)))
    df.loc[dm_person_candidates, 'FieldsPersonDetectionLevel'],df.loc[dm_person_candidates, 'FieldsPersonDetectionNames'] = zip(*df.loc[dm_person_candidates, 'DataFieldNames'].apply(lambda x: isPersonFields(x, person_pii_kw_list)))

    # Persist to the same file the cache was loaded from (previously a
    # hard-coded 'cache_df.csv', which ignored the cache_filename argument).
    pd.DataFrame(list(cache.items()), columns=['Word','Level']).to_csv(cache_filename, index=False)

    return df

In [None]:
def load_df(fs, file_path):
    """Read a JSON table (``{'header': [...], 'rows': [...]}``) into a DataFrame.

    Args:
        fs: Object whose ``open(path, mode)`` returns a readable text file
            usable as a context manager.
        file_path: Path handed to ``fs.open``.

    Returns:
        DataFrame built from the ``rows`` entry with ``header`` as columns.
    """
    with fs.open(file_path, 'r') as handle:
        table = json.loads(handle.read())
    rows = table['rows']
    header = table['header']
    return pd.DataFrame(data=rows, columns=header)


In [None]:
def processDataMP(model_type, bucket_name, ml_features, extra_features, features_to_length, labelers = [], csv_prefix = 'features.csv', filenames = [], limit = 0):
    """Load and pre-process feature files in parallel, returning one concatenated DataFrame.

    Each file is handed to ``processFile`` through a ``ProcessPoolExecutor``;
    files for which ``processFile`` returns an empty result are counted as
    errors and skipped.

    Args:
        model_type: Forwarded to ``processFile``.
        bucket_name: Name of the bucket holding the files.
        ml_features: Unused here; kept for interface compatibility.
        extra_features: Unused here; kept for interface compatibility.
        features_to_length: Forwarded to ``processFile``.
        labelers: List of ``(label_column, raw_label_prefix)`` pairs. Not mutated.
        csv_prefix: Unused here; kept for interface compatibility.
        filenames: Object names inside the bucket. Not mutated.
        limit: When non-zero, only the first ``limit`` filenames are processed.

    Returns:
        DataFrame with a fresh 0..n-1 index and missing values filled with 0.

    Raises:
        ValueError: When no file could be processed (same exception type
            ``pd.concat`` raised for an empty list, with a clearer message).
    """
    # Kept from the original: the bucket is fetched up front although the
    # handle is not used below. NOTE(review): presumably a fail-fast access check.
    storage.Client().get_bucket(bucket_name)

    count = 0
    errors = 0
    success = 0
    t = time.time()
    cur_t = time.time()

    if limit:
        filenames = filenames[:limit]

    fs = gcsfs.GCSFileSystem()
    n_files = len(filenames)
    args = (filenames,
        repeat(fs, n_files),
        repeat(bucket_name, n_files),
        repeat(features_to_length, n_files),
        repeat(labelers, n_files),
        repeat(model_type, n_files))
    dfs = []
    mem_usage = 0
    file_number = 0
    # Create a pool of processes. By default, one is created for each CPU in your machine.
    with concurrent.futures.ProcessPoolExecutor() as executor:
        for filename, new_df in zip(filenames, executor.map(processFile, *args)):
            if not len(new_df):
                errors += 1
                continue
            mem_usage += new_df.memory_usage(index=True,deep=True).sum()
            dfs.append(new_df)
            success += 1

            count += 1
            if count % 1000 == 0:
                process = psutil.Process(os.getpid())
                print(process.memory_info().rss  /1024**3, 'GB process memory usage')
                print(count, round((time.time()-cur_t)/60,1), 'minutes, total = ', round((time.time()-t)/60,1), 'minutes')
                print(file_number, 'Memory usage = :',round(mem_usage/1024**3,1),'GB')
                cur_t = time.time()

                # The memory counter is per batch of 1000 successful files.
                mem_usage = 0

                file_number += 1

    if not dfs:
        raise ValueError('processDataMP: none of the ' + str(n_files) + ' files could be processed (' + str(errors) + ' errors)')
    df = pd.concat(dfs,axis=0)

    print('Memory usage = :',round(mem_usage/1024**3,1),'GB')
    print(len(df),'rows in df')
    print(count,'repositories in dataset')
    print(success,'successfully processed, ', errors, 'errors')
    df.reset_index(drop=True, inplace=True)
    df.fillna(0,inplace=True)

    if len(labelers):
        for labeler in labelers:
            label_col, raw_prefix = labeler[0], labeler[1]
            positives = df[label_col].sum()
            print(len(df),'classes,', positives,'classes with positive ',label_col,',', round((positives/len(df))*100.0,2),' % positive labels classes')
            # A separate loop variable is used here: reusing `labeler` made the
            # message print the first character of the column name.
            for raw_label_col in extractLabelColumns(df, raw_prefix):
                print(raw_label_col,': ',int(df[raw_label_col].sum()),'positive '+label_col+' classes')
        print()
    print('Total = ', round((time.time()-t)/60,1), 'minutes')
    return df




def _extract_callers_of(method_full_name, method_calls_map):
    method_full_name = method_full_name.split('(')[0]
    return method_calls_map[method_full_name] if method_full_name in method_calls_map else 0

        
def _repo_from_link(link):
    """Return ``owner/repo`` from a GitHub link by removing the host prefix."""
    # str.strip(chars) removes a *set of characters* from both ends, which
    # mangled repository names; remove the URL prefix explicitly instead.
    path = re.sub(r'^(?:https?://)?(?:www\.)?github\.com/', '', link, flags=re.IGNORECASE)
    return '/'.join(path.split('/')[:2])


def processFile(filename, fs, bucket_name, features_to_length, labelers, model_type):
    """Load one feature file from the bucket and derive the per-model columns.

    Args:
        filename: Object name inside ``bucket_name``.
        fs: File system object passed to ``load_df``.
        bucket_name: Bucket the file is read from (``gs://<bucket>/<filename>``).
        features_to_length: Column names for which a ``<name>Len`` column is added.
        labelers: List of ``(label_column, raw_label_prefix)`` pairs; each label
            column becomes 1 when any raw label column of that prefix is positive.
        model_type: ``'DataModel'``, ``'API'``, ``'Test'`` or ``'TestUtil'``;
            selects the extra derived columns.

    Returns:
        The processed DataFrame, or ``''`` when loading, repo extraction or
        labelling fails (callers test the result with ``len``).
    """
    try:
        new_df = load_df(fs, 'gs://'+bucket_name+'/'+filename)
        new_df.fillna(0,inplace=True)
    except Exception:
        return ''
    try:
        new_df['Repo'] = new_df['Link'].apply(_repo_from_link)
    except Exception:
        return ''

    for len_feature in features_to_length:
        new_df[len_feature+'Len'] = new_df[len_feature].apply(lambda x:len(x))

    try:
        for labeler in labelers:
            labels_columns = extractLabelColumns(new_df,labeler[1])
            if not len(set(labels_columns).intersection(set(new_df.columns))) == len(labels_columns):
                print('Error - training dataset must conatin all label columns:',labels_columns)
                return ''
            new_df[labeler[0]] = new_df.apply(lambda row: int(row[labels_columns].astype(int).sum() > 0), axis=1)
    except Exception:
        return ''

    if model_type == 'DataModel':
        features_df = ''
        new_df = detectPiis(new_df, features_df)

    if model_type == 'API':
        # Assign back instead of a chained in-place fillna on a column selection.
        new_df['InternalClassMethodCalls'] = new_df['InternalClassMethodCalls'].fillna('')
        new_df['ExternalClassMethodCalls'] = new_df['ExternalClassMethodCalls'].fillna('')
        new_df['ClassMethodCalls'] = new_df.apply(lambda row: list(set(row['InternalClassMethodCalls'] + row['ExternalClassMethodCalls'])), axis=1)

        # Number of rows in this file whose call list mentions each method.
        method_calls_histogram = collections.Counter([item for sublist in new_df['ClassMethodCalls'].values for item in sublist])
        new_df['MethodCalledLen'] = new_df['MethodUniqueName'].apply(lambda method_name: _extract_callers_of(method_name, method_calls_histogram))

    if model_type == 'Test' or model_type == 'TestUtil':
        test_classes = set(new_df[new_df[model_type+'Label']==1]['QualifiedName'].values)
        new_df[model_type+'InnerClass'] = 0
        if len(test_classes):
            # Flag classes contained by a class labelled as test / test-util.
            new_df[model_type+'InnerClass'] = new_df['ContainedByClasses'].apply(lambda x: 1 if len(set(x).intersection(test_classes)) > 0 else 0)

    return new_df


def extractLabelColumns(df, label_str):
    """Return, in column order, the names of ``df`` columns that start with ``label_str``."""
    return [column for column in df.columns if column.startswith(label_str)]




In [None]:
def detectPiis(df, features_df = ''):
    """Add PII detection columns to a class-level DataFrame.

    Uses the module-level ``pii_model``. Two columns are added to ``df``:

    * ``'RawFieldNames PIIs Detection'``: the matching data field names joined
      with ``' ;'`` when the highest keyword level found in the fields or in
      the class name is above 1, otherwise ``''``.
    * ``'PII Level'``: 2 by default, 1 when a word of the class name is in
      ``pii_model['pii_class_kw']``.

    Rows with no data fields, or rejected by ``doFilter``, keep the defaults.

    Args:
        df: DataFrame with ``DataFieldNames``, ``ClassName`` and ``Path``
            columns; modified in place.
        features_df: Optional per-field feature table with a ``ClassName``
            column; the default ``''`` disables field filtering.

    Returns:
        The same ``df`` object.
    """
    df['RawFieldNames PIIs Detection'] = ''
    df['PII Level'] = 2

    for idx, row in df.iterrows():
        if len(row['DataFieldNames']) == 0:
            continue

        if doFilter(row, pii_model):
            continue

        cur_features_df = ''
        if len(features_df):
            cur_features_df = features_df[features_df['ClassName'] == row['ClassName']]

        # findPIIs returns a 10-tuple: [1] matched tokens, [2] number of
        # matched tokens, [6] highest keyword level.
        fields_result = findPIIs(row['DataFieldNames'], pii_model, True, cur_features_df)
        fields_piis_matches = fields_result[1]
        fields_num_matches = fields_result[2]
        fields_max_level = fields_result[6]

        # findPIIs iterates over its input token by token, so the class name is
        # passed as a one-element list. The previous '[' + name + ']' string was
        # iterated character by character and could only match one-letter keywords.
        class_result = findPIIs([str(row['ClassName'])], pii_model, False)
        class_num_matches = class_result[2]
        class_max_level = class_result[6]

        words = set(splitUpper(row['ClassName']))
        if len(words.intersection(pii_model['pii_class_kw'])) > 0:
            df.loc[idx,'PII Level'] = 1

        if fields_num_matches + class_num_matches == 0:
            continue

        pii_fields = []
        if fields_max_level > 1 or class_max_level > 1:
            pii_fields = fields_piis_matches

        df.loc[idx,'RawFieldNames PIIs Detection'] = ' ;'.join(pii_fields)

    return df

In [None]:
def doFilter(row, pii_model):
    """Tell whether a class row is excluded by the class-name or path blacklist.

    Returns True when any word of ``row['ClassName']`` is in
    ``pii_model['blacklist']['class_name']`` or any word of any ``'/'``
    separated segment of ``row['Path']`` is in
    ``pii_model['blacklist']['path']``; False otherwise.
    """
    name_words = set(splitUpper(row['ClassName']))
    if name_words.intersection(pii_model['blacklist']['class_name']):
        return True

    path_words = set()
    for segment in row['Path'].split('/'):
        path_words.update(splitUpper(segment))

    return len(path_words.intersection(pii_model['blacklist']['path'])) > 0


def filterField(token, features_df):
    """Decide whether a field should be excluded from PII detection.

    Args:
        token: Field name to look up in ``features_df['FieldName']``.
        features_df: Table with ``FieldName``, ``FieldType``, ``IsFinal`` and
            ``IsStatic`` columns.

    Returns:
        True when the field is unknown, or when its first matching row is of
        type ``boolean`` (case-insensitive), final or static; False otherwise.
    """
    field_features = features_df[features_df['FieldName'] == token]
    if not len(field_features):
        return True

    first_match = field_features.iloc[0]
    # FieldType may hold a non-string placeholder (for example after a fillna),
    # so normalise through str() instead of failing on .lower().
    is_boolean = str(first_match['FieldType']).lower() == 'boolean'
    return bool(is_boolean or first_match['IsFinal'] == 1 or first_match['IsStatic'] == 1)
    

In [None]:
def splitUpper(s):
    """Split an identifier into lower-case words.

    Pieces are found with a regex that breaks on upper-case letters, ``_``,
    ``.``, digits and whitespace; ``_`` and ``.`` are then removed from each
    piece. On any error the input is printed and an empty list is returned.
    """
    try:
        pieces = re.findall(r'[A-Z_.]?[^A-Z0-9_.\s]+|[A-Z]+', s)
        words = []
        for piece in pieces:
            words.append(piece.replace('_', '').replace('.', '').lower())
        return words
    except:
        print('Error processing: ', s)
        return []



def findPIIs(data, pii_model, do_filter, features_df = ''):
    """Scan tokens for PII keywords and collect per-token and aggregate statistics.

    Args:
        data: Iterable of tokens; each one is split into words with
            ``splitUpper``. NOTE(review): a plain string would be iterated
            character by character - callers presumably pass a list of names.
        pii_model: Dict with ``'pii_keywords'`` (word -> dict holding
            ``'Weights'`` and ``'Coupled'``) and
            ``'blacklist'['field_prefix']``.
        do_filter: When true, tokens whose first word is a blacklisted prefix
            are skipped.
        features_df: When non-empty, tokens for which ``filterField`` returns
            True are skipped.

    Returns:
        A 10-tuple ``(piis, matches, num_matches, num_kw_matches,
        avg_match_len, num_coupled, max_level, scores, piis_coupled,
        max_similarity_score)``:
        ``piis`` are description strings per matching token, ``matches`` the
        matching tokens, ``num_matches`` their count, ``num_kw_matches`` the
        total number of distinct keywords matched over all tokens,
        ``avg_match_len`` their ratio (0 without matches), ``num_coupled`` the
        number of coupled keyword pairs found, ``max_level`` the highest level
        seen, ``scores`` a dict token -> details, ``piis_coupled`` description
        strings for tokens with coupled pairs, and ``max_similarity_score``
        the highest per-token similarity score.
    """
    piis = []
    piis_coupled = []
    
    num_matches = 0
    num_coupled = 0
    num_kw_matches = 0
    matches = []
        
    # Keyed by token, so a repeated token overwrites its earlier entry.
    scores = {}
        
    max_level = 0
    max_similarity_score = 0
    for token in data:
        max_token_level = 0
        comb = set([])
        spl = splitUpper(token)
        if not len(spl):
            continue
            
        if do_filter and spl[0] in pii_model['blacklist']['field_prefix']:
            continue
            
        if len(features_df) and filterField(token, features_df):
            continue
        # Collect the distinct words of this token that are keywords with a positive weight.
        for t in spl:
            if t in pii_model['pii_keywords'].keys() and pii_model['pii_keywords'][t]['Weights'] > 0:
                comb.add(t)
                if pii_model['pii_keywords'][t]['Weights'] > max_token_level:
                    max_token_level = pii_model['pii_keywords'][t]['Weights']
                    
                if pii_model['pii_keywords'][t]['Weights'] > max_level:
                    max_level = pii_model['pii_keywords'][t]['Weights']
                    
                
        comb = list(comb)
        if len(comb) == 0:
            continue
        scores[token] = {}
        # Fraction of the token's words that are keywords (distinct keywords / all words).
        scores[token]['similarity_score'] = len(comb) / len(spl)
        scores[token]['max_level'] = max_token_level
        scores[token]['words'] = comb
          
        scores[token]['coupled_words'] = []
        # Histogram: keyword weight -> number of matched keywords with that weight.
        scores[token]['levels_count'] = {}
        for c in comb:
            if not pii_model['pii_keywords'][c]['Weights'] in scores[token]['levels_count']:
                scores[token]['levels_count'][pii_model['pii_keywords'][c]['Weights']] = 0
            scores[token]['levels_count'][pii_model['pii_keywords'][c]['Weights']] += 1
            
            coupled = pii_model['pii_keywords'][c]['Coupled']
            # NOTE(review): coupled_words only ever receives 'word:partner'
            # strings, so the `c in ...` test appears to never be true - confirm intent.
            if not len(coupled) or c in scores[token]['coupled_words']:
                continue
            for couple in coupled:
                if couple in comb:
                    # A keyword found together with one of its coupled words forces level 3.
                    scores[token]['coupled_words'].append(c+':'+couple)
                    scores[token]['max_level'] = 3
                    max_level = 3
                    num_coupled += 1
                    
        if scores[token]['similarity_score'] > max_similarity_score:
            max_similarity_score = scores[token]['similarity_score']
        num_kw_matches += len(comb)
        comb.sort()
                
        # Description format: '<token>,<level>,<similarity>: <kw1>,<kw2>,...'
        piis.append(token+','+str(scores[token]['max_level'])+','+str(scores[token]['similarity_score'])+': '+ ','.join(comb))
        if len(scores[token]['coupled_words']):
            piis_coupled.append(token+': '+ ','.join(scores[token]['coupled_words']))
        num_matches += 1
        matches.append(token)
    avg_match_len = 0 if num_matches == 0 else num_kw_matches / num_matches
    return piis, matches, num_matches, num_kw_matches, avg_match_len, num_coupled, max_level, scores, piis_coupled, max_similarity_score


In [None]:
def buildModels(df, features, labelers, DT = False):
    """Fit one classifier per labeler on ``df[features]``.

    Args:
        df: Training DataFrame.
        features: Feature column names.
        labelers: List of ``(label_column, raw_label_prefix)`` pairs.
        DT: When true a ``DecisionTreeClassifier`` is used, otherwise a
            ``RandomForestClassifier`` with 30 estimators and all features
            considered per split.

    Returns:
        Dict ``label_column -> {'Prefix': prefix, 'clf': fitted classifier}``,
        or ``''`` (after printing an error) when a label column is missing.
    """
    models = {}
    for labeler in labelers:
        label_col = labeler[0]
        if label_col not in df:
            print('Error - training dataset must contain ',label_col,' feature')
            return ''

        entry = {'Prefix': labeler[1]}
        models[label_col] = entry
        estimator = DecisionTreeClassifier() if DT else RandomForestClassifier(n_estimators=30, max_features=None)
        entry['clf'] = estimator.fit(df[features], df[label_col])
    return models
    

In [None]:
def predict(df, features, models, logfile = ''):
    """Run every model on ``df[features]`` and add the prediction columns in place.

    For each label column ``L`` the columns ``PredictL`` and
    ``PredictPositiveProbabilityL`` are written and ``calcPerformance`` is
    called. When the model carries a non-empty ``'PositiveMD5'`` set, rows
    whose feature-vector MD5 is in that set are forced to a positive
    prediction with probability 1; the raw model outputs are kept in
    ``MLPredictL`` / ``MLPredictPositiveProbabilityL``. Finally the results are
    copied to ``Predict<name>`` / ``PredictProba<name>`` where ``<name>`` is
    ``L`` with ``'Label'`` removed.

    Returns:
        The same ``df`` object.
    """
    for label_col, model_data in models.items():
        pred_col = 'Predict' + label_col
        proba_col = 'PredictPositiveProbability' + label_col
        classifier = model_data['clf']

        df[pred_col] = classifier.predict(df[features])
        df[proba_col] = classifier.predict_proba(df[features])[:,-1]
        calcPerformance(df, [(label_col,model_data['Prefix'])], logfile)

        if 'PositiveMD5' in model_data and len(model_data['PositiveMD5']) > 0:
            md5_col = 'PredictPositiveByMD5' + label_col

            def is_known_positive(row):
                # Same signature as the one stored at training time.
                digest = hashlib.md5(str(row[features].astype(float).values).encode('utf-8')).hexdigest()
                return 1 if (digest in model_data['PositiveMD5']) else 0

            df[md5_col] = df.apply(is_known_positive, axis=1)
            # Keep the untouched model outputs before overriding them.
            df['MLPredict'+label_col] = df[pred_col]
            df['MLPredictPositiveProbability'+label_col] = df[proba_col]
            df[pred_col] = df.apply(lambda row: 1 if row[md5_col] == 1 else row[pred_col],axis=1)
            df[proba_col] = df.apply(lambda row: 1 if row[md5_col] == 1 else row[proba_col], axis=1)
            if label_col in df.columns:
                calcPerformance(df, [(label_col,model_data['Prefix'])], logfile)

        model_name = label_col.replace('Label','')
        df['Predict'+model_name] = df[pred_col]
        df['PredictProba'+model_name] = df[proba_col]
    return df

In [None]:
def calcPerformance(df, labelers, logfile = ''):
    """Print (and optionally append to ``logfile``) confusion-matrix rates per label.

    For each ``(label_column, prefix)`` in ``labelers`` the function compares
    ``Predict<label_column>`` with ``<label_column>`` and reports, as
    percentages rounded to one decimal: TP and FN relative to the number of
    positive labels, FP and TN relative to the number of zero labels, and
    accuracy relative to all rows. A rate with a zero denominator is reported
    as ``nan``.

    Args:
        df: DataFrame holding the label and prediction columns.
        labelers: List of ``(label_column, raw_label_prefix)`` pairs.
        logfile: Path of a text file to append the metrics to; ``''`` disables
            file output.

    Returns:
        None. Stops at the first labeler whose columns are missing. Errors are
        reported on stdout and never raised.
    """
    def _percentage(numerator, denominator):
        if not denominator:
            return float('nan')
        return round(100 * numerator / denominator, 1)

    try:
        for l in labelers:
            label_col = l[0]
            predict_col = 'Predict' + label_col
            print(label_col)
            if label_col not in df or predict_col not in df:
                print('Error - dataset must contain ',label_col,' and Predict',label_col,'features')
                return

            predicted_pos = df[predict_col] == 1
            predicted_neg = df[predict_col] == 0
            actual_pos = df[label_col] == 1
            actual_neg = df[label_col] == 0

            # Vectorised counts (the builtin sum() iterates in pure Python).
            positives = int(actual_pos.sum())
            negatives = int(actual_neg.sum())
            tp = int((predicted_pos & actual_pos).sum())
            fp = int((predicted_pos & actual_neg).sum())
            tn = int((predicted_neg & actual_neg).sum())
            fn = int((predicted_neg & actual_pos).sum())

            # Metric names (including the 'Accuary' spelling) are part of the
            # existing log format and are kept unchanged.
            report = [
                ('TP', _percentage(tp, positives)),
                ('FP', _percentage(fp, negatives)),
                ('TN', _percentage(tn, negatives)),
                ('FN', _percentage(fn, positives)),
                ('Accuary', _percentage(tp + tn, len(df))),
            ]

            for metric_name, value in report:
                print(metric_name + ' = ', value)

            if len(logfile):
                # Context manager guarantees the handle is closed even if a write fails.
                with open(logfile, 'a') as log:
                    for metric_name, value in report:
                        log.write(metric_name + ' = ' + str(value) + '\n')
    except Exception as exc:
        print('Error calculating calcPerformance', exc)

In [None]:
def splitUpperV2(s):
    """Split an identifier into lower-case words, keeping acronyms together.

    The first character is upper-cased before matching. Each piece starts at
    an upper-case letter, ``_``, ``.`` or ``-``; those three separators are
    removed from the result. On any error (including an empty string) the
    input is printed and an empty list is returned.
    """
    try:
        first_char = s[0]
        if not first_char.isupper():
            s = first_char.upper() + s[1:]
        words = []
        for piece in re.findall(r'[A-Z_.-](?:[A-Z ]+(?![a-z])|[a-z]*)', s):
            words.append(re.sub(r'[-_.]', '', piece).lower())
        return words
    except:
        print('Error processing: ', s)
        return []


In [None]:
def hasSerialize(words):
    """Return 1 if any word contains ``'serialize'`` (case-insensitive), else 0."""
    return 1 if any('serialize' in word.lower() for word in words) else 0

In [None]:
def findDistinctAnnotations(x):
    """Return the set of all elements found in the values of mapping ``x``."""
    distinct = set()
    distinct.update(*x.values())
    return distinct

In [None]:
def postProcess(df, model_type):
    """Add the derived columns each model type needs; ``df`` is modified in place.

    * ``'API'``: keeps the raw visibility in ``org_Visibility``, encodes
      ``Visibility`` (0 -> 0, protected -> 1, private -> 2, public -> 3,
      anything else unchanged) and counts distinct parameter annotations.
    * ``'DataModel'``: serialize-annotation flags, non-data field count,
      distinct field-annotation count, and ``LogicMethodRatio`` forced to 0
      for rows with ``RawMethodNamesLen == 0``.
    * ``'Test'`` / ``'TestUtil'``: word-list columns derived from the path,
      class name, annotations, imports and method names.

    Returns:
        The same ``df`` object.
    """
    def encode_visibility(value):
        if value == 0:
            return 0
        if value == 'protected':
            return 1
        if value == 'private':
            return 2
        if value == 'public':
            return 3
        return value

    def path_words(path):
        flat = []
        for segment in path.split('/'):
            flat.extend(splitUpperV2(segment))
        return str(list(set(flat)))

    if model_type == 'API':
        df['org_Visibility'] = df['Visibility']
        df['Visibility'] = df['Visibility'].apply(encode_visibility)
        df['ParameterAnnotationsLen'] = df['ParameterAnnotations'].apply(lambda x: len(findDistinctAnnotations(x)))

    if model_type == 'DataModel':
        df['HasSerializeFieldAnnotation'] = df['FieldAnnotationsByName'].apply(lambda x: hasSerialize(findDistinctAnnotations(x)))
        df['HasSerializeMethodAnnotation'] = df['MethodAnnotations'].apply(hasSerialize)
        df['NonDataFieldNamesLen'] = df['RawFieldNamesLen'] - df['DataFieldNamesLen']
        df['FieldAnnotationsByNameLen'] = df['FieldAnnotationsByName'].apply(lambda x: len(findDistinctAnnotations(x)))
        no_methods = df['RawMethodNamesLen'] == 0
        df.loc[no_methods,'LogicMethodRatio'] = 0

    if model_type in ('Test', 'TestUtil'):
        df['PathWords'] = df['Path'].apply(path_words)
        df['ClassNameWords'] = df['ClassName'].apply(splitUpperV2)
        df['ClassAnnotationsWords'] = df['ClassAnnotations']
        df['FieldAnnotationsWords'] = df['FieldAnnotationsByName'].apply(lambda x: list(findDistinctAnnotations(x)))
        df['RawImportsWords'] = df['RawImports']
        df['RawMethodNamesWords'] = df['RawMethodNames']
        df['MethodAnnotationsWords'] = df['MethodAnnotations']

    return df

In [None]:
def loadPiiModel(filename):
    """Load the PII keyword table from a CSV file.

    The CSV needs the columns ``PII``, ``Weights`` and ``Coupled``
    (comma-separated words). Rows with weight 0 and no coupled words are
    skipped. Every kept keyword is stored twice: as written and in the plural
    form produced by ``inflect``.

    Returns:
        Dict ``keyword -> {'Weights': weight, 'Coupled': [words]}``.
    """
    engine = inflect.engine()
    table = pd.read_csv(filename).fillna('')

    pii_model = {}
    for _, row in table.iterrows():
        coupled = [word.lower().strip('"').strip() for word in row['Coupled'].split(',')]
        if row['Weights'] == 0 and coupled[0] == '':
            continue

        singular = row['PII']
        pii_model[singular] = {'Weights': row['Weights'], 'Coupled': coupled}
        # The plural form shares the weight and the coupled-word list.
        pii_model[engine.plural(singular)] = {'Weights': row['Weights'], 'Coupled': coupled}
    return pii_model

In [None]:
def finalizeModel(training_df, model_type, train_db):
    """Store the MD5 signatures of all positive training rows in the DataModel model.

    Only acts when ``model_type == 'DataModel'``. The signature of a row is the
    MD5 hex digest of ``str()`` of its ML feature values cast to float - the
    same formula ``predict`` applies per row - and the resulting set is saved
    under ``train_db[model_type]['model'][<label column>]['PositiveMD5']``.

    Args:
        training_df: Training DataFrame; not modified.
        model_type: Model key inside ``train_db``.
        train_db: Training database dict; updated in place.

    Returns:
        The same ``train_db`` object.
    """
    if model_type != 'DataModel':
        return train_db

    model_cfg = train_db[model_type]
    label_col = model_cfg['labelers'][0][0]

    # Select the rows and feature columns in one .loc call and hash the values
    # directly. This avoids assigning a column onto a filtered slice
    # (SettingWithCopyWarning) and also works when there are no positive rows.
    positive_features = training_df.loc[training_df[label_col] == 1, model_cfg['ml_features']]
    pos_signatures = {
        hashlib.md5(str(feature_row).encode('utf-8')).hexdigest()
        for feature_row in positive_features.astype(float).values
    }
    model_cfg['model'][label_col]['PositiveMD5'] = pos_signatures
    return train_db

In [None]:
def buildTestModelData(training_df, model_type, ml_features, extracted_features_db):
    """Build the 0/1 word-feature matrix used by the Test / TestUtil models.

    For every row and every column named in ``extracted_features_db`` the
    lower-cased items of that column are intersected with the column's
    ``'words'`` set; when a ``'startswith_words'`` set exists, the first two
    dot-separated parts of each item are intersected with it as well. Each hit
    activates the feature ``'<column>:<word>'``.

    Args:
        training_df: DataFrame whose listed columns hold iterables of strings
            and which has a ``<model_type>Label`` column.
        model_type: Used to build the label column name.
        ml_features: Ordered feature names; these become the matrix columns.
        extracted_features_db: Dict ``column -> {'words': set,
            optional 'startswith_words': set}``.

    Returns:
        DataFrame with one float 0/1 column per ML feature, plus ``Ind`` (the
        original index value) and ``<model_type>Label``.

    Raises:
        KeyError: When an activated feature is not listed in ``ml_features``.
    """
    label_col = model_type + 'Label'
    column_of = {feature: position for position, feature in enumerate(ml_features)}

    matrix = np.zeros((len(training_df), len(ml_features)))
    ind = []
    lab = []
    for row_pos, (idx, row) in enumerate(training_df.iterrows()):
        active = set()
        for f, feature_db in extracted_features_db.items():
            words = set(w.lower() for w in row[f])
            active.update(f + ':' + word for word in words.intersection(feature_db['words']))
            if 'startswith_words' in feature_db:
                prefixes = set('.'.join(w.lower().split('.')[0:2]) for w in row[f])
                # One pass is enough: the previous nested loop re-added the
                # whole shared set once per element.
                active.update(f + ':' + word for word in prefixes.intersection(feature_db['startswith_words']))

        for feature in active:
            matrix[row_pos, column_of[feature]] = 1
        ind.append(idx)
        lab.append(row[label_col])

    model_df = pd.DataFrame(matrix, columns=ml_features)
    model_df['Ind'] = ind
    model_df[label_col] = lab
    return model_df


In [None]:
pii_model_filename = 'PiiModel.csv'

# One keyword per line; the context manager closes the file once it is read.
with open('pii_class_1gram.csv') as pii_class_kw_file:
    pii_class_kw = set(line.strip() for line in pii_class_kw_file)

# Global PII model read by detectPiis / doFilter / findPIIs.
pii_model = {}
pii_model['pii_keywords'] = loadPiiModel(pii_model_filename)
pii_model['blacklist'] = {}
# Fields whose first word is one of these are skipped when filtering is on.
pii_model['blacklist']['field_prefix'] = set(['is', 'has'])
pii_model['blacklist']['field_name'] = set([])
# Classes whose name or path contains one of these words are skipped.
pii_model['blacklist']['class_name'] = set(['test', 'tests', 'testing','service'])
pii_model['blacklist']['path'] = set(['test', 'tests', 'testing'])
pii_model['pii_class_kw'] = pii_class_kw

In [None]:
# Training bucket; an empty bucket_folder means no folder filter is applied.
bucket_name = 'lim-research-features-20'
bucket_folder = ''

# Suffix used in the names of the pickled model and the output CSV files.
test_phase = '#22'

# --- DataModel -------------------------------------------------------------
# Collection-valued columns for which a '<name>Len' feature is derived.
data_model_features_to_length = [
    'ClassAnnotations', 'MethodAnnotations', 'DataFieldNames', 'RawFieldNames',
    'RawImports', 'RawMethodNames', 'RawComments', 'StringLiterals',
    'ImplementedInterfaces', 'ExternalClassesMethodCalls', 'ExternalCoreMethodCalls',
]

data_model_ml_features = [
    'IsLeaf', 'IsSerializable', 'GetterCount', 'HasEquals', 'HasHashCode',
    'IsCloneable', 'IsComparable', 'HasToString', 'LogicMethodCount', 'SetterCount',
    'AverageMethodBodyLen', 'MedianMethodBodyLen', 'StaticMethodsPercentage',
    'PublicMethodsPercentage', 'ConstructorsCount', 'HasIdentifier',
    'HasIdentifierAnnotation', 'LogicMethodRatio', 'HasSerializeFieldAnnotation',
    'HasSerializeMethodAnnotation', 'NonDataFieldNamesLen', 'FieldAnnotationsByNameLen',
]
data_model_ml_features = data_model_ml_features + [name + 'Len' for name in data_model_features_to_length]

# (label column, prefix of the raw per-labeler columns)
data_model_labelers = [('DataModelLabel','!ModelLabel!')]

# --- API -------------------------------------------------------------------
api_features_to_length = [
    'Annotations', 'ReturnType', 'ExternalClassMethodCalls',
    'ExternalCoreMethodCalls', 'InternalClassMethodCalls', 'ParametersNameToType',
]

api_ml_features = [
    'MethodCalledLen', 'BodyLength', 'HasJavadoc', 'IsGeneric', 'IsGetterOrSetter',
    'HttpMethodAnnotation', 'IsStatic', 'PostPutAnnotation', 'RouteLikeAnnotation',
    'EndsWithHttpMethod', 'StartsWithHttpMethod', 'Visibility', 'ParameterAnnotationsLen',
]
api_ml_features = api_ml_features + [name + 'Len' for name in api_features_to_length]

api_labelers = [('ApiLabel','!ApiLabel!')]

# --- Test / TestUtil -------------------------------------------------------
test_labelers = [('TestLabel','!TestLabel!')]

test_util_labelers = [('TestUtilLabel','!TestUtilLabel!')]

# Per-model training configuration. 'Filenames' starts empty and is filled
# from the bucket listing; every model gets its own list objects.
train_db = {
    'DataModel': {
        'extra_features': ['QualifiedName','ExtendedClass','ClassName','Repo','Path', 'Link'],
        'ml_features': data_model_ml_features,
        'features_to_length': data_model_features_to_length,
        'labelers': data_model_labelers,
        'csv_prefix': 'class_features.json',
        'Filenames': [],
    },
    'API': {
        'extra_features': ['ClassName', 'ClassQualifiedName','MethodName', 'MethodUniqueName','Repo','Path', 'Link','DeclaringClassFullName','InternalMethodCalls','MethodCalledLen'],
        'ml_features': api_ml_features,
        'features_to_length': api_features_to_length,
        'labelers': api_labelers,
        'csv_prefix': 'method_features.json',
        'Filenames': [],
    },
    # Test and TestUtil have no fixed ML feature list at this point.
    'Test': {
        'extra_features': ['ClassName','Repo','Path', 'Link','MethodAnnotations','RawImports','RawMethodNames','ClassAnnotations','FieldAnnotations','ContainedByClasses','QualifiedName','TestInnerClass'],
        'ml_features': '',
        'features_to_length': '',
        'labelers': test_labelers,
        'csv_prefix': 'class_features.json',
        'Filenames': [],
    },
    'TestUtil': {
        'extra_features': ['ClassName','Repo','Path', 'Link','MethodAnnotations','RawImports','RawMethodNames','ClassAnnotations','FieldAnnotations','ContainedByClasses','QualifiedName','TestInnerClass'],
        'ml_features': '',
        'features_to_length': '',
        'labelers': test_util_labelers,
        'csv_prefix': 'class_features.json',
        'Filenames': [],
    },
}


# Distribute the bucket's feature files over the models: method-level files
# feed the API model, class-level files feed DataModel, Test and TestUtil.
client = storage.Client()
bucket = client.get_bucket(bucket_name)
for blob in bucket.list_blobs():
    blob_name = blob.name
    if len(bucket_folder) and not blob_name.startswith(bucket_folder):
        continue
    if blob_name.endswith('method_features.json'):
        train_db['API']['Filenames'].append(blob_name)
    elif blob_name.endswith('class_features.json'):
        for class_level_model in ('DataModel', 'Test', 'TestUtil'):
            train_db[class_level_model]['Filenames'].append(blob_name)

for model_name, model_cfg in train_db.items():
    print(len(model_cfg['Filenames']), model_name ,'files')

In [None]:
# for model_type, model_data in train_db.items():
#     if model_type != 'API':
#         continue
#     print('Training', model_type)
#     training_df = processDataMP(model_type, bucket_name, model_data['ml_features'], model_data['extra_features'], model_data['features_to_length'], model_data['labelers'], model_data['csv_prefix'], model_data['Filenames'], 
#                                 limit = 0)
#     training_df = postProcess(training_df, model_type)   
#     #training_df.to_csv(model_type+'TrainingData'+test_phase+'.csv', index=False) 
#     #training_df = pd.read_csv(model_type+'TrainingData'+test_phase+'.csv')
#     #print('Training file size: ' ,round(os.path.getsize(model_type+'TrainingData'+test_phase+'.csv')/1024**3,2), 'GB')
#     #train_db[model_type]['training_df_file'] = model_type+'TrainingData'+test_phase+'.csv'

#     # Build the ML classification model
#     #m = 'StartsWithPost'
#     #training_df[m] = training_df[m].apply(lambda x: x if not x in ['public', 'private', 'protected'] else 0)
#     train_db[model_type]['model'] = buildModels(training_df, train_db[model_type]['ml_features'], train_db[model_type]['labelers'])
#     train_db = finalizeModel(training_df, model_type, train_db)
#     train_db[model_type]['Filenames'] = []
#     pickle.dump(train_db, open('TrainDB_'+test_phase+'.p','wb'))
# print('Models file size: ' ,round(os.path.getsize('TrainDB_'+test_phase+'.p')/1024**3,2), 'MB')

In [None]:
test_phase = '#22'
# NOTE(review): pickle.load can execute arbitrary code - only load model files
# from a trusted source.
with open('TrainDB_'+test_phase+'.p','rb') as db_file:
    db = pickle.load(db_file)

# Here you should set the testing buckets names
db['API']['bucket'] = 'lim-research-features-20'
db['DataModel']['bucket'] = 'lim-research-features-20'
db['Test']['bucket'] = 'lim-research-features-20'
db['TestUtil']['bucket'] = 'lim-research-features-20'

# set to False if csv files don't contain the labelers columns
has_labelers = True
f = 'log_'+str(datetime.datetime.now()).split('.')[0]+'.txt'
# Context managers close the log after every write; the trailing newline keeps
# the header from fusing with the first 'Testing ...' entry.
with open(f,'w') as log:
    log.write('Starting prediction: ' + str(datetime.datetime.now()) + '\n')
for model_type, model_data in db.items():
    print('Testing', model_type)
    with open(f,'a') as log:
        log.write('Testing '+ model_type+'\n')
    if not 'model' in model_data:
        print('No model found - skipping ',model_type)
        continue

    # Collect this model's feature files from its testing bucket.
    client = storage.Client()
    bucket = client.get_bucket(model_data['bucket'])
    db[model_type]['Filenames'] = []
    for b in bucket.list_blobs():
        if b.name.endswith(model_data['csv_prefix']):
            db[model_type]['Filenames'].append(b.name)

    print(len(db[model_type]['Filenames']), ' testing files')

    if not has_labelers:
        db[model_type]['labelers'] = []

    testing_df = processDataMP(model_type,
                               db[model_type]['bucket'],
                               db[model_type]['ml_features'],
                               db[model_type]['extra_features'],
                               db[model_type]['features_to_length'],
                               db[model_type]['labelers'],
                               db[model_type]['csv_prefix'],
                               db[model_type]['Filenames'],
                               limit = 0)
    testing_df = postProcess(testing_df, model_type)

    if model_type in ['API','DataModel']:
        testing_df = predict(testing_df, db[model_type]['ml_features'], db[model_type]['model'],f)

    if model_type == 'DataModel':
        testing_df = personPiiDetection(testing_df, 'person_pii.csv', 'top_nouns.csv', 'cache_df.csv')

    if model_type in ['Test','TestUtil']:
        # These models predict on a separate word-feature matrix; the results
        # are copied back onto the class-level frame.
        test_model_df = buildTestModelData(testing_df, model_type, db[model_type]['ml_features'], db[model_type]['extracted_features_db'])
        p = predict(test_model_df, db[model_type]['ml_features'], db[model_type]['model'],f)
        testing_df['Predict'+model_type] = p['Predict'+model_type+'Label']
        testing_df['PredictPositiveProbability'+model_type] = p['PredictPositiveProbability'+model_type+'Label']

    testing_df.to_csv(model_type+'TestingData'+test_phase+'.csv', index=False)
    print()
    print()
        
    