In [1]:
!python -VVV

Python 3.7.9 (default, Aug 31 2020, 12:42:55) 
[GCC 7.3.0]


In [2]:
import gzip
import pickle as p
import re
import string
from collections import Counter

import pandas
import pandas as pd
import numpy as np
import redis
import xgboost
from hypy_utils import write
from hypy_utils.tqdm_utils import pmap
from xgboost import XGBClassifier

from main import job_start_redis
from pandas import DataFrame
from datetime import datetime, timezone
from dateutil.relativedelta import relativedelta
import os
from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer


def load_zipped_pickle(filename):
    """Read a gzip-compressed pickle file and wrap its payload in a DataFrame.

    :param filename: Path of the gzip-compressed pickle file.
    :return: ``pd.DataFrame`` constructed from the unpickled object.
    """
    # NOTE(review): unpickling can execute arbitrary code; only use on trusted files.
    with gzip.open(filename, mode='rb') as stream:
        return pd.DataFrame(p.load(stream))


# total changed files
# total commits
# total fix commits
# average changed lines per fix
#
def get_root_nodes(date: str, data_root='../../data.absolute'):
    """Collect the distinct first lines of all pattern files of one snapshot.

    Reads the first line of every file in ``<data_root>/<date>/patterns`` and
    returns the unique values in sorted order. Lines are returned exactly as
    ``readline()`` yields them: including the trailing newline when the file
    has one, and ``''`` for an empty file.

    :param date: Name of the snapshot directory (callers pass ``YYYY-MM-DD`` strings).
    :param data_root: Directory that holds the per-date snapshot folders.
    :return: Sorted list of unique first lines.
    :raises FileNotFoundError: If the patterns directory does not exist.
    """
    patterns_dir = Path(data_root) / date / 'patterns'
    # A set de-duplicates while reading; the order of os.listdir is irrelevant
    # because the result is sorted at the end.
    first_lines = set()
    for patch in os.listdir(patterns_dir):
        with open(patterns_dir / patch, 'r') as f:
            first_lines.add(f.readline())
    return sorted(first_lines)


def label_gen() -> DataFrame:
    """Build the snapshot-to-snapshot label table and write it to CSV.

    Starting at 2001-12-01 and stepping six months at a time, the root nodes of
    each snapshot (as returned by ``get_root_nodes``) are compared with those of
    the previous snapshot. Iteration stops at the first step whose directory
    does not exist under ``../../data.absolute``.

    Each snapshot is read from disk once; its roots are carried over as the
    "previous" roots of the next step.

    Side effect: the table is written to ``diff-test-absolute-root-2.csv`` in
    the current working directory.

    :return: DataFrame with one row per snapshot and the columns 'Time',
        'Number of Patches', 'Numbers of added', 'Numbers of removed',
        'Patches Added', 'Patches Removed' and 'New Pattern'.
    """
    date_format = '%Y-%m-%d'
    interval = relativedelta(months=6)
    data_path = Path('../../data.absolute')

    current_date = datetime.strptime("2001-12-01", date_format)
    current_roots = get_root_nodes(current_date.strftime(date_format))

    # First snapshot: every root counts as "added", nothing is removed, and the
    # 'New Pattern' flag is unconditionally True.
    csv = [(current_date.strftime(date_format), len(current_roots), len(current_roots), 0,
            list(current_roots), [], True)]

    while True:
        next_date = current_date + interval
        next_string = next_date.strftime(date_format)
        # Stop at the first interval for which no snapshot directory exists.
        if not os.path.isdir(data_path / next_string):
            break

        next_roots = get_root_nodes(next_string)

        added = sorted(set(next_roots) - set(current_roots))
        remove = sorted(set(current_roots) - set(next_roots))

        csv.append((next_string, len(next_roots), len(added), len(remove), added, remove, len(added) != 0))

        # The snapshot just processed becomes the baseline of the next step.
        current_date, current_roots = next_date, next_roots

    df = DataFrame(csv, columns=('Time', 'Number of Patches', 'Numbers of added', 'Numbers of removed', 'Patches Added',
                                 'Patches Removed', 'New Pattern'))
    df.to_csv('diff-test-absolute-root-2.csv')
    return df


def get_commits_sha(start: str, end: str, project_name: str) -> pd.Series:
    """Return the 'commit' column of one project's fix commits dated strictly inside a window.

    The commits are loaded from
    ``/workspace/EECS-Research/data.absolute/<end>/commitsDF/<project_name>-fix.pickle.gz``.

    :param start: Exclusive lower bound, formatted ``YYYY-MM-DD`` (interpreted as UTC midnight).
    :param end: Exclusive upper bound, formatted ``YYYY-MM-DD`` (interpreted as UTC midnight);
        also selects the snapshot directory the pickle is read from.
    :param project_name: Prefix of the ``-fix.pickle.gz`` file to load.
    :return: Series with the 'commit' values of the rows whose 'commitDate' lies in ``(start, end)``.
    :raises ValueError: If ``start`` or ``end`` does not match ``%Y-%m-%d``.
    """
    start_date = datetime.strptime(start, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    end_date = datetime.strptime(end, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    commits = load_zipped_pickle(f'/workspace/EECS-Research/data.absolute/{end}/commitsDF/{project_name}-fix.pickle.gz')
    # Explicit strict comparisons instead of Series.between(..., inclusive=False):
    # the boolean form of `inclusive` is deprecated in pandas 1.3 and rejected by 2.0.
    # NOTE(review): assumes 'commitDate' holds timezone-aware timestamps, since the
    # bounds are UTC-aware — confirm.
    commit_dates = commits['commitDate']
    in_window = (commit_dates > start_date) & (commit_dates < end_date)
    return commits[in_window]['commit']


In [3]:
# Build the label table (one row per snapshot); label_gen() also writes it to a CSV file.
labels = label_gen()

# Snapshot whose Redis dump and commit pickles are used by the following cells.
date = '2022-06-01'
# Start Redis for that snapshot on port 6399 (helper imported from the project's `main` module).
job_start_redis(f'/workspace/EECS-Research/data.absolute/{date}/redis', 6399)

Shutting down redis 6399...
> Shutdown complete.
/workspace/EECS-Research/data.absolute/2022-06-01/redis
Starting redis 6399...
> Redis started.


In [4]:
# 1. Load AST diffs
# Client for the local Redis instance on port 6399, database 0.
r = redis.StrictRedis(host='localhost', port=6399, db=0)
# NOTE(review): this is printed before the first command (hgetall) is issued; presumably
# the constructor alone does not contact the server — confirm.
print("Connected to redis!")
# Fetch the whole 'dump' hash and decode both keys and values from bytes to str.
ast_diffs = {k.decode(): v.decode() for k, v in r.hgetall('dump').items()}


Connected to redis!


In [5]:
# 2. Load commits
# Directory holding the commit pickles of the snapshot selected by `date`.
base_path = Path(f'/workspace/EECS-Research/data.absolute/{date}/commitsDF/')
# Only files ending in '-fix.pickle.gz' are loaded.
commit_pickles = [base_path / str(f) for f in os.listdir(base_path) if f.endswith('-fix.pickle.gz')]
# Load every pickle through pmap (hypy_utils helper; presumably parallel — confirm)
# and concatenate the resulting frames into one.
commits = pandas.concat(pmap(load_zipped_pickle, commit_pickles, desc=f'Loading commits'))

def get_all_commits_sha(start: str, end: str) -> DataFrame:
    """Return the rows of the notebook-level ``commits`` frame dated strictly inside a window.

    Despite the name, whole rows are returned; callers select the 'commit' column themselves.

    :param start: Exclusive lower bound, formatted ``YYYY-MM-DD`` (interpreted as UTC midnight).
    :param end: Exclusive upper bound, formatted ``YYYY-MM-DD`` (interpreted as UTC midnight).
    :return: The subset of ``commits`` whose 'commitDate' lies in ``(start, end)``.
    :raises ValueError: If ``start`` or ``end`` does not match ``%Y-%m-%d``.
    """
    start_date = datetime.strptime(start, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    end_date = datetime.strptime(end, '%Y-%m-%d').replace(tzinfo=timezone.utc)
    # Explicit strict comparisons instead of Series.between(..., inclusive=False):
    # the boolean form of `inclusive` is deprecated in pandas 1.3 and rejected by 2.0.
    commit_dates = commits['commitDate']
    df = commits[(commit_dates > start_date) & (commit_dates < end_date)]
    return df


Loading commits: 100%|██████████| 42/42 [00:01<00:00, 22.41it/s]


In [6]:
# 3.1. Find sha in each AST diff
# A usable key contains "<sha>_<sha>": two lowercase-hex runs of at least 6 characters joined by "_".
AST_KEY_SHA_RE = re.compile(r'[0-9a-f]{6,}_[0-9a-f]{6,}')
ast_diff_sha = {k: AST_KEY_SHA_RE.findall(k) for k in ast_diffs.keys()}
tmp_bf = len(ast_diff_sha)
# Keep only keys with a match and split the first match into its two shas.
ast_diff_sha = {k: v[0].split('_') for k, v in ast_diff_sha.items() if v}
# Dropped entries = count before filtering - count after filtering.
print(f'Ignored {tmp_bf - len(ast_diff_sha)} AST diff entries')
print(f'Commit sha lengths: {Counter([len(sha) for l in ast_diff_sha.values() for sha in l])}')

# Maps each interval's start date to the combined AST diff text of that interval.
ast_diff_date = {}
# Consecutive label rows define the intervals (Time[i], Time[i + 1]).
for date_start, date_end in zip(labels['Time'][:-1], labels['Time'][1:]):
    # 3.2. Find all commit sha in the time interval
    print(f'Processing time interval from {date_start} to {date_end}')
    # Commits are compared by the first 6 characters of their sha.
    commits_sha = {c[:6] for c in get_all_commits_sha(date_start, date_end)['commit']}
    print(f'Total of {len(commits_sha)} commits in the interval')

    # 3.3. Find AST diff entries with commit sha in this interval
    # NOTE(review): an entry is kept only when BOTH of its shas are exactly 6 characters long
    # and present in the interval; entries with 7+ character shas never match — confirm intended.
    ast_diffs_in_interval = {k for k, s in ast_diff_sha.items() if all(len(sha) == 6 and sha in commits_sha for sha in s)}
    print(f'Total of {len(ast_diffs_in_interval)} AST diffs in the interval')

    # 3.4. Combine AST diffs and save as one file
    ast_combined = '\n\n'.join(ast_diffs[k] for k in ast_diffs_in_interval)
    write(f'ML/ast-diffs/{date_start}.txt', ast_combined)
    ast_diff_date[date_start] = ast_combined


Ignored 0 AST diff entries
Commit sha lengths: Counter({6: 61553, 7: 3748, 8: 237, 9: 14})
Processing time interval from 2001-12-01 to 2002-06-01
Total of 196 commits in the interval
Total of 44 AST diffs in the interval
Processing time interval from 2002-06-01 to 2002-12-01
Total of 207 commits in the interval
Total of 32 AST diffs in the interval
Processing time interval from 2002-12-01 to 2003-06-01
Total of 276 commits in the interval
Total of 27 AST diffs in the interval
Processing time interval from 2003-06-01 to 2003-12-01
Total of 477 commits in the interval
Total of 60 AST diffs in the interval
Processing time interval from 2003-12-01 to 2004-06-01
Total of 920 commits in the interval
Total of 53 AST diffs in the interval
Processing time interval from 2004-06-01 to 2004-12-01
Total of 627 commits in the interval
Total of 75 AST diffs in the interval
Processing time interval from 2004-12-01 to 2005-06-01
Total of 517 commits in the interval
Total of 40 AST diffs in the interval

In [7]:
# 4. Tokenize / Vectorize

# 4.1. Remove AST markers and punctuation, collapse repeated spaces (digits are kept)
def tmp_clean(s: str) -> str:
    """Strip AST markers and punctuation from a diff text.

    The markers '@TO@', '@AT@' and '@LENGTH@' are deleted (in that order), every
    character of ``string.punctuation`` becomes a space, and runs of spaces are
    collapsed to a single space. Other whitespace and digits are left as they are.

    :param s: Raw text.
    :return: Cleaned text.
    """
    for marker in ('@TO@', '@AT@', '@LENGTH@'):
        s = s.replace(marker, '')
    # One translation pass maps each punctuation character to a space.
    to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return re.sub(' {2,}', ' ', s.translate(to_space))
print('Cleaning AST Diffs')
# Rebind ast_diff_date so that every interval maps to its cleaned text (keys are unchanged).
ast_diff_date = {k: tmp_clean(v) for k, v in ast_diff_date.items()}


Cleaning AST Diffs


In [8]:
# 4.2. Vectorize X
# TF-IDF over n-grams of length 1 to 4, capped at 1000 features.
tf = TfidfVectorizer(ngram_range=(1, 4), max_features=1000)
# One document per interval, ordered like labels['Time']; times without an entry
# in ast_diff_date are skipped.
X = [ast_diff_date[t] for t in labels['Time'] if t in ast_diff_date]
# X is rebound here: from the list of documents to the fitted TF-IDF matrix.
X = tf.fit_transform(X)
print('TF-IDF Fitting finished, resulting vector shape:', X.shape)


TF-IDF Fitting finished, resulting vector shape: (41, 1000)


In [9]:
# 4.3. Vectorize Y
# One boolean per label row: True when 'Patches Added' is a non-empty list.
# The first row is dropped by the [1:] slice, so y[i] belongs to labels row i + 1.
y = np.array([v != [] for v in labels['Patches Added']][1:])
print('Y vector size:', len(y))

Y vector size: 41


In [10]:
from sklearn.model_selection import train_test_split

# 4.4. Train test split
# A fixed seed keeps the split, and every metric computed from it, reproducible across kernel restarts.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=SPLIT_SEED)

print(f'Train size: {X_train.shape}, Test size: {X_test.shape}')

Train size: (30, 1000), Test size: (11, 1000)


In [11]:
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import KFold

def print_prec(y_test, y_pred):
    """Print labels, predictions and the binary precision / recall / F1 as percentages.

    :param y_test: Iterable of true labels.
    :param y_pred: Iterable of predicted labels.
    """
    def _as_ints(values):
        # Cast each element with int() so booleans are shown as 0/1.
        return np.array(list(map(int, values)))

    print(f'y Test Label: {_as_ints(y_test)}')
    print(f'y Prediction: {_as_ints(y_pred)}')

    # The fourth returned value (support) is not reported.
    precision, recall, f_score = precision_recall_fscore_support(y_test, y_pred, average='binary')[:3]
    print(f'Precision: {precision * 100:0.1f}')
    print(f'Recall:    {recall * 100:0.1f}')
    print(f'F1:        {f_score * 100:0.1f}')

def k_fold(k,model):
    """Run k-fold cross-validation of ``model`` on the notebook-level ``X`` / ``y``.

    Folds come from ``KFold(n_splits=k)``; metrics are printed per fold via ``print_prec``.

    :param k: Number of folds.
    :param model: Estimator with ``fit`` / ``predict``; the same instance is fitted again for every fold.
    """
    splitter = KFold(n_splits=k)
    for fit_idx, holdout_idx in splitter.split(X):
        model.fit(X[fit_idx], y[fit_idx])
        fold_predictions = model.predict(X[holdout_idx])

        print_prec(y[holdout_idx], fold_predictions)

# 5.1. Gradient Boosted Tree model (XGBoost classifier)


def gbrt():
    """Fit an XGBClassifier on the notebook-level train split and print its test-split metrics."""
    print('Training Gradient Boosted Regression Tree model...')
    # 300 estimators; tree_method='gpu_hist' presumably needs a GPU-enabled XGBoost setup — confirm.
    model: XGBClassifier = XGBClassifier(n_estimators=300, tree_method='gpu_hist')
    model.fit(X_train, y_train)
    print('Done!')

    # Score the held-out split.
    print_prec(y_test, model.predict(X_test))


gbrt()

Training Gradient Boosted Regression Tree model...
Done!
y Test Label: [1 1 1 1 1 0 1 1 0 0 1]
y Prediction: [1 1 1 0 1 1 1 1 1 0 1]
Precision: 77.8
Recall:    87.5
F1:        82.4


In [13]:
from sklearn.ensemble import RandomForestClassifier


# 5.2. Random Forest Model
def rf():
    """Fit a 300-tree RandomForestClassifier on the notebook-level train split and print its test-split metrics."""
    print('Training Random Forest model...')
    model = RandomForestClassifier(n_estimators=300)
    model.fit(X_train, y_train)
    print('Done!')

    # Score the held-out split.
    print_prec(y_test, model.predict(X_test))


rf()

Training Random Forest model...
Done!
y Test Label: [1 1 1 1 1 0 1 1 0 0 1]
y Prediction: [1 1 1 0 1 1 1 1 0 0 1]
Precision: 87.5
Recall:    87.5
F1:        87.5


In [14]:
from sklearn.linear_model import LogisticRegression


# 5.3. Logistic Regression Model
def logistic():
    """Fit a default LogisticRegression on the notebook-level train split and print its test-split metrics."""
    print('Training Logistic Regression model...')
    model = LogisticRegression()
    model.fit(X_train, y_train)
    print('Done!')

    # Score the held-out split.
    print_prec(y_test, model.predict(X_test))


logistic()


Training Logistic Regression model...
Done!
y Test Label: [1 1 1 1 1 0 1 1 0 0 1]
y Prediction: [1 1 1 1 1 1 1 1 1 1 1]
Precision: 72.7
Recall:    100.0
F1:        84.2


In [15]:
# Share of positive labels in y; a model that always predicts 1 reaches exactly this precision.
print(f'Percentage of 1s in dataset: {sum(y) / len(y) * 100 :.1f}%')


Percentage of 1s in dataset: 63.4%


In [80]:
# 6-fold cross-validation (KFold over the notebook-level X / y) with a default logistic regression.
k_fold(6,LogisticRegression())


TRAIN: [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [0 1 2 3 4 5 6]
y Test Label: [1 1 1 1 1 1 0]
y Prediction: [1 1 1 1 1 1 1]
Precision: 85.7
Recall:    100.0
F1:        92.3
TRAIN: [ 0  1  2  3  4  5  6 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [ 7  8  9 10 11 12 13]
y Test Label: [0 1 1 1 1 1 0]
y Prediction: [1 1 1 1 1 1 1]
Precision: 71.4
Recall:    100.0
F1:        83.3
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [14 15 16 17 18 19 20]
y Test Label: [1 1 0 1 1 1 1]
y Prediction: [1 1 1 1 1 1 1]
Precision: 85.7
Recall:    100.0
F1:        92.3
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [21 22 23 24 25 26 27]
y Test Label: [1 0 1 1 1 0 1]
y Prediction: [1 1 1 1 1 1 1]
Precision: 71.4
Recall:    100.0
F1:        83.3
TRAIN: 

  _warn_prf(average, modifier, msg_start, len(result))


In [82]:
# 6-fold cross-validation (KFold over the notebook-level X / y) with a 300-tree random forest.
k_fold(6,RandomForestClassifier(n_estimators=300))

TRAIN: [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [0 1 2 3 4 5 6]
y Test Label: [1 1 1 1 1 1 0]
y Prediction: [1 1 1 1 1 1 1]
Precision: 85.7
Recall:    100.0
F1:        92.3
TRAIN: [ 0  1  2  3  4  5  6 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [ 7  8  9 10 11 12 13]
y Test Label: [0 1 1 1 1 1 0]
y Prediction: [1 1 1 1 1 1 1]
Precision: 71.4
Recall:    100.0
F1:        83.3
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [14 15 16 17 18 19 20]
y Test Label: [1 1 0 1 1 1 1]
y Prediction: [1 1 0 1 1 0 1]
Precision: 100.0
Recall:    83.3
F1:        90.9
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [21 22 23 24 25 26 27]
y Test Label: [1 0 1 1 1 0 1]
y Prediction: [1 1 1 1 0 0 0]
Precision: 75.0
Recall:    60.0
F1:        66.7
TRAIN: [

  _warn_prf(average, modifier, msg_start, len(result))


In [83]:
# 6-fold cross-validation (KFold over the notebook-level X / y) with the same XGBClassifier settings as gbrt().
k_fold(6,XGBClassifier(tree_method='gpu_hist', n_estimators=300))


TRAIN: [ 7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [0 1 2 3 4 5 6]
y Test Label: [1 1 1 1 1 1 0]
y Prediction: [0 1 0 0 1 1 1]
Precision: 75.0
Recall:    50.0
F1:        60.0
TRAIN: [ 0  1  2  3  4  5  6 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [ 7  8  9 10 11 12 13]
y Test Label: [0 1 1 1 1 1 0]
y Prediction: [1 1 1 1 0 1 1]
Precision: 66.7
Recall:    80.0
F1:        72.7
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 21 22 23 24 25 26 27 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [14 15 16 17 18 19 20]
y Test Label: [1 1 0 1 1 1 1]
y Prediction: [1 0 0 1 1 1 1]
Precision: 100.0
Recall:    83.3
F1:        90.9
TRAIN: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 28 29 30
 31 32 33 34 35 36 37 38 39 40] TEST: [21 22 23 24 25 26 27]
y Test Label: [1 0 1 1 1 0 1]
y Prediction: [1 1 1 1 0 0 1]
Precision: 80.0
Recall:    80.0
F1:        80.0
TRAIN: [ 0

  _warn_prf(average, modifier, msg_start, len(result))
